diff mbox series

[05/14] xfs: repair free space btrees

Message ID 153292970169.24509.4581630892233165448.stgit@magnolia (mailing list archive)
State Superseded, archived
Headers show
Series xfs-4.19: online repair support | expand

Commit Message

Darrick J. Wong July 30, 2018, 5:48 a.m. UTC
From: Darrick J. Wong <darrick.wong@oracle.com>

Rebuild the free space btrees from the gaps in the rmap btree.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/Makefile             |    1 
 fs/xfs/scrub/alloc.c        |    1 
 fs/xfs/scrub/alloc_repair.c |  581 +++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/common.c       |    8 +
 fs/xfs/scrub/repair.h       |    2 
 fs/xfs/scrub/scrub.c        |    4 
 fs/xfs/scrub/trace.h        |    2 
 fs/xfs/xfs_extent_busy.c    |   14 +
 fs/xfs/xfs_extent_busy.h    |    2 
 9 files changed, 610 insertions(+), 5 deletions(-)
 create mode 100644 fs/xfs/scrub/alloc_repair.c



--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Brian Foster July 31, 2018, 5:47 p.m. UTC | #1
On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Rebuild the free space btrees from the gaps in the rmap btree.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/xfs/Makefile             |    1 
>  fs/xfs/scrub/alloc.c        |    1 
>  fs/xfs/scrub/alloc_repair.c |  581 +++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/scrub/common.c       |    8 +
>  fs/xfs/scrub/repair.h       |    2 
>  fs/xfs/scrub/scrub.c        |    4 
>  fs/xfs/scrub/trace.h        |    2 
>  fs/xfs/xfs_extent_busy.c    |   14 +
>  fs/xfs/xfs_extent_busy.h    |    2 
>  9 files changed, 610 insertions(+), 5 deletions(-)
>  create mode 100644 fs/xfs/scrub/alloc_repair.c
> 
> 
...
> diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
> new file mode 100644
> index 000000000000..b228c2906de2
> --- /dev/null
> +++ b/fs/xfs/scrub/alloc_repair.c
> @@ -0,0 +1,581 @@
...
> +/* Record extents that aren't in use from gaps in the rmap records. */
> +STATIC int
> +xrep_abt_walk_rmap(
> +	struct xfs_btree_cur	*cur,
> +	struct xfs_rmap_irec	*rec,
> +	void			*priv)
> +{
> +	struct xrep_abt		*ra = priv;
> +	struct xrep_abt_extent	*rae;
> +	xfs_fsblock_t		fsb;
> +	int			error;
> +
> +	/* Record all the OWN_AG blocks... */
> +	if (rec->rm_owner == XFS_RMAP_OWN_AG) {
> +		fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno,
> +				rec->rm_startblock);
> +		error = xfs_bitmap_set(ra->btlist, fsb, rec->rm_blockcount);
> +		if (error)
> +			return error;
> +	}
> +
> +	/* ...and all the rmapbt blocks... */
> +	error = xfs_bitmap_set_btcur_path(&ra->nobtlist, cur);
> +	if (error)
> +		return error;
> +
> +	/* ...and all the free space. */
> +	if (rec->rm_startblock > ra->next_bno) {
> +		trace_xrep_abt_walk_rmap(cur->bc_mp, cur->bc_private.a.agno,
> +				ra->next_bno, rec->rm_startblock - ra->next_bno,
> +				XFS_RMAP_OWN_NULL, 0, 0);
> +
> +		rae = kmem_alloc(sizeof(struct xrep_abt_extent), KM_MAYFAIL);
> +		if (!rae)
> +			return -ENOMEM;
> +		INIT_LIST_HEAD(&rae->list);
> +		rae->bno = ra->next_bno;
> +		rae->len = rec->rm_startblock - ra->next_bno;
> +		list_add_tail(&rae->list, ra->extlist);

Any reason we don't use a bitmap for this one?

> +		ra->nr_records++;
> +		ra->nr_blocks += rae->len;
> +	}
> +	ra->next_bno = max_t(xfs_agblock_t, ra->next_bno,
> +			rec->rm_startblock + rec->rm_blockcount);

The max_t() is to cover the record overlap case, right? If so, another
one liner comment would be good.

> +	return 0;
> +}
> +
...
> +/* Free an extent, which creates a record in the bnobt/cntbt. */
> +STATIC int
> +xrep_abt_free_extent(
> +	struct xfs_scrub	*sc,
> +	xfs_fsblock_t		fsbno,
> +	xfs_extlen_t		len,
> +	struct xfs_owner_info	*oinfo)
> +{
> +	int			error;
> +
> +	error = xfs_free_extent(sc->tp, fsbno, len, oinfo, 0);
> +	if (error)
> +		return error;
> +	error = xrep_roll_ag_trans(sc);
> +	if (error)
> +		return error;
> +	return xfs_mod_fdblocks(sc->mp, -(int64_t)len, false);

What's this call for? Is it because the blocks we're freeing were
already free? (Similar question on the other xfs_mod_fdblocks() call
further down).

BTW, what prevents some other task from coming along and screwing with
this? For example, could a large falloc or buffered write come in and
allocate these global blocks before we take them away here (causing the
whole sequence to fail)?

> +}
> +
...
> +/*
> + * Allocate a block from the (cached) first extent in the AG.  In theory
> + * this should never fail, since we already checked that there was enough
> + * space to handle the new btrees.
> + */
> +STATIC xfs_fsblock_t
> +xrep_abt_alloc_block(
> +	struct xfs_scrub	*sc,
> +	struct list_head	*free_extents)
> +{
> +	struct xrep_abt_extent	*ext;
> +
> +	/* Pull the first free space extent off the list, and... */
> +	ext = list_first_entry(free_extents, struct xrep_abt_extent, list);
> +
> +	/* ...take its first block. */
> +	ext->bno++;
> +	ext->len--;
> +	if (ext->len == 0) {
> +		list_del(&ext->list);
> +		kmem_free(ext);
> +	}
> +
> +	return XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, ext->bno - 1);

Looks like a potential use after free of ext.

> +}
> +
...
> +/*
> + * Reset the global free block counter and the per-AG counters to make it look
> + * like this AG has no free space.
> + */
> +STATIC int
> +xrep_abt_reset_counters(
> +	struct xfs_scrub	*sc,
> +	int			*log_flags)
> +{
> +	struct xfs_perag	*pag = sc->sa.pag;
> +	struct xfs_agf		*agf;
> +	xfs_agblock_t		new_btblks;
> +	xfs_agblock_t		to_free;
> +	int			error;
> +
> +	/*
> +	 * Since we're abandoning the old bnobt/cntbt, we have to decrease
> +	 * fdblocks by the # of blocks in those trees.  btreeblks counts the
> +	 * non-root blocks of the free space and rmap btrees.  Do this before
> +	 * resetting the AGF counters.
> +	 */

Hmm, I'm not quite following the comment wrt the xfs_mod_fdblocks()
below. to_free looks like it's the count of all current btree blocks
minus rmap blocks (i.e., old bno/cnt btree blocks). Are we "allocating"
those blocks here because we're going to free them later?

> +	agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
> +
> +	/* rmap_blocks accounts root block, btreeblks doesn't */
> +	new_btblks = be32_to_cpu(agf->agf_rmap_blocks) - 1;
> +
> +	/* btreeblks doesn't account bno/cnt root blocks */
> +	to_free = pag->pagf_btreeblks + 2;
> +
> +	/* and don't account for the blocks we aren't freeing */
> +	to_free -= new_btblks;
> +
> +	error = xfs_mod_fdblocks(sc->mp, -(int64_t)to_free, false);
> +	if (error)
> +		return error;
> +
> +	/*
> +	 * Reset the per-AG info, both incore and ondisk.  Mark the incore
> +	 * state stale in case we fail out of here.
> +	 */
> +	ASSERT(pag->pagf_init);
> +	pag->pagf_init = 0;
> +	pag->pagf_btreeblks = new_btblks;
> +	pag->pagf_freeblks = 0;
> +	pag->pagf_longest = 0;
> +
> +	agf->agf_btreeblks = cpu_to_be32(new_btblks);
> +	agf->agf_freeblks = 0;
> +	agf->agf_longest = 0;
> +	*log_flags |= XFS_AGF_BTREEBLKS | XFS_AGF_LONGEST | XFS_AGF_FREEBLKS;
> +
> +	return 0;
> +}
> +
> +/* Initialize a new free space btree root and implant into AGF. */
> +STATIC int
> +xrep_abt_reset_btree(
> +	struct xfs_scrub	*sc,
> +	xfs_btnum_t		btnum,
> +	struct list_head	*free_extents)
> +{
> +	struct xfs_owner_info	oinfo;
> +	struct xfs_buf		*bp;
> +	struct xfs_perag	*pag = sc->sa.pag;
> +	struct xfs_mount	*mp = sc->mp;
> +	struct xfs_agf		*agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
> +	xfs_fsblock_t		fsbno;
> +	int			error;
> +
> +	/* Allocate new root block. */
> +	fsbno = xrep_abt_alloc_block(sc, free_extents);

xrep_abt_alloc_block() converts an agbno to return an fsb. This function
passes the fsb to the init call just below and then converts it back to
an agbno in two places. It seems like there might be fewer conversions to
follow if the above just returned an agbno and we converted it to an fsb
once for xrep_init_btblock().

> +	if (fsbno == NULLFSBLOCK)
> +		return -ENOSPC;
> +
> +	/* Initialize new tree root. */
> +	error = xrep_init_btblock(sc, fsbno, &bp, btnum, &xfs_allocbt_buf_ops);
> +	if (error)
> +		return error;
> +
> +	/* Implant into AGF. */
> +	agf->agf_roots[btnum] = cpu_to_be32(XFS_FSB_TO_AGBNO(mp, fsbno));
> +	agf->agf_levels[btnum] = cpu_to_be32(1);
> +
> +	/* Add rmap records for the btree roots */
> +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> +	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno,
> +			XFS_FSB_TO_AGBNO(mp, fsbno), 1, &oinfo);
> +	if (error)
> +		return error;
> +
> +	/* Reset the incore state. */
> +	pag->pagf_levels[btnum] = 1;
> +
> +	return 0;
> +}
> +
...
> +
> +/*
> + * Make our new freespace btree roots permanent so that we can start freeing
> + * unused space back into the AG.
> + */
> +STATIC int
> +xrep_abt_commit_new(
> +	struct xfs_scrub	*sc,
> +	struct xfs_bitmap	*old_allocbt_blocks,
> +	int			log_flags)
> +{
> +	int			error;
> +
> +	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags);
> +
> +	/* Invalidate the old freespace btree blocks and commit. */
> +	error = xrep_invalidate_blocks(sc, old_allocbt_blocks);
> +	if (error)
> +		return error;

It looks like the above invalidation all happens in the same
transaction. Those aren't logging buffer data or anything, but any idea
how many log formats we can get away with in this single transaction?

> +	error = xrep_roll_ag_trans(sc);
> +	if (error)
> +		return error;
> +
> +	/* Now that we've succeeded, mark the incore state valid again. */
> +	sc->sa.pag->pagf_init = 1;
> +	return 0;
> +}
> +
> +/* Build new free space btrees and dispose of the old one. */
> +STATIC int
> +xrep_abt_rebuild_trees(
> +	struct xfs_scrub	*sc,
> +	struct list_head	*free_extents,
> +	struct xfs_bitmap	*old_allocbt_blocks)
> +{
> +	struct xfs_owner_info	oinfo;
> +	struct xrep_abt_extent	*rae;
> +	struct xrep_abt_extent	*n;
> +	struct xrep_abt_extent	*longest;
> +	int			error;
> +
> +	xfs_rmap_skip_owner_update(&oinfo);
> +
> +	/*
> +	 * Insert the longest free extent in case it's necessary to
> +	 * refresh the AGFL with multiple blocks.  If there is no longest
> +	 * extent, we had exactly the free space we needed; we're done.
> +	 */

I'm confused by the last sentence. longest should only be NULL if the
free space list is empty and haven't we already bailed out with -ENOSPC
if that's the case?

> +	longest = xrep_abt_get_longest(free_extents);
> +	if (!longest)
> +		goto done;
> +	error = xrep_abt_free_extent(sc,
> +			XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno),
> +			longest->len, &oinfo);
> +	list_del(&longest->list);
> +	kmem_free(longest);
> +	if (error)
> +		return error;
> +
> +	/* Insert records into the new btrees. */
> +	list_for_each_entry_safe(rae, n, free_extents, list) {
> +		error = xrep_abt_free_extent(sc,
> +				XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno),
> +				rae->len, &oinfo);
> +		if (error)
> +			return error;
> +		list_del(&rae->list);
> +		kmem_free(rae);
> +	}

Ok, at this point we've reset the btree roots and we start freeing the
free ranges that were discovered via the rmapbt analysis. AFAICT, if we
fail or crash at this point, we leave the allocbts in a partially
constructed state. I take it that is Ok with respect to the broader
repair algorithm because we'd essentially start over by inspecting the
rmapbt again on a retry.

The blocks allocated for the btrees that we've begun to construct here
end up mapped in the rmapbt as we go, right? IIUC, that means we don't
necessarily have infinite retries to make sure this completes. IOW,
suppose that a first repair attempt finds just enough free space to
construct new trees, gets far enough along to consume most of that free
space and then crashes. Is it possible that a subsequent repair attempt
includes the btree blocks allocated during the previous failed repair
attempt in the sum of "old btree blocks" and determines we don't have
enough free space to repair?

> +
> +done:
> +	/* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */
> +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> +	return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo,
> +			XFS_AG_RESV_NONE);
> +}
> +
...
> diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
> index 0ed68379e551..82f99633a597 100644
> --- a/fs/xfs/xfs_extent_busy.c
> +++ b/fs/xfs/xfs_extent_busy.c
> @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp(
>  		diff = b1->bno - b2->bno;
>  	return diff;
>  }
> +
> +/* Are there any busy extents in this AG? */
> +bool
> +xfs_extent_busy_list_empty(
> +	struct xfs_perag	*pag)
> +{
> +	spin_lock(&pag->pagb_lock);
> +	if (pag->pagb_tree.rb_node) {

RB_EMPTY_ROOT()?

Brian

> +		spin_unlock(&pag->pagb_lock);
> +		return false;
> +	}
> +	spin_unlock(&pag->pagb_lock);
> +	return true;
> +}
> diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
> index 990ab3891971..2f8c73c712c6 100644
> --- a/fs/xfs/xfs_extent_busy.h
> +++ b/fs/xfs/xfs_extent_busy.h
> @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
>  	list_sort(NULL, list, xfs_extent_busy_ag_cmp);
>  }
>  
> +bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
> +
>  #endif /* __XFS_EXTENT_BUSY_H__ */
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong July 31, 2018, 10:01 p.m. UTC | #2
On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote:
> On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Rebuild the free space btrees from the gaps in the rmap btree.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> >  fs/xfs/Makefile             |    1 
> >  fs/xfs/scrub/alloc.c        |    1 
> >  fs/xfs/scrub/alloc_repair.c |  581 +++++++++++++++++++++++++++++++++++++++++++
> >  fs/xfs/scrub/common.c       |    8 +
> >  fs/xfs/scrub/repair.h       |    2 
> >  fs/xfs/scrub/scrub.c        |    4 
> >  fs/xfs/scrub/trace.h        |    2 
> >  fs/xfs/xfs_extent_busy.c    |   14 +
> >  fs/xfs/xfs_extent_busy.h    |    2 
> >  9 files changed, 610 insertions(+), 5 deletions(-)
> >  create mode 100644 fs/xfs/scrub/alloc_repair.c
> > 
> > 
> ...
> > diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
> > new file mode 100644
> > index 000000000000..b228c2906de2
> > --- /dev/null
> > +++ b/fs/xfs/scrub/alloc_repair.c
> > @@ -0,0 +1,581 @@
> ...
> > +/* Record extents that aren't in use from gaps in the rmap records. */
> > +STATIC int
> > +xrep_abt_walk_rmap(
> > +	struct xfs_btree_cur	*cur,
> > +	struct xfs_rmap_irec	*rec,
> > +	void			*priv)
> > +{
> > +	struct xrep_abt		*ra = priv;
> > +	struct xrep_abt_extent	*rae;
> > +	xfs_fsblock_t		fsb;
> > +	int			error;
> > +
> > +	/* Record all the OWN_AG blocks... */
> > +	if (rec->rm_owner == XFS_RMAP_OWN_AG) {
> > +		fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno,
> > +				rec->rm_startblock);
> > +		error = xfs_bitmap_set(ra->btlist, fsb, rec->rm_blockcount);
> > +		if (error)
> > +			return error;
> > +	}
> > +
> > +	/* ...and all the rmapbt blocks... */
> > +	error = xfs_bitmap_set_btcur_path(&ra->nobtlist, cur);
> > +	if (error)
> > +		return error;
> > +
> > +	/* ...and all the free space. */
> > +	if (rec->rm_startblock > ra->next_bno) {
> > +		trace_xrep_abt_walk_rmap(cur->bc_mp, cur->bc_private.a.agno,
> > +				ra->next_bno, rec->rm_startblock - ra->next_bno,
> > +				XFS_RMAP_OWN_NULL, 0, 0);
> > +
> > +		rae = kmem_alloc(sizeof(struct xrep_abt_extent), KM_MAYFAIL);
> > +		if (!rae)
> > +			return -ENOMEM;
> > +		INIT_LIST_HEAD(&rae->list);
> > +		rae->bno = ra->next_bno;
> > +		rae->len = rec->rm_startblock - ra->next_bno;
> > +		list_add_tail(&rae->list, ra->extlist);
> 
> Any reason we don't use a bitmap for this one?
> 
> > +		ra->nr_records++;
> > +		ra->nr_blocks += rae->len;
> > +	}
> > +	ra->next_bno = max_t(xfs_agblock_t, ra->next_bno,
> > +			rec->rm_startblock + rec->rm_blockcount);
> 
> The max_t() is to cover the record overlap case, right? If so, another
> one liner comment would be good.

Right.  Will add a comment.

> > +	return 0;
> > +}
> > +
> ...
> > +/* Free an extent, which creates a record in the bnobt/cntbt. */
> > +STATIC int
> > +xrep_abt_free_extent(
> > +	struct xfs_scrub	*sc,
> > +	xfs_fsblock_t		fsbno,
> > +	xfs_extlen_t		len,
> > +	struct xfs_owner_info	*oinfo)
> > +{
> > +	int			error;
> > +
> > +	error = xfs_free_extent(sc->tp, fsbno, len, oinfo, 0);
> > +	if (error)
> > +		return error;
> > +	error = xrep_roll_ag_trans(sc);
> > +	if (error)
> > +		return error;
> > +	return xfs_mod_fdblocks(sc->mp, -(int64_t)len, false);
> 
> What's this call for? Is it because the blocks we're freeing were
> already free? (Similar question on the other xfs_mod_fdblocks() call
> further down).

Yes.  The goal here is to free the (already free) extent with no net
change in fdblocks...

> BTW, what prevents some other task from coming along and screwing with
> this? For example, could a large falloc or buffered write come in and
> allocate these global blocks before we take them away here (causing the
> whole sequence to fail)?

...but you're right that there is a window of opportunity for someone to
swoop in and reserve the blocks while we still have the AGF locked,
which means that we'll fail here even though that other process will
never get the space.

Thinking about this a bit more, what we really want to do is to skip the
xfs_trans_mod_sb(len) that happens after xfs_free_ag_extent inserts the
record into the bno/cntbt.  Hm.  If a record insertion requires an
expansion of the bnobt/cntbt, we'll pull blocks from the AGFL, but we
separately force those to be accounted to XFS_AG_RESV_AGFL.  Therefore,
we could make a "fake" per-AG reservation type that would skip the
fdblocks update.  That avoids the problem where we commit the free space
record but someone else reserves all the free space and then we blow out
with ENOSPC and a half-rebuilt bnobt.

For the second case (which I assume is xrep_abt_reset_counters?) I'll
respond below.

> > +}
> > +
> ...
> > +/*
> > + * Allocate a block from the (cached) first extent in the AG.  In theory
> > + * this should never fail, since we already checked that there was enough
> > + * space to handle the new btrees.
> > + */
> > +STATIC xfs_fsblock_t
> > +xrep_abt_alloc_block(
> > +	struct xfs_scrub	*sc,
> > +	struct list_head	*free_extents)
> > +{
> > +	struct xrep_abt_extent	*ext;
> > +
> > +	/* Pull the first free space extent off the list, and... */
> > +	ext = list_first_entry(free_extents, struct xrep_abt_extent, list);

Missing an if (!ext) return NULLFSBLOCK; here for some reason...

> > +	/* ...take its first block. */
> > +	ext->bno++;
> > +	ext->len--;
> > +	if (ext->len == 0) {
> > +		list_del(&ext->list);
> > +		kmem_free(ext);
> > +	}
> > +
> > +	return XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, ext->bno - 1);
> 
> Looks like a potential use after free of ext.

Oops, good catch!  I'll add a temporary variable to hold the value for
the return.

> > +}
> > +
> ...
> > +/*
> > + * Reset the global free block counter and the per-AG counters to make it look
> > + * like this AG has no free space.
> > + */
> > +STATIC int
> > +xrep_abt_reset_counters(
> > +	struct xfs_scrub	*sc,
> > +	int			*log_flags)
> > +{
> > +	struct xfs_perag	*pag = sc->sa.pag;
> > +	struct xfs_agf		*agf;
> > +	xfs_agblock_t		new_btblks;
> > +	xfs_agblock_t		to_free;
> > +	int			error;
> > +
> > +	/*
> > +	 * Since we're abandoning the old bnobt/cntbt, we have to decrease
> > +	 * fdblocks by the # of blocks in those trees.  btreeblks counts the
> > +	 * non-root blocks of the free space and rmap btrees.  Do this before
> > +	 * resetting the AGF counters.
> > +	 */
> 
> Hmm, I'm not quite following the comment wrt to the xfs_mod_fdblocks()
> below. to_free looks like it's the count of all current btree blocks
> minus rmap blocks (i.e., old bno/cnt btree blocks). Are we "allocating"
> those blocks here because we're going to free them later?

Yes.  Though now that I have a XFS_AG_RESV_IGNORE, maybe I should just
pass that to xrep_reap_extents in xrep_abt_rebuild_trees and then I can
skip the racy mod_fdblocks thing here too.

> > +	agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
> > +
> > +	/* rmap_blocks accounts root block, btreeblks doesn't */
> > +	new_btblks = be32_to_cpu(agf->agf_rmap_blocks) - 1;
> > +
> > +	/* btreeblks doesn't account bno/cnt root blocks */
> > +	to_free = pag->pagf_btreeblks + 2;
> > +
> > +	/* and don't account for the blocks we aren't freeing */
> > +	to_free -= new_btblks;
> > +
> > +	error = xfs_mod_fdblocks(sc->mp, -(int64_t)to_free, false);
> > +	if (error)
> > +		return error;
> > +
> > +	/*
> > +	 * Reset the per-AG info, both incore and ondisk.  Mark the incore
> > +	 * state stale in case we fail out of here.
> > +	 */
> > +	ASSERT(pag->pagf_init);
> > +	pag->pagf_init = 0;
> > +	pag->pagf_btreeblks = new_btblks;
> > +	pag->pagf_freeblks = 0;
> > +	pag->pagf_longest = 0;
> > +
> > +	agf->agf_btreeblks = cpu_to_be32(new_btblks);
> > +	agf->agf_freeblks = 0;
> > +	agf->agf_longest = 0;
> > +	*log_flags |= XFS_AGF_BTREEBLKS | XFS_AGF_LONGEST | XFS_AGF_FREEBLKS;
> > +
> > +	return 0;
> > +}
> > +
> > +/* Initialize a new free space btree root and implant into AGF. */
> > +STATIC int
> > +xrep_abt_reset_btree(
> > +	struct xfs_scrub	*sc,
> > +	xfs_btnum_t		btnum,
> > +	struct list_head	*free_extents)
> > +{
> > +	struct xfs_owner_info	oinfo;
> > +	struct xfs_buf		*bp;
> > +	struct xfs_perag	*pag = sc->sa.pag;
> > +	struct xfs_mount	*mp = sc->mp;
> > +	struct xfs_agf		*agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
> > +	xfs_fsblock_t		fsbno;
> > +	int			error;
> > +
> > +	/* Allocate new root block. */
> > +	fsbno = xrep_abt_alloc_block(sc, free_extents);
> 
> xrep_abt_alloc_block() converts an agbno to return an fsb. This function
> passes the fsb to the init call just below and then converts it back to
> an agbno in two places. It seems like there might be less conversions to
> follow if the above just returned an agbno and we converted it to an fsb
> once for xrep_init_btblock().

Yep, will fix.

> > +	if (fsbno == NULLFSBLOCK)
> > +		return -ENOSPC;
> > +
> > +	/* Initialize new tree root. */
> > +	error = xrep_init_btblock(sc, fsbno, &bp, btnum, &xfs_allocbt_buf_ops);
> > +	if (error)
> > +		return error;
> > +
> > +	/* Implant into AGF. */
> > +	agf->agf_roots[btnum] = cpu_to_be32(XFS_FSB_TO_AGBNO(mp, fsbno));
> > +	agf->agf_levels[btnum] = cpu_to_be32(1);
> > +
> > +	/* Add rmap records for the btree roots */
> > +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> > +	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno,
> > +			XFS_FSB_TO_AGBNO(mp, fsbno), 1, &oinfo);
> > +	if (error)
> > +		return error;
> > +
> > +	/* Reset the incore state. */
> > +	pag->pagf_levels[btnum] = 1;
> > +
> > +	return 0;
> > +}
> > +
> ...
> > +
> > +/*
> > + * Make our new freespace btree roots permanent so that we can start freeing
> > + * unused space back into the AG.
> > + */
> > +STATIC int
> > +xrep_abt_commit_new(
> > +	struct xfs_scrub	*sc,
> > +	struct xfs_bitmap	*old_allocbt_blocks,
> > +	int			log_flags)
> > +{
> > +	int			error;
> > +
> > +	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags);
> > +
> > +	/* Invalidate the old freespace btree blocks and commit. */
> > +	error = xrep_invalidate_blocks(sc, old_allocbt_blocks);
> > +	if (error)
> > +		return error;
> 
> It looks like the above invalidation all happens in the same
> transaction. Those aren't logging buffer data or anything, but any idea
> how many log formats we can get away with in this single transaction?

Hm... well, on my computer a log format is ~88 bytes.  Assuming 4K
blocks, the max AG size of 1TB, maximum free space fragmentation, and
two btrees, the tree could be up to ~270 million records.  Assuming ~505
records per block, that's ... ~531,000 leaf blocks and ~1100 node blocks
for both btrees.  If we invalidate both, that's ~46M of RAM?

> > +	error = xrep_roll_ag_trans(sc);
> > +	if (error)
> > +		return error;
> > +
> > +	/* Now that we've succeeded, mark the incore state valid again. */
> > +	sc->sa.pag->pagf_init = 1;
> > +	return 0;
> > +}
> > +
> > +/* Build new free space btrees and dispose of the old one. */
> > +STATIC int
> > +xrep_abt_rebuild_trees(
> > +	struct xfs_scrub	*sc,
> > +	struct list_head	*free_extents,
> > +	struct xfs_bitmap	*old_allocbt_blocks)
> > +{
> > +	struct xfs_owner_info	oinfo;
> > +	struct xrep_abt_extent	*rae;
> > +	struct xrep_abt_extent	*n;
> > +	struct xrep_abt_extent	*longest;
> > +	int			error;
> > +
> > +	xfs_rmap_skip_owner_update(&oinfo);
> > +
> > +	/*
> > +	 * Insert the longest free extent in case it's necessary to
> > +	 * refresh the AGFL with multiple blocks.  If there is no longest
> > +	 * extent, we had exactly the free space we needed; we're done.
> > +	 */
> 
> I'm confused by the last sentence. longest should only be NULL if the
> free space list is empty and haven't we already bailed out with -ENOSPC
> if that's the case?
> 
> > +	longest = xrep_abt_get_longest(free_extents);

xrep_abt_rebuild_trees is called after we allocate and initialize two
new btree roots in xrep_abt_reset_btrees.  If free_extents is an empty
list here, then we found exactly two blocks worth of free space and used
them to set up new btree roots.

> > +	if (!longest)
> > +		goto done;
> > +	error = xrep_abt_free_extent(sc,
> > +			XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno),
> > +			longest->len, &oinfo);
> > +	list_del(&longest->list);
> > +	kmem_free(longest);
> > +	if (error)
> > +		return error;
> > +
> > +	/* Insert records into the new btrees. */
> > +	list_for_each_entry_safe(rae, n, free_extents, list) {
> > +		error = xrep_abt_free_extent(sc,
> > +				XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno),
> > +				rae->len, &oinfo);
> > +		if (error)
> > +			return error;
> > +		list_del(&rae->list);
> > +		kmem_free(rae);
> > +	}
> 
> Ok, at this point we've reset the btree roots and we start freeing the
> free ranges that were discovered via the rmapbt analysis. AFAICT, if we
> fail or crash at this point, we leave the allocbts in a partially
> constructed state. I take it that is Ok with respect to the broader
> repair algorithm because we'd essentially start over by inspecting the
> rmapbt again on a retry.

Right.  Though in the crash/shutdown case, you'll end up with the
filesystem in an offline state at some point before you can retry the
scrub, so it's probably faster to run xfs_repair to fix the damage.

> The blocks allocated for the btrees that we've begun to construct here
> end up mapped in the rmapbt as we go, right? IIUC, that means we don't
> necessarily have infinite retries to make sure this completes. IOW,
> suppose that a first repair attempt finds just enough free space to
> construct new trees, gets far enough along to consume most of that free
> space and then crashes. Is it possible that a subsequent repair attempt
> includes the btree blocks allocated during the previous failed repair
> attempt in the sum of "old btree blocks" and determines we don't have
> enough free space to repair?

Yes, that's a risk of running the free space repair.

> > +
> > +done:
> > +	/* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */
> > +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> > +	return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo,
> > +			XFS_AG_RESV_NONE);
> > +}
> > +
> ...
> > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
> > index 0ed68379e551..82f99633a597 100644
> > --- a/fs/xfs/xfs_extent_busy.c
> > +++ b/fs/xfs/xfs_extent_busy.c
> > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp(
> >  		diff = b1->bno - b2->bno;
> >  	return diff;
> >  }
> > +
> > +/* Are there any busy extents in this AG? */
> > +bool
> > +xfs_extent_busy_list_empty(
> > +	struct xfs_perag	*pag)
> > +{
> > +	spin_lock(&pag->pagb_lock);
> > +	if (pag->pagb_tree.rb_node) {
> 
> RB_EMPTY_ROOT()?

Good suggestion, thank you!

--D

> Brian
> 
> > +		spin_unlock(&pag->pagb_lock);
> > +		return false;
> > +	}
> > +	spin_unlock(&pag->pagb_lock);
> > +	return true;
> > +}
> > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
> > index 990ab3891971..2f8c73c712c6 100644
> > --- a/fs/xfs/xfs_extent_busy.h
> > +++ b/fs/xfs/xfs_extent_busy.h
> > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
> >  	list_sort(NULL, list, xfs_extent_busy_ag_cmp);
> >  }
> >  
> > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
> > +
> >  #endif /* __XFS_EXTENT_BUSY_H__ */
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Brian Foster Aug. 1, 2018, 11:54 a.m. UTC | #3
On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote:
> On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote:
> > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > Rebuild the free space btrees from the gaps in the rmap btree.
> > > 
> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > ---
> > >  fs/xfs/Makefile             |    1 
> > >  fs/xfs/scrub/alloc.c        |    1 
> > >  fs/xfs/scrub/alloc_repair.c |  581 +++++++++++++++++++++++++++++++++++++++++++
> > >  fs/xfs/scrub/common.c       |    8 +
> > >  fs/xfs/scrub/repair.h       |    2 
> > >  fs/xfs/scrub/scrub.c        |    4 
> > >  fs/xfs/scrub/trace.h        |    2 
> > >  fs/xfs/xfs_extent_busy.c    |   14 +
> > >  fs/xfs/xfs_extent_busy.h    |    2 
> > >  9 files changed, 610 insertions(+), 5 deletions(-)
> > >  create mode 100644 fs/xfs/scrub/alloc_repair.c
> > > 
> > > 
> > ...
> > > diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
> > > new file mode 100644
> > > index 000000000000..b228c2906de2
> > > --- /dev/null
> > > +++ b/fs/xfs/scrub/alloc_repair.c
> > > @@ -0,0 +1,581 @@
> > ...
> > > +/* Record extents that aren't in use from gaps in the rmap records. */
> > > +STATIC int
> > > +xrep_abt_walk_rmap(
> > > +	struct xfs_btree_cur	*cur,
> > > +	struct xfs_rmap_irec	*rec,
> > > +	void			*priv)
> > > +{
> > > +	struct xrep_abt		*ra = priv;
> > > +	struct xrep_abt_extent	*rae;
> > > +	xfs_fsblock_t		fsb;
> > > +	int			error;
> > > +
> > > +	/* Record all the OWN_AG blocks... */
> > > +	if (rec->rm_owner == XFS_RMAP_OWN_AG) {
> > > +		fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno,
> > > +				rec->rm_startblock);
> > > +		error = xfs_bitmap_set(ra->btlist, fsb, rec->rm_blockcount);
> > > +		if (error)
> > > +			return error;
> > > +	}
> > > +
> > > +	/* ...and all the rmapbt blocks... */
> > > +	error = xfs_bitmap_set_btcur_path(&ra->nobtlist, cur);
> > > +	if (error)
> > > +		return error;
> > > +
> > > +	/* ...and all the free space. */
> > > +	if (rec->rm_startblock > ra->next_bno) {
> > > +		trace_xrep_abt_walk_rmap(cur->bc_mp, cur->bc_private.a.agno,
> > > +				ra->next_bno, rec->rm_startblock - ra->next_bno,
> > > +				XFS_RMAP_OWN_NULL, 0, 0);
> > > +
> > > +		rae = kmem_alloc(sizeof(struct xrep_abt_extent), KM_MAYFAIL);
> > > +		if (!rae)
> > > +			return -ENOMEM;
> > > +		INIT_LIST_HEAD(&rae->list);
> > > +		rae->bno = ra->next_bno;
> > > +		rae->len = rec->rm_startblock - ra->next_bno;
> > > +		list_add_tail(&rae->list, ra->extlist);
> > 
> > Any reason we don't use a bitmap for this one?
> > 

??

> > > +		ra->nr_records++;
> > > +		ra->nr_blocks += rae->len;
> > > +	}
> > > +	ra->next_bno = max_t(xfs_agblock_t, ra->next_bno,
> > > +			rec->rm_startblock + rec->rm_blockcount);
> > 
> > The max_t() is to cover the record overlap case, right? If so, another
> > one liner comment would be good.
> 
> Right.  Will add a comment.
> 
> > > +	return 0;
> > > +}
> > > +
> > ...
> > > +/* Free an extent, which creates a record in the bnobt/cntbt. */
> > > +STATIC int
> > > +xrep_abt_free_extent(
> > > +	struct xfs_scrub	*sc,
> > > +	xfs_fsblock_t		fsbno,
> > > +	xfs_extlen_t		len,
> > > +	struct xfs_owner_info	*oinfo)
> > > +{
> > > +	int			error;
> > > +
> > > +	error = xfs_free_extent(sc->tp, fsbno, len, oinfo, 0);
> > > +	if (error)
> > > +		return error;
> > > +	error = xrep_roll_ag_trans(sc);
> > > +	if (error)
> > > +		return error;
> > > +	return xfs_mod_fdblocks(sc->mp, -(int64_t)len, false);
> > 
> > What's this call for? Is it because the blocks we're freeing were
> > already free? (Similar question on the other xfs_mod_fdblocks() call
> > further down).
> 
> Yes.  The goal here is to free the (already free) extent with no net
> change in fdblocks...
> 
> > BTW, what prevents some other task from coming along and screwing with
> > this? For example, could a large falloc or buffered write come in and
> > allocate these global blocks before we take them away here (causing the
> > whole sequence to fail)?
> 
> ...but you're right that here is a window of opportunity for someone to
> swoop in and reserve the blocks while we still have the AGF locked,
> which means that we'll fail here even though that other process will
> never get the space.
> 
> Thinking about this a bit more, what we really want to do is to skip the
> xfs_trans_mod_sb(len) that happens after xfs_free_ag_extent inserts the
> record into the bno/cntbt.  Hm.  If a record insertion requires an
> expansion of the bnobt/cntbt, we'll pull blocks from the AGFL, but we
> separately force those to be accounted to XFS_AG_RESV_AGFL.  Therefore,
> we could make a "fake" per-AG reservation type that would skip the
> fdblocks update.  That avoids the problem where we commit the free space
> record but someone else reserves all the free space and then we blow out
> with ENOSPC and a half-rebuilt bnobt.
> 

Ok, that sounds a bit more straightforward to me.

> For the second case (which I assume is xrep_abt_reset_counters?) I'll
> respond below.
> 
> > > +}
> > > +
...
> > > +/*
> > > + * Reset the global free block counter and the per-AG counters to make it look
> > > + * like this AG has no free space.
> > > + */
> > > +STATIC int
> > > +xrep_abt_reset_counters(
> > > +	struct xfs_scrub	*sc,
> > > +	int			*log_flags)
> > > +{
> > > +	struct xfs_perag	*pag = sc->sa.pag;
> > > +	struct xfs_agf		*agf;
> > > +	xfs_agblock_t		new_btblks;
> > > +	xfs_agblock_t		to_free;
> > > +	int			error;
> > > +
> > > +	/*
> > > +	 * Since we're abandoning the old bnobt/cntbt, we have to decrease
> > > +	 * fdblocks by the # of blocks in those trees.  btreeblks counts the
> > > +	 * non-root blocks of the free space and rmap btrees.  Do this before
> > > +	 * resetting the AGF counters.
> > > +	 */
> > 
> > Hmm, I'm not quite following the comment wrt to the xfs_mod_fdblocks()
> > below. to_free looks like it's the count of all current btree blocks
> > minus rmap blocks (i.e., old bno/cnt btree blocks). Are we "allocating"
> > those blocks here because we're going to free them later?
> 
> Yes.  Though now that I have a XFS_AG_RESV_IGNORE, maybe I should just
> pass that to xrep_reap_extents in xrep_abt_rebuild_trees and then I can
> skip the racy mod_fdblocks thing here too.
> 

I think I'll ultimately need to see the code to make sure I follow the
ignore thing correctly, but that overall sounds better to me. If we do
retain these kinds of calls to undo/work around underlying
infrastructure, I think we need a bit more specific comments that
describe precisely what behavior the call is offsetting.

> > > +	agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
> > > +
> > > +	/* rmap_blocks accounts root block, btreeblks doesn't */
> > > +	new_btblks = be32_to_cpu(agf->agf_rmap_blocks) - 1;
> > > +
> > > +	/* btreeblks doesn't account bno/cnt root blocks */
> > > +	to_free = pag->pagf_btreeblks + 2;
> > > +
> > > +	/* and don't account for the blocks we aren't freeing */
> > > +	to_free -= new_btblks;
> > > +
> > > +	error = xfs_mod_fdblocks(sc->mp, -(int64_t)to_free, false);
> > > +	if (error)
> > > +		return error;
> > > +
> > > +	/*
> > > +	 * Reset the per-AG info, both incore and ondisk.  Mark the incore
> > > +	 * state stale in case we fail out of here.
> > > +	 */
> > > +	ASSERT(pag->pagf_init);
> > > +	pag->pagf_init = 0;
> > > +	pag->pagf_btreeblks = new_btblks;
> > > +	pag->pagf_freeblks = 0;
> > > +	pag->pagf_longest = 0;
> > > +
> > > +	agf->agf_btreeblks = cpu_to_be32(new_btblks);
> > > +	agf->agf_freeblks = 0;
> > > +	agf->agf_longest = 0;
> > > +	*log_flags |= XFS_AGF_BTREEBLKS | XFS_AGF_LONGEST | XFS_AGF_FREEBLKS;
> > > +
> > > +	return 0;
> > > +}
> > > +
...
> > > +
> > > +/*
> > > + * Make our new freespace btree roots permanent so that we can start freeing
> > > + * unused space back into the AG.
> > > + */
> > > +STATIC int
> > > +xrep_abt_commit_new(
> > > +	struct xfs_scrub	*sc,
> > > +	struct xfs_bitmap	*old_allocbt_blocks,
> > > +	int			log_flags)
> > > +{
> > > +	int			error;
> > > +
> > > +	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags);
> > > +
> > > +	/* Invalidate the old freespace btree blocks and commit. */
> > > +	error = xrep_invalidate_blocks(sc, old_allocbt_blocks);
> > > +	if (error)
> > > +		return error;
> > 
> > It looks like the above invalidation all happens in the same
> > transaction. Those aren't logging buffer data or anything, but any idea
> > how many log formats we can get away with in this single transaction?
> 
> Hm... well, on my computer a log format is ~88 bytes.  Assuming 4K
> blocks, the max AG size of 1TB, maximum free space fragmentation, and
> two btrees, the tree could be up to ~270 million records.  Assuming ~505
> records per block, that's ... ~531,000 leaf blocks and ~1100 node blocks
> for both btrees.  If we invalidate both, that's ~46M of RAM?
> 

I was thinking more about transaction reservation than RAM. It may not
currently be an issue, but it might be worth putting something down in a
comment to note that this is a single transaction and we expect to not
have to invalidate more than N (ballpark) blocks in a single go,
whatever that value happens to be.

> > > +	error = xrep_roll_ag_trans(sc);
> > > +	if (error)
> > > +		return error;
> > > +
> > > +	/* Now that we've succeeded, mark the incore state valid again. */
> > > +	sc->sa.pag->pagf_init = 1;
> > > +	return 0;
> > > +}
> > > +
> > > +/* Build new free space btrees and dispose of the old one. */
> > > +STATIC int
> > > +xrep_abt_rebuild_trees(
> > > +	struct xfs_scrub	*sc,
> > > +	struct list_head	*free_extents,
> > > +	struct xfs_bitmap	*old_allocbt_blocks)
> > > +{
> > > +	struct xfs_owner_info	oinfo;
> > > +	struct xrep_abt_extent	*rae;
> > > +	struct xrep_abt_extent	*n;
> > > +	struct xrep_abt_extent	*longest;
> > > +	int			error;
> > > +
> > > +	xfs_rmap_skip_owner_update(&oinfo);
> > > +
> > > +	/*
> > > +	 * Insert the longest free extent in case it's necessary to
> > > +	 * refresh the AGFL with multiple blocks.  If there is no longest
> > > +	 * extent, we had exactly the free space we needed; we're done.
> > > +	 */
> > 
> > I'm confused by the last sentence. longest should only be NULL if the
> > free space list is empty and haven't we already bailed out with -ENOSPC
> > if that's the case?
> > 
> > > +	longest = xrep_abt_get_longest(free_extents);
> 
> xrep_abt_rebuild_trees is called after we allocate and initialize two
> new btree roots in xrep_abt_reset_btrees.  If free_extents is an empty
> list here, then we found exactly two blocks worth of free space and used
> them to set up new btree roots.
> 

Got it, thanks.

> > > +	if (!longest)
> > > +		goto done;
> > > +	error = xrep_abt_free_extent(sc,
> > > +			XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno),
> > > +			longest->len, &oinfo);
> > > +	list_del(&longest->list);
> > > +	kmem_free(longest);
> > > +	if (error)
> > > +		return error;
> > > +
> > > +	/* Insert records into the new btrees. */
> > > +	list_for_each_entry_safe(rae, n, free_extents, list) {
> > > +		error = xrep_abt_free_extent(sc,
> > > +				XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno),
> > > +				rae->len, &oinfo);
> > > +		if (error)
> > > +			return error;
> > > +		list_del(&rae->list);
> > > +		kmem_free(rae);
> > > +	}
> > 
> > Ok, at this point we've reset the btree roots and we start freeing the
> > free ranges that were discovered via the rmapbt analysis. AFAICT, if we
> > fail or crash at this point, we leave the allocbts in a partially
> > constructed state. I take it that is Ok with respect to the broader
> > repair algorithm because we'd essentially start over by inspecting the
> > rmapbt again on a retry.
> 
> Right.  Though in the crash/shutdown case, you'll end up with the
> filesystem in an offline state at some point before you can retry the
> scrub, it's probably faster to run xfs_repair to fix the damage.
> 

Can we really assume that if we're already up and running an online
repair? The filesystem has to be mountable in that case in the first
place. If we've already reset and started reconstructing the allocation
btrees then I'd think those transactions would recover just fine on a
power loss or something (perhaps not in the event of some other
corruption related shutdown).

> > The blocks allocated for the btrees that we've begun to construct here
> > end up mapped in the rmapbt as we go, right? IIUC, that means we don't
> > necessarily have infinite retries to make sure this completes. IOW,
> > suppose that a first repair attempt finds just enough free space to
> > construct new trees, gets far enough along to consume most of that free
> > space and then crashes. Is it possible that a subsequent repair attempt
> > includes the btree blocks allocated during the previous failed repair
> > attempt in the sum of "old btree blocks" and determines we don't have
> > enough free space to repair?
> 
> Yes, that's a risk of running the free space repair.
> 

Can we improve on that? For example, are the rmapbt entries for the old
allocation btree blocks necessary once we commit the btree resets? If
not, could we remove those entries before we start tree reconstruction?

Alternatively, could we incorporate use of the old btree blocks? As it
is, we discover those blocks simply so we can free them at the end.
Perhaps we could free them sooner or find a more clever means to
reallocate directly from that in-core list? I guess we have to consider
whether they were really valid/sane btree blocks, but either way ISTM
that the old blocks list is essentially invalidated once we reset the
btrees.

Brian

> > > +
> > > +done:
> > > +	/* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */
> > > +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> > > +	return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo,
> > > +			XFS_AG_RESV_NONE);
> > > +}
> > > +
> > ...
> > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
> > > index 0ed68379e551..82f99633a597 100644
> > > --- a/fs/xfs/xfs_extent_busy.c
> > > +++ b/fs/xfs/xfs_extent_busy.c
> > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp(
> > >  		diff = b1->bno - b2->bno;
> > >  	return diff;
> > >  }
> > > +
> > > +/* Are there any busy extents in this AG? */
> > > +bool
> > > +xfs_extent_busy_list_empty(
> > > +	struct xfs_perag	*pag)
> > > +{
> > > +	spin_lock(&pag->pagb_lock);
> > > +	if (pag->pagb_tree.rb_node) {
> > 
> > RB_EMPTY_ROOT()?
> 
> Good suggestion, thank you!
> 
> --D
> 
> > Brian
> > 
> > > +		spin_unlock(&pag->pagb_lock);
> > > +		return false;
> > > +	}
> > > +	spin_unlock(&pag->pagb_lock);
> > > +	return true;
> > > +}
> > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
> > > index 990ab3891971..2f8c73c712c6 100644
> > > --- a/fs/xfs/xfs_extent_busy.h
> > > +++ b/fs/xfs/xfs_extent_busy.h
> > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
> > >  	list_sort(NULL, list, xfs_extent_busy_ag_cmp);
> > >  }
> > >  
> > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
> > > +
> > >  #endif /* __XFS_EXTENT_BUSY_H__ */
> > > 
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Aug. 1, 2018, 4:23 p.m. UTC | #4
On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote:
> On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote:
> > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote:
> > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > 
> > > > Rebuild the free space btrees from the gaps in the rmap btree.
> > > > 
> > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > ---
> > > >  fs/xfs/Makefile             |    1 
> > > >  fs/xfs/scrub/alloc.c        |    1 
> > > >  fs/xfs/scrub/alloc_repair.c |  581 +++++++++++++++++++++++++++++++++++++++++++
> > > >  fs/xfs/scrub/common.c       |    8 +
> > > >  fs/xfs/scrub/repair.h       |    2 
> > > >  fs/xfs/scrub/scrub.c        |    4 
> > > >  fs/xfs/scrub/trace.h        |    2 
> > > >  fs/xfs/xfs_extent_busy.c    |   14 +
> > > >  fs/xfs/xfs_extent_busy.h    |    2 
> > > >  9 files changed, 610 insertions(+), 5 deletions(-)
> > > >  create mode 100644 fs/xfs/scrub/alloc_repair.c
> > > > 
> > > > 
> > > ...
> > > > diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
> > > > new file mode 100644
> > > > index 000000000000..b228c2906de2
> > > > --- /dev/null
> > > > +++ b/fs/xfs/scrub/alloc_repair.c
> > > > @@ -0,0 +1,581 @@
> > > ...
> > > > +/* Record extents that aren't in use from gaps in the rmap records. */
> > > > +STATIC int
> > > > +xrep_abt_walk_rmap(
> > > > +	struct xfs_btree_cur	*cur,
> > > > +	struct xfs_rmap_irec	*rec,
> > > > +	void			*priv)
> > > > +{
> > > > +	struct xrep_abt		*ra = priv;
> > > > +	struct xrep_abt_extent	*rae;
> > > > +	xfs_fsblock_t		fsb;
> > > > +	int			error;
> > > > +
> > > > +	/* Record all the OWN_AG blocks... */
> > > > +	if (rec->rm_owner == XFS_RMAP_OWN_AG) {
> > > > +		fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno,
> > > > +				rec->rm_startblock);
> > > > +		error = xfs_bitmap_set(ra->btlist, fsb, rec->rm_blockcount);
> > > > +		if (error)
> > > > +			return error;
> > > > +	}
> > > > +
> > > > +	/* ...and all the rmapbt blocks... */
> > > > +	error = xfs_bitmap_set_btcur_path(&ra->nobtlist, cur);
> > > > +	if (error)
> > > > +		return error;
> > > > +
> > > > +	/* ...and all the free space. */
> > > > +	if (rec->rm_startblock > ra->next_bno) {
> > > > +		trace_xrep_abt_walk_rmap(cur->bc_mp, cur->bc_private.a.agno,
> > > > +				ra->next_bno, rec->rm_startblock - ra->next_bno,
> > > > +				XFS_RMAP_OWN_NULL, 0, 0);
> > > > +
> > > > +		rae = kmem_alloc(sizeof(struct xrep_abt_extent), KM_MAYFAIL);
> > > > +		if (!rae)
> > > > +			return -ENOMEM;
> > > > +		INIT_LIST_HEAD(&rae->list);
> > > > +		rae->bno = ra->next_bno;
> > > > +		rae->len = rec->rm_startblock - ra->next_bno;
> > > > +		list_add_tail(&rae->list, ra->extlist);
> > > 
> > > Any reason we don't use a bitmap for this one?
> > > 
> 
> ??

Yes, I could probably do that, let's see if it works...

> > > > +		ra->nr_records++;
> > > > +		ra->nr_blocks += rae->len;
> > > > +	}
> > > > +	ra->next_bno = max_t(xfs_agblock_t, ra->next_bno,
> > > > +			rec->rm_startblock + rec->rm_blockcount);
> > > 
> > > The max_t() is to cover the record overlap case, right? If so, another
> > > one liner comment would be good.
> > 
> > Right.  Will add a comment.
> > 
> > > > +	return 0;
> > > > +}
> > > > +
> > > ...
> > > > +/* Free an extent, which creates a record in the bnobt/cntbt. */
> > > > +STATIC int
> > > > +xrep_abt_free_extent(
> > > > +	struct xfs_scrub	*sc,
> > > > +	xfs_fsblock_t		fsbno,
> > > > +	xfs_extlen_t		len,
> > > > +	struct xfs_owner_info	*oinfo)
> > > > +{
> > > > +	int			error;
> > > > +
> > > > +	error = xfs_free_extent(sc->tp, fsbno, len, oinfo, 0);
> > > > +	if (error)
> > > > +		return error;
> > > > +	error = xrep_roll_ag_trans(sc);
> > > > +	if (error)
> > > > +		return error;
> > > > +	return xfs_mod_fdblocks(sc->mp, -(int64_t)len, false);
> > > 
> > > What's this call for? Is it because the blocks we're freeing were
> > > already free? (Similar question on the other xfs_mod_fdblocks() call
> > > further down).
> > 
> > Yes.  The goal here is to free the (already free) extent with no net
> > change in fdblocks...
> > 
> > > BTW, what prevents some other task from coming along and screwing with
> > > this? For example, could a large falloc or buffered write come in and
> > > allocate these global blocks before we take them away here (causing the
> > > whole sequence to fail)?
> > 
> > ...but you're right that here is a window of opportunity for someone to
> > swoop in and reserve the blocks while we still have the AGF locked,
> > which means that we'll fail here even though that other process will
> > never get the space.
> > 
> > Thinking about this a bit more, what we really want to do is to skip the
> > xfs_trans_mod_sb(len) that happens after xfs_free_ag_extent inserts the
> > record into the bno/cntbt.  Hm.  If a record insertion requires an
> > expansion of the bnobt/cntbt, we'll pull blocks from the AGFL, but we
> > separately force those to be accounted to XFS_AG_RESV_AGFL.  Therefore,
> > we could make a "fake" per-AG reservation type that would skip the
> > fdblocks update.  That avoids the problem where we commit the free space
> > record but someone else reserves all the free space and then we blow out
> > with ENOSPC and a half-rebuilt bnobt.
> > 
> 
> Ok, that sounds a bit more straightforward to me.
> 
> > For the second case (which I assume is xrep_abt_reset_counters?) I'll
> > respond below.
> > 
> > > > +}
> > > > +
> ...
> > > > +/*
> > > > + * Reset the global free block counter and the per-AG counters to make it look
> > > > + * like this AG has no free space.
> > > > + */
> > > > +STATIC int
> > > > +xrep_abt_reset_counters(
> > > > +	struct xfs_scrub	*sc,
> > > > +	int			*log_flags)
> > > > +{
> > > > +	struct xfs_perag	*pag = sc->sa.pag;
> > > > +	struct xfs_agf		*agf;
> > > > +	xfs_agblock_t		new_btblks;
> > > > +	xfs_agblock_t		to_free;
> > > > +	int			error;
> > > > +
> > > > +	/*
> > > > +	 * Since we're abandoning the old bnobt/cntbt, we have to decrease
> > > > +	 * fdblocks by the # of blocks in those trees.  btreeblks counts the
> > > > +	 * non-root blocks of the free space and rmap btrees.  Do this before
> > > > +	 * resetting the AGF counters.
> > > > +	 */
> > > 
> > > Hmm, I'm not quite following the comment wrt to the xfs_mod_fdblocks()
> > > below. to_free looks like it's the count of all current btree blocks
> > > minus rmap blocks (i.e., old bno/cnt btree blocks). Are we "allocating"
> > > those blocks here because we're going to free them later?
> > 
> > Yes.  Though now that I have a XFS_AG_RESV_IGNORE, maybe I should just
> > pass that to xrep_reap_extents in xrep_abt_rebuild_trees and then I can
> > skip the racy mod_fdblocks thing here too.
> > 
> 
> I think I'll ultimately need to see the code to make sure I follow the
> ignore thing correctly, but that overall sounds better to me. If we do
> retain these kind of calls to undo/work-around underlying
> infrastructure, I think we need a bit more specific comments that
> describe precisely what behavior the call is offsetting.

I'll push out a new revision after I finish rebasing everything atop
your latest dfops refactoring series.

> > > > +	agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
> > > > +
> > > > +	/* rmap_blocks accounts root block, btreeblks doesn't */
> > > > +	new_btblks = be32_to_cpu(agf->agf_rmap_blocks) - 1;
> > > > +
> > > > +	/* btreeblks doesn't account bno/cnt root blocks */
> > > > +	to_free = pag->pagf_btreeblks + 2;
> > > > +
> > > > +	/* and don't account for the blocks we aren't freeing */
> > > > +	to_free -= new_btblks;
> > > > +
> > > > +	error = xfs_mod_fdblocks(sc->mp, -(int64_t)to_free, false);
> > > > +	if (error)
> > > > +		return error;
> > > > +
> > > > +	/*
> > > > +	 * Reset the per-AG info, both incore and ondisk.  Mark the incore
> > > > +	 * state stale in case we fail out of here.
> > > > +	 */
> > > > +	ASSERT(pag->pagf_init);
> > > > +	pag->pagf_init = 0;
> > > > +	pag->pagf_btreeblks = new_btblks;
> > > > +	pag->pagf_freeblks = 0;
> > > > +	pag->pagf_longest = 0;
> > > > +
> > > > +	agf->agf_btreeblks = cpu_to_be32(new_btblks);
> > > > +	agf->agf_freeblks = 0;
> > > > +	agf->agf_longest = 0;
> > > > +	*log_flags |= XFS_AGF_BTREEBLKS | XFS_AGF_LONGEST | XFS_AGF_FREEBLKS;
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> ...
> > > > +
> > > > +/*
> > > > + * Make our new freespace btree roots permanent so that we can start freeing
> > > > + * unused space back into the AG.
> > > > + */
> > > > +STATIC int
> > > > +xrep_abt_commit_new(
> > > > +	struct xfs_scrub	*sc,
> > > > +	struct xfs_bitmap	*old_allocbt_blocks,
> > > > +	int			log_flags)
> > > > +{
> > > > +	int			error;
> > > > +
> > > > +	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags);
> > > > +
> > > > +	/* Invalidate the old freespace btree blocks and commit. */
> > > > +	error = xrep_invalidate_blocks(sc, old_allocbt_blocks);
> > > > +	if (error)
> > > > +		return error;
> > > 
> > > It looks like the above invalidation all happens in the same
> > > transaction. Those aren't logging buffer data or anything, but any idea
> > > how many log formats we can get away with in this single transaction?
> > 
> > Hm... well, on my computer a log format is ~88 bytes.  Assuming 4K
> > blocks, the max AG size of 1TB, maximum free space fragmentation, and
> > two btrees, the tree could be up to ~270 million records.  Assuming ~505
> > records per block, that's ... ~531,000 leaf blocks and ~1100 node blocks
> > for both btrees.  If we invalidate both, that's ~46M of RAM?
> > 
> 
> I was thinking more about transaction reservation than RAM. It may not

Hmm.  tr_itruncate is ~650K on my 2TB SSD; assuming 88 bytes per log
format item, that's about ... ~7300 log format items?  Not a lot; maybe
it should roll the transaction every 1000 invalidations or so...

> currently be an issue, but it might be worth putting something down in a
> comment to note that this is a single transaction and we expect to not
> have to invalidate more than N (ballpark) blocks in a single go,
> whatever that value happens to be.
> 
> > > > +	error = xrep_roll_ag_trans(sc);
> > > > +	if (error)
> > > > +		return error;
> > > > +
> > > > +	/* Now that we've succeeded, mark the incore state valid again. */
> > > > +	sc->sa.pag->pagf_init = 1;
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +/* Build new free space btrees and dispose of the old one. */
> > > > +STATIC int
> > > > +xrep_abt_rebuild_trees(
> > > > +	struct xfs_scrub	*sc,
> > > > +	struct list_head	*free_extents,
> > > > +	struct xfs_bitmap	*old_allocbt_blocks)
> > > > +{
> > > > +	struct xfs_owner_info	oinfo;
> > > > +	struct xrep_abt_extent	*rae;
> > > > +	struct xrep_abt_extent	*n;
> > > > +	struct xrep_abt_extent	*longest;
> > > > +	int			error;
> > > > +
> > > > +	xfs_rmap_skip_owner_update(&oinfo);
> > > > +
> > > > +	/*
> > > > +	 * Insert the longest free extent in case it's necessary to
> > > > +	 * refresh the AGFL with multiple blocks.  If there is no longest
> > > > +	 * extent, we had exactly the free space we needed; we're done.
> > > > +	 */
> > > 
> > > I'm confused by the last sentence. longest should only be NULL if the
> > > free space list is empty and haven't we already bailed out with -ENOSPC
> > > if that's the case?
> > > 
> > > > +	longest = xrep_abt_get_longest(free_extents);
> > 
> > xrep_abt_rebuild_trees is called after we allocate and initialize two
> > new btree roots in xrep_abt_reset_btrees.  If free_extents is an empty
> > list here, then we found exactly two blocks worth of free space and used
> > them to set up new btree roots.
> > 
> 
> Got it, thanks.
> 
> > > > +	if (!longest)
> > > > +		goto done;
> > > > +	error = xrep_abt_free_extent(sc,
> > > > +			XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno),
> > > > +			longest->len, &oinfo);
> > > > +	list_del(&longest->list);
> > > > +	kmem_free(longest);
> > > > +	if (error)
> > > > +		return error;
> > > > +
> > > > +	/* Insert records into the new btrees. */
> > > > +	list_for_each_entry_safe(rae, n, free_extents, list) {
> > > > +		error = xrep_abt_free_extent(sc,
> > > > +				XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno),
> > > > +				rae->len, &oinfo);
> > > > +		if (error)
> > > > +			return error;
> > > > +		list_del(&rae->list);
> > > > +		kmem_free(rae);
> > > > +	}
> > > 
> > > Ok, at this point we've reset the btree roots and we start freeing the
> > > free ranges that were discovered via the rmapbt analysis. AFAICT, if we
> > > fail or crash at this point, we leave the allocbts in a partially
> > > constructed state. I take it that is Ok with respect to the broader
> > > repair algorithm because we'd essentially start over by inspecting the
> > > rmapbt again on a retry.
> > 
> > Right.  Though in the crash/shutdown case, you'll end up with the
> > filesystem in an offline state at some point before you can retry the
> > scrub, it's probably faster to run xfs_repair to fix the damage.
> > 
> 
> Can we really assume that if we're already up and running an online
> repair? The filesystem has to be mountable in that case in the first
> place. If we've already reset and started reconstructing the allocation
> btrees then I'd think those transactions would recover just fine on a
> power loss or something (perhaps not in the event of some other
> corruption related shutdown).

Right, for the system crash case, whatever transactions committed should
replay just fine, and you can even start up the online repair again, and
if the AG isn't particularly close to ENOSPC then (barring rmap
corruption) it should work just fine.

If the fs went down because either (a) repair hit other corruption or
(b) some other thread hit an error in some other part of the filesystem,
then it's not so clear -- in (b) you could probably try again, but for
(a) you'll definitely have to unmount and run xfs_repair.

Perhaps the guideline here is that if the fs goes down more than once
during online repair then unmount it and run xfs_repair.

> > > The blocks allocated for the btrees that we've begun to construct here
> > > end up mapped in the rmapbt as we go, right? IIUC, that means we don't
> > > necessarily have infinite retries to make sure this completes. IOW,
> > > suppose that a first repair attempt finds just enough free space to
> > > construct new trees, gets far enough along to consume most of that free
> > > space and then crashes. Is it possible that a subsequent repair attempt
> > > includes the btree blocks allocated during the previous failed repair
> > > attempt in the sum of "old btree blocks" and determines we don't have
> > > enough free space to repair?
> > 
> > Yes, that's a risk of running the free space repair.
> > 
> 
> Can we improve on that? For example, are the rmapbt entries for the old
> allocation btree blocks necessary once we commit the btree resets? If
> not, could we remove those entries before we start tree reconstruction?
> 
> Alternatively, could we incorporate use of the old btree blocks? As it
> is, we discover those blocks simply so we can free them at the end.
> Perhaps we could free them sooner or find a more clever means to
> reallocate directly from that in-core list? I guess we have to consider
> whether they were really valid/sane btree blocks, but either way ISTM
> that the old blocks list is essentially invalidated once we reset the
> btrees.

Hmm, it's a little tricky to do that -- we could reap the old bnobt and
cntbt blocks (in the old_allocbt_blocks bitmap) first, but if adding a
record causes a btree split we'll pull blocks from the AGFL, and if
there aren't enough blocks in the bnobt to fill the AGFL back up then
fix_freelist won't succeed.  That complication is why it finds the
longest extent in the unclaimed list and pushes that in first, then
works on the rest of the extents.

I suppose one could try to avoid ENOSPC by pushing that longest extent
in first (since we know that won't trigger a split), then reap the old
alloc btree blocks, and then add everything else back in...

--D

> Brian
> 
> > > > +
> > > > +done:
> > > > +	/* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */
> > > > +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> > > > +	return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo,
> > > > +			XFS_AG_RESV_NONE);
> > > > +}
> > > > +
> > > ...
> > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
> > > > index 0ed68379e551..82f99633a597 100644
> > > > --- a/fs/xfs/xfs_extent_busy.c
> > > > +++ b/fs/xfs/xfs_extent_busy.c
> > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp(
> > > >  		diff = b1->bno - b2->bno;
> > > >  	return diff;
> > > >  }
> > > > +
> > > > +/* Are there any busy extents in this AG? */
> > > > +bool
> > > > +xfs_extent_busy_list_empty(
> > > > +	struct xfs_perag	*pag)
> > > > +{
> > > > +	spin_lock(&pag->pagb_lock);
> > > > +	if (pag->pagb_tree.rb_node) {
> > > 
> > > RB_EMPTY_ROOT()?
> > 
> > Good suggestion, thank you!
> > 
> > --D
> > 
> > > Brian
> > > 
> > > > +		spin_unlock(&pag->pagb_lock);
> > > > +		return false;
> > > > +	}
> > > > +	spin_unlock(&pag->pagb_lock);
> > > > +	return true;
> > > > +}
> > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
> > > > index 990ab3891971..2f8c73c712c6 100644
> > > > --- a/fs/xfs/xfs_extent_busy.h
> > > > +++ b/fs/xfs/xfs_extent_busy.h
> > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
> > > >  	list_sort(NULL, list, xfs_extent_busy_ag_cmp);
> > > >  }
> > > >  
> > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
> > > > +
> > > >  #endif /* __XFS_EXTENT_BUSY_H__ */
> > > > 
> > > > --
> > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > the body of a message to majordomo@vger.kernel.org
> > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Brian Foster Aug. 1, 2018, 6:39 p.m. UTC | #5
On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote:
> On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote:
> > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote:
> > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote:
> > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote:
> > > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > 
> > > > > Rebuild the free space btrees from the gaps in the rmap btree.
> > > > > 
> > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > ---
> > > > >  fs/xfs/Makefile             |    1 
> > > > >  fs/xfs/scrub/alloc.c        |    1 
> > > > >  fs/xfs/scrub/alloc_repair.c |  581 +++++++++++++++++++++++++++++++++++++++++++
> > > > >  fs/xfs/scrub/common.c       |    8 +
> > > > >  fs/xfs/scrub/repair.h       |    2 
> > > > >  fs/xfs/scrub/scrub.c        |    4 
> > > > >  fs/xfs/scrub/trace.h        |    2 
> > > > >  fs/xfs/xfs_extent_busy.c    |   14 +
> > > > >  fs/xfs/xfs_extent_busy.h    |    2 
> > > > >  9 files changed, 610 insertions(+), 5 deletions(-)
> > > > >  create mode 100644 fs/xfs/scrub/alloc_repair.c
> > > > > 
> > > > > 
> > > > ...
> > > > > diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
> > > > > new file mode 100644
> > > > > index 000000000000..b228c2906de2
> > > > > --- /dev/null
> > > > > +++ b/fs/xfs/scrub/alloc_repair.c
> > > > > @@ -0,0 +1,581 @@
...
> > > > > +
> > > > > +/*
> > > > > + * Make our new freespace btree roots permanent so that we can start freeing
> > > > > + * unused space back into the AG.
> > > > > + */
> > > > > +STATIC int
> > > > > +xrep_abt_commit_new(
> > > > > +	struct xfs_scrub	*sc,
> > > > > +	struct xfs_bitmap	*old_allocbt_blocks,
> > > > > +	int			log_flags)
> > > > > +{
> > > > > +	int			error;
> > > > > +
> > > > > +	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags);
> > > > > +
> > > > > +	/* Invalidate the old freespace btree blocks and commit. */
> > > > > +	error = xrep_invalidate_blocks(sc, old_allocbt_blocks);
> > > > > +	if (error)
> > > > > +		return error;
> > > > 
> > > > It looks like the above invalidation all happens in the same
> > > > transaction. Those aren't logging buffer data or anything, but any idea
> > > > how many log formats we can get away with in this single transaction?
> > > 
> > > Hm... well, on my computer a log format is ~88 bytes.  Assuming 4K
> > > blocks, the max AG size of 1TB, maximum free space fragmentation, and
> > > two btrees, the tree could be up to ~270 million records.  Assuming ~505
> > > records per block, that's ... ~531,000 leaf blocks and ~1100 node blocks
> > > for both btrees.  If we invalidate both, that's ~46M of RAM?
> > > 
> > 
> > I was thinking more about transaction reservation than RAM. It may not
> 
> Hmm.  tr_itruncate is ~650K on my 2TB SSD, assuming 88 bytes per, that's
> about ... ~7300 log format items?  Not a lot, maybe it should roll the
> transaction every 1000 invalidations or so...
> 

I'm not really sure what qualifies as a lot here given that the blocks
would need to be in-core, but rolling on some fixed/safe interval sounds
reasonable to me.

> > currently be an issue, but it might be worth putting something down in a
> > comment to note that this is a single transaction and we expect to not
> > have to invalidate more than N (ballpark) blocks in a single go,
> > whatever that value happens to be.
> > 
> > > > > +	error = xrep_roll_ag_trans(sc);
> > > > > +	if (error)
> > > > > +		return error;
> > > > > +
> > > > > +	/* Now that we've succeeded, mark the incore state valid again. */
> > > > > +	sc->sa.pag->pagf_init = 1;
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +/* Build new free space btrees and dispose of the old one. */
> > > > > +STATIC int
> > > > > +xrep_abt_rebuild_trees(
> > > > > +	struct xfs_scrub	*sc,
> > > > > +	struct list_head	*free_extents,
> > > > > +	struct xfs_bitmap	*old_allocbt_blocks)
> > > > > +{
> > > > > +	struct xfs_owner_info	oinfo;
> > > > > +	struct xrep_abt_extent	*rae;
> > > > > +	struct xrep_abt_extent	*n;
> > > > > +	struct xrep_abt_extent	*longest;
> > > > > +	int			error;
> > > > > +
> > > > > +	xfs_rmap_skip_owner_update(&oinfo);
> > > > > +
> > > > > +	/*
> > > > > +	 * Insert the longest free extent in case it's necessary to
> > > > > +	 * refresh the AGFL with multiple blocks.  If there is no longest
> > > > > +	 * extent, we had exactly the free space we needed; we're done.
> > > > > +	 */
> > > > 
> > > > I'm confused by the last sentence. longest should only be NULL if the
> > > > free space list is empty and haven't we already bailed out with -ENOSPC
> > > > if that's the case?
> > > > 
> > > > > +	longest = xrep_abt_get_longest(free_extents);
> > > 
> > > xrep_abt_rebuild_trees is called after we allocate and initialize two
> > > new btree roots in xrep_abt_reset_btrees.  If free_extents is an empty
> > > list here, then we found exactly two blocks worth of free space and used
> > > them to set up new btree roots.
> > > 
> > 
> > Got it, thanks.
> > 
> > > > > +	if (!longest)
> > > > > +		goto done;
> > > > > +	error = xrep_abt_free_extent(sc,
> > > > > +			XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno),
> > > > > +			longest->len, &oinfo);
> > > > > +	list_del(&longest->list);
> > > > > +	kmem_free(longest);
> > > > > +	if (error)
> > > > > +		return error;
> > > > > +
> > > > > +	/* Insert records into the new btrees. */
> > > > > +	list_for_each_entry_safe(rae, n, free_extents, list) {
> > > > > +		error = xrep_abt_free_extent(sc,
> > > > > +				XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno),
> > > > > +				rae->len, &oinfo);
> > > > > +		if (error)
> > > > > +			return error;
> > > > > +		list_del(&rae->list);
> > > > > +		kmem_free(rae);
> > > > > +	}
> > > > 
> > > > Ok, at this point we've reset the btree roots and we start freeing the
> > > > free ranges that were discovered via the rmapbt analysis. AFAICT, if we
> > > > fail or crash at this point, we leave the allocbts in a partially
> > > > constructed state. I take it that is Ok with respect to the broader
> > > > repair algorithm because we'd essentially start over by inspecting the
> > > > rmapbt again on a retry.
> > > 
> > > Right.  Though in the crash/shutdown case, you'll end up with the
> > > filesystem in an offline state at some point before you can retry the
> > > scrub, it's probably faster to run xfs_repair to fix the damage.
> > > 
> > 
> > Can we really assume that if we're already up and running an online
> > repair? The filesystem has to be mountable in that case in the first
> > place. If we've already reset and started reconstructing the allocation
> > btrees then I'd think those transactions would recover just fine on a
> > power loss or something (perhaps not in the event of some other
> > corruption related shutdown).
> 
> Right, for the system crash case, whatever transactions committed should
> replay just fine, and you can even start up the online repair again, and
> if the AG isn't particularly close to ENOSPC then (barring rmap
> corruption) it should work just fine.
> 
> If the fs went down because either (a) repair hit other corruption or
> (b) some other thread hit an error in some other part of the filesystem,
> then it's not so clear -- in (b) you could probably try again, but for
> (a) you'll definitely have to unmount and run xfs_repair.
> 

Indeed, there are certainly cases where we simply won't be able to do an
online repair. I'm trying to think about scenarios where we should be
able to do an online repair, but we lose power or hit some kind of
transient error like a memory allocation failure before it completes. It
would be nice if the online repair itself didn't contribute (within
reason) to the inability to simply try again just because the fs was
close to -ENOSPC.

For one, I think it's potentially confusing behavior. Second, it might
be concerning to regular users who perceive it as an online repair
leaving the fs in a worse-off state. We fs devs know that may not really
be the case, but I think we're better off addressing it if we can
reasonably do so.

> Perhaps the guideline here is that if the fs goes down more than once
> during online repair then unmount it and run xfs_repair.
> 

Yep, I think that makes sense if the filesystem or repair itself is
tripping over other corruptions that fail to keep it active for the
duration of the repair.

> > > > The blocks allocated for the btrees that we've begun to construct here
> > > > end up mapped in the rmapbt as we go, right? IIUC, that means we don't
> > > > necessarily have infinite retries to make sure this completes. IOW,
> > > > suppose that a first repair attempt finds just enough free space to
> > > > construct new trees, gets far enough along to consume most of that free
> > > > space and then crashes. Is it possible that a subsequent repair attempt
> > > > includes the btree blocks allocated during the previous failed repair
> > > > attempt in the sum of "old btree blocks" and determines we don't have
> > > > enough free space to repair?
> > > 
> > > Yes, that's a risk of running the free space repair.
> > > 
> > 
> > Can we improve on that? For example, are the rmapbt entries for the old
> > allocation btree blocks necessary once we commit the btree resets? If
> > not, could we remove those entries before we start tree reconstruction?
> > 
> > Alternatively, could we incorporate use of the old btree blocks? As it
> > is, we discover those blocks simply so we can free them at the end.
> > Perhaps we could free them sooner or find a more clever means to
> > reallocate directly from that in-core list? I guess we have to consider
> > whether they were really valid/sane btree blocks, but either way ISTM
> > that the old blocks list is essentially invalidated once we reset the
> > btrees.
> 
> Hmm, it's a little tricky to do that -- we could reap the old bnobt and
> cntbt blocks (in the old_allocbt_blocks bitmap) first, but if adding a
> record causes a btree split we'll pull blocks from the AGFL, and if
> there aren't enough blocks in the bnobt to fill the AGFL back up then
> fix_freelist won't succeed.  That complication is why it finds the
> longest extent in the unclaimed list and pushes that in first, then
> works on the rest of the extents.
> 

Hmm, but doesn't a btree split require at least one full free space btree
block per level? In conjunction, the agfl minimum size requirement grows
with the height of the tree, which implies available free space..? I
could be missing something; perhaps we have to account for the rmapbt in
that case as well? Regardless...

> I suppose one could try to avoid ENOSPC by pushing that longest extent
> in first (since we know that won't trigger a split), then reap the old
> alloc btree blocks, and then add everything else back in...
> 

I think it would be reasonable to seed the btree with the longest record
or some fixed number of longest records (~1/2 a root block, for example)
before making actual use of the btrees to reap the old blocks. I think
then you'd only have a very short window of a single block leak on a
poorly timed power loss and repair retry sequence before you start
actually freeing originally used space (which in practice, I think
solves the problem).

Given that we're starting from empty, I wonder if another option may be
to overfill the agfl with old btree blocks or something. The first real
free should shift enough blocks back into the btrees to ensure the agfl
can be managed from that point forward, right? That may be more work
than it's worth though and/or a job for another patch. (FWIW, we also
have that NOSHRINK agfl fixup flag for userspace repair.)

Brian

> --D
> 
> > Brian
> > 
> > > > > +
> > > > > +done:
> > > > > +	/* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */
> > > > > +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> > > > > +	return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo,
> > > > > +			XFS_AG_RESV_NONE);
> > > > > +}
> > > > > +
> > > > ...
> > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
> > > > > index 0ed68379e551..82f99633a597 100644
> > > > > --- a/fs/xfs/xfs_extent_busy.c
> > > > > +++ b/fs/xfs/xfs_extent_busy.c
> > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp(
> > > > >  		diff = b1->bno - b2->bno;
> > > > >  	return diff;
> > > > >  }
> > > > > +
> > > > > +/* Are there any busy extents in this AG? */
> > > > > +bool
> > > > > +xfs_extent_busy_list_empty(
> > > > > +	struct xfs_perag	*pag)
> > > > > +{
> > > > > +	spin_lock(&pag->pagb_lock);
> > > > > +	if (pag->pagb_tree.rb_node) {
> > > > 
> > > > RB_EMPTY_ROOT()?
> > > 
> > > Good suggestion, thank you!
> > > 
> > > --D
> > > 
> > > > Brian
> > > > 
> > > > > +		spin_unlock(&pag->pagb_lock);
> > > > > +		return false;
> > > > > +	}
> > > > > +	spin_unlock(&pag->pagb_lock);
> > > > > +	return true;
> > > > > +}
> > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
> > > > > index 990ab3891971..2f8c73c712c6 100644
> > > > > --- a/fs/xfs/xfs_extent_busy.h
> > > > > +++ b/fs/xfs/xfs_extent_busy.h
> > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
> > > > >  	list_sort(NULL, list, xfs_extent_busy_ag_cmp);
> > > > >  }
> > > > >  
> > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
> > > > > +
> > > > >  #endif /* __XFS_EXTENT_BUSY_H__ */
> > > > > 
> > > > > --
> > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > the body of a message to majordomo@vger.kernel.org
> > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > --
> > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > the body of a message to majordomo@vger.kernel.org
> > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Aug. 2, 2018, 6:28 a.m. UTC | #6
On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote:
> On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote:
> > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote:
> > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote:
> > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote:
> > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote:
> > > > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > > 
> > > > > > Rebuild the free space btrees from the gaps in the rmap btree.
> > > > > > 
> > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > > ---
> > > > > >  fs/xfs/Makefile             |    1 
> > > > > >  fs/xfs/scrub/alloc.c        |    1 
> > > > > >  fs/xfs/scrub/alloc_repair.c |  581 +++++++++++++++++++++++++++++++++++++++++++
> > > > > >  fs/xfs/scrub/common.c       |    8 +
> > > > > >  fs/xfs/scrub/repair.h       |    2 
> > > > > >  fs/xfs/scrub/scrub.c        |    4 
> > > > > >  fs/xfs/scrub/trace.h        |    2 
> > > > > >  fs/xfs/xfs_extent_busy.c    |   14 +
> > > > > >  fs/xfs/xfs_extent_busy.h    |    2 
> > > > > >  9 files changed, 610 insertions(+), 5 deletions(-)
> > > > > >  create mode 100644 fs/xfs/scrub/alloc_repair.c
> > > > > > 
> > > > > > 
> > > > > ...
> > > > > > diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
> > > > > > new file mode 100644
> > > > > > index 000000000000..b228c2906de2
> > > > > > --- /dev/null
> > > > > > +++ b/fs/xfs/scrub/alloc_repair.c
> > > > > > @@ -0,0 +1,581 @@
> ...
> > > > > > +
> > > > > > +/*
> > > > > > + * Make our new freespace btree roots permanent so that we can start freeing
> > > > > > + * unused space back into the AG.
> > > > > > + */
> > > > > > +STATIC int
> > > > > > +xrep_abt_commit_new(
> > > > > > +	struct xfs_scrub	*sc,
> > > > > > +	struct xfs_bitmap	*old_allocbt_blocks,
> > > > > > +	int			log_flags)
> > > > > > +{
> > > > > > +	int			error;
> > > > > > +
> > > > > > +	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags);
> > > > > > +
> > > > > > +	/* Invalidate the old freespace btree blocks and commit. */
> > > > > > +	error = xrep_invalidate_blocks(sc, old_allocbt_blocks);
> > > > > > +	if (error)
> > > > > > +		return error;
> > > > > 
> > > > > It looks like the above invalidation all happens in the same
> > > > > transaction. Those aren't logging buffer data or anything, but any idea
> > > > > how many log formats we can get away with in this single transaction?
> > > > 
> > > > Hm... well, on my computer a log format is ~88 bytes.  Assuming 4K
> > > > blocks, the max AG size of 1TB, maximum free space fragmentation, and
> > > > two btrees, the tree could be up to ~270 million records.  Assuming ~505
> > > > records per block, that's ... ~531,000 leaf blocks and ~1100 node blocks
> > > > for both btrees.  If we invalidate both, that's ~46M of RAM?
> > > > 
> > > 
> > > I was thinking more about transaction reservation than RAM. It may not
> > 
> > Hmm.  tr_itruncate is ~650K on my 2TB SSD, assuming 88 bytes per, that's
> > about ... ~7300 log format items?  Not a lot, maybe it should roll the
> > transaction every 1000 invalidations or so...
> > 
> 
> I'm not really sure what categorizes as a lot here given that the blocks
> would need to be in-core, but rolling on some fixed/safe interval sounds
> reasonable to me.
> 
> > > currently be an issue, but it might be worth putting something down in a
> > > comment to note that this is a single transaction and we expect to not
> > > have to invalidate more than N (ballpark) blocks in a single go,
> > > whatever that value happens to be.
> > > 
> > > > > > +	error = xrep_roll_ag_trans(sc);
> > > > > > +	if (error)
> > > > > > +		return error;
> > > > > > +
> > > > > > +	/* Now that we've succeeded, mark the incore state valid again. */
> > > > > > +	sc->sa.pag->pagf_init = 1;
> > > > > > +	return 0;
> > > > > > +}
> > > > > > +
> > > > > > +/* Build new free space btrees and dispose of the old one. */
> > > > > > +STATIC int
> > > > > > +xrep_abt_rebuild_trees(
> > > > > > +	struct xfs_scrub	*sc,
> > > > > > +	struct list_head	*free_extents,
> > > > > > +	struct xfs_bitmap	*old_allocbt_blocks)
> > > > > > +{
> > > > > > +	struct xfs_owner_info	oinfo;
> > > > > > +	struct xrep_abt_extent	*rae;
> > > > > > +	struct xrep_abt_extent	*n;
> > > > > > +	struct xrep_abt_extent	*longest;
> > > > > > +	int			error;
> > > > > > +
> > > > > > +	xfs_rmap_skip_owner_update(&oinfo);
> > > > > > +
> > > > > > +	/*
> > > > > > +	 * Insert the longest free extent in case it's necessary to
> > > > > > +	 * refresh the AGFL with multiple blocks.  If there is no longest
> > > > > > +	 * extent, we had exactly the free space we needed; we're done.
> > > > > > +	 */
> > > > > 
> > > > > I'm confused by the last sentence. longest should only be NULL if the
> > > > > free space list is empty and haven't we already bailed out with -ENOSPC
> > > > > if that's the case?
> > > > > 
> > > > > > +	longest = xrep_abt_get_longest(free_extents);
> > > > 
> > > > xrep_abt_rebuild_trees is called after we allocate and initialize two
> > > > new btree roots in xrep_abt_reset_btrees.  If free_extents is an empty
> > > > list here, then we found exactly two blocks worth of free space and used
> > > > them to set up new btree roots.
> > > > 
> > > 
> > > Got it, thanks.
> > > 
> > > > > > +	if (!longest)
> > > > > > +		goto done;
> > > > > > +	error = xrep_abt_free_extent(sc,
> > > > > > +			XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno),
> > > > > > +			longest->len, &oinfo);
> > > > > > +	list_del(&longest->list);
> > > > > > +	kmem_free(longest);
> > > > > > +	if (error)
> > > > > > +		return error;
> > > > > > +
> > > > > > +	/* Insert records into the new btrees. */
> > > > > > +	list_for_each_entry_safe(rae, n, free_extents, list) {
> > > > > > +		error = xrep_abt_free_extent(sc,
> > > > > > +				XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno),
> > > > > > +				rae->len, &oinfo);
> > > > > > +		if (error)
> > > > > > +			return error;
> > > > > > +		list_del(&rae->list);
> > > > > > +		kmem_free(rae);
> > > > > > +	}
> > > > > 
> > > > > Ok, at this point we've reset the btree roots and we start freeing the
> > > > > free ranges that were discovered via the rmapbt analysis. AFAICT, if we
> > > > > fail or crash at this point, we leave the allocbts in a partially
> > > > > constructed state. I take it that is Ok with respect to the broader
> > > > > repair algorithm because we'd essentially start over by inspecting the
> > > > > rmapbt again on a retry.
> > > > 
> > > > Right.  Though in the crash/shutdown case, you'll end up with the
> > > > filesystem in an offline state at some point before you can retry the
> > > > scrub, it's probably faster to run xfs_repair to fix the damage.
> > > > 
> > > 
> > > Can we really assume that if we're already up and running an online
> > > repair? The filesystem has to be mountable in that case in the first
> > > place. If we've already reset and started reconstructing the allocation
> > > btrees then I'd think those transactions would recover just fine on a
> > > power loss or something (perhaps not in the event of some other
> > > corruption related shutdown).
> > 
> > Right, for the system crash case, whatever transactions committed should
> > replay just fine, and you can even start up the online repair again, and
> > if the AG isn't particularly close to ENOSPC then (barring rmap
> > corruption) it should work just fine.
> > 
> > If the fs went down because either (a) repair hit other corruption or
> > (b) some other thread hit an error in some other part of the filesystem,
> > then it's not so clear -- in (b) you could probably try again, but for
> > (a) you'll definitely have to unmount and run xfs_repair.
> > 
> 
> Indeed, there are certainly cases where we simply won't be able to do an
> online repair. I'm trying to think about scenarios where we should be
> able to do an online repair, but we lose power or hit some kind of
> transient error like a memory allocation failure before it completes. It
> would be nice if the online repair itself didn't contribute (within
> reason) to the inability to simply try again just because the fs was
> close to -ENOSPC.

Agreed.  Most of the, uh, opportunities to hit ENOMEM happen before we
start modifying on-disk metadata.  If that happens, we just free all the
memory and bail out having done nothing.

> For one, I think it's potentially confusing behavior. Second, it might
> be concerning to regular users who perceive it as an online repair
> leaving the fs in a worse off state. Us fs devs know that may not really
> be the case, but I think we're better for addressing it if we can
> reasonably do so.

<nod> Further in the future I want to add the ability to offline an AG,
so the worst that happens is that scrub turns the AG off, repair doesn't
fix it, and the AG simply stays offline.  That might give us the
ability to survive cancelling the repair transaction, since if the AG's
offline already anyway we could just throw away the dirty buffers and
resurrect the AG later.  I don't know, that's purely speculative.

> > Perhaps the guideline here is that if the fs goes down more than once
> > during online repair then unmount it and run xfs_repair.
> > 
> 
> Yep, I think that makes sense if the filesystem or repair itself is
> tripping over other corruptions that fail to keep it active for the
> duration of the repair.

<nod>

> > > > > The blocks allocated for the btrees that we've begun to construct here
> > > > > end up mapped in the rmapbt as we go, right? IIUC, that means we don't
> > > > > necessarily have infinite retries to make sure this completes. IOW,
> > > > > suppose that a first repair attempt finds just enough free space to
> > > > > construct new trees, gets far enough along to consume most of that free
> > > > > space and then crashes. Is it possible that a subsequent repair attempt
> > > > > includes the btree blocks allocated during the previous failed repair
> > > > > attempt in the sum of "old btree blocks" and determines we don't have
> > > > > enough free space to repair?
> > > > 
> > > > Yes, that's a risk of running the free space repair.
> > > > 
> > > 
> > > Can we improve on that? For example, are the rmapbt entries for the old
> > > allocation btree blocks necessary once we commit the btree resets? If
> > > not, could we remove those entries before we start tree reconstruction?
> > > 
> > > Alternatively, could we incorporate use of the old btree blocks? As it
> > > is, we discover those blocks simply so we can free them at the end.
> > > Perhaps we could free them sooner or find a more clever means to
> > > reallocate directly from that in-core list? I guess we have to consider
> > > whether they were really valid/sane btree blocks, but either way ISTM
> > > that the old blocks list is essentially invalidated once we reset the
> > > btrees.
> > 
> > Hmm, it's a little tricky to do that -- we could reap the old bnobt and
> > cntbt blocks (in the old_allocbt_blocks bitmap) first, but if adding a
> > record causes a btree split we'll pull blocks from the AGFL, and if
> > there aren't enough blocks in the bnobt to fill the AGFL back up then
> > fix_freelist won't succeed.  That complication is why it finds the
> > longest extent in the unclaimed list and pushes that in first, then
> > works on the rest of the extents.
> > 
> 
> Hmm, but doesn't a btree split require at least one full space btree
> block per-level? In conjunction, the agfl minimum size requirement grows
> with the height of the tree, which implies available free space..? I
> could be missing something, perhaps we have to account for the rmapbt in
> that case as well? Regardless...
> 
> > I suppose one could try to avoid ENOSPC by pushing that longest extent
> > in first (since we know that won't trigger a split), then reap the old
> > alloc btree blocks, and then add everything else back in...
> > 
> 
> I think it would be reasonable to seed the btree with the longest record
> or some fixed number of longest records (~1/2 a root block, for example)
> before making actual use of the btrees to reap the old blocks. I think
> then you'd only have a very short window of a single block leak on a
> poorly timed power loss and repair retry sequence before you start
> actually freeing originally used space (which in practice, I think
> solves the problem).
> 
> Given that we're starting from empty, I wonder if another option may be
> to over fill the agfl with old btree blocks or something. The first real
> free should shift enough blocks back into the btrees to ensure the agfl
> can be managed from that point forward, right? That may be more work
> than it's worth though and/or a job for another patch. (FWIW, we also
> have that NOSHRINK agfl fixup flag for userspace repair.)

Yes, I'll give that a try tomorrow, now that I've finished porting all
the 4.19 stuff to xfsprogs. :)

Looping back to something we discussed earlier in this thread, I'd
prefer to hold off on converting the list of already-freed extents to
xfs_bitmap.  The same problem exists in all the repair functions: each
has to store a large number of records for the rebuilt btree.  Maybe
there's some way to <cough> use pageable memory for that, since the
access patterns are append, sort, and iterate; for those three uses we
don't necessarily require all the records to be in memory all the time.
For the allocbt repair I expect the free space records to be far more
numerous than the list of old bnobt/cntbt blocks.

--D

> Brian
> 
> > --D
> > 
> > > Brian
> > > 
> > > > > > +
> > > > > > +done:
> > > > > > +	/* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */
> > > > > > +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> > > > > > +	return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo,
> > > > > > +			XFS_AG_RESV_NONE);
> > > > > > +}
> > > > > > +
> > > > > ...
> > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
> > > > > > index 0ed68379e551..82f99633a597 100644
> > > > > > --- a/fs/xfs/xfs_extent_busy.c
> > > > > > +++ b/fs/xfs/xfs_extent_busy.c
> > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp(
> > > > > >  		diff = b1->bno - b2->bno;
> > > > > >  	return diff;
> > > > > >  }
> > > > > > +
> > > > > > +/* Are there any busy extents in this AG? */
> > > > > > +bool
> > > > > > +xfs_extent_busy_list_empty(
> > > > > > +	struct xfs_perag	*pag)
> > > > > > +{
> > > > > > +	spin_lock(&pag->pagb_lock);
> > > > > > +	if (pag->pagb_tree.rb_node) {
> > > > > 
> > > > > RB_EMPTY_ROOT()?
> > > > 
> > > > Good suggestion, thank you!
> > > > 
> > > > --D
> > > > 
> > > > > Brian
> > > > > 
> > > > > > +		spin_unlock(&pag->pagb_lock);
> > > > > > +		return false;
> > > > > > +	}
> > > > > > +	spin_unlock(&pag->pagb_lock);
> > > > > > +	return true;
> > > > > > +}
> > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
> > > > > > index 990ab3891971..2f8c73c712c6 100644
> > > > > > --- a/fs/xfs/xfs_extent_busy.h
> > > > > > +++ b/fs/xfs/xfs_extent_busy.h
> > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
> > > > > >  	list_sort(NULL, list, xfs_extent_busy_ag_cmp);
> > > > > >  }
> > > > > >  
> > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
> > > > > > +
> > > > > >  #endif /* __XFS_EXTENT_BUSY_H__ */
> > > > > > 
> > > > > > --
> > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > --
> > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > the body of a message to majordomo@vger.kernel.org
> > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > --
> > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > the body of a message to majordomo@vger.kernel.org
> > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Brian Foster Aug. 2, 2018, 1:48 p.m. UTC | #7
On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote:
> On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote:
> > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote:
> > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote:
> > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote:
> > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote:
> > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote:
> > > > > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > > > 
> > > > > > > Rebuild the free space btrees from the gaps in the rmap btree.
> > > > > > > 
> > > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > > > ---
> > > > > > >  fs/xfs/Makefile             |    1 
> > > > > > >  fs/xfs/scrub/alloc.c        |    1 
> > > > > > >  fs/xfs/scrub/alloc_repair.c |  581 +++++++++++++++++++++++++++++++++++++++++++
> > > > > > >  fs/xfs/scrub/common.c       |    8 +
> > > > > > >  fs/xfs/scrub/repair.h       |    2 
> > > > > > >  fs/xfs/scrub/scrub.c        |    4 
> > > > > > >  fs/xfs/scrub/trace.h        |    2 
> > > > > > >  fs/xfs/xfs_extent_busy.c    |   14 +
> > > > > > >  fs/xfs/xfs_extent_busy.h    |    2 
> > > > > > >  9 files changed, 610 insertions(+), 5 deletions(-)
> > > > > > >  create mode 100644 fs/xfs/scrub/alloc_repair.c
> > > > > > > 
> > > > > > > 
> > > > > > ...
> > > > > > > diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
> > > > > > > new file mode 100644
> > > > > > > index 000000000000..b228c2906de2
> > > > > > > --- /dev/null
> > > > > > > +++ b/fs/xfs/scrub/alloc_repair.c
> > > > > > > @@ -0,0 +1,581 @@
> > ...
> > > > > > > +
> > > > > > > +/*
> > > > > > > + * Make our new freespace btree roots permanent so that we can start freeing
> > > > > > > + * unused space back into the AG.
> > > > > > > + */
> > > > > > > +STATIC int
> > > > > > > +xrep_abt_commit_new(
> > > > > > > +	struct xfs_scrub	*sc,
> > > > > > > +	struct xfs_bitmap	*old_allocbt_blocks,
> > > > > > > +	int			log_flags)
> > > > > > > +{
> > > > > > > +	int			error;
> > > > > > > +
> > > > > > > +	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags);
> > > > > > > +
> > > > > > > +	/* Invalidate the old freespace btree blocks and commit. */
> > > > > > > +	error = xrep_invalidate_blocks(sc, old_allocbt_blocks);
> > > > > > > +	if (error)
> > > > > > > +		return error;
> > > > > > 
> > > > > > It looks like the above invalidation all happens in the same
> > > > > > transaction. Those aren't logging buffer data or anything, but any idea
> > > > > > how many log formats we can get away with in this single transaction?
> > > > > 
> > > > > Hm... well, on my computer a log format is ~88 bytes.  Assuming 4K
> > > > > blocks, the max AG size of 1TB, maximum free space fragmentation, and
> > > > > two btrees, the tree could be up to ~270 million records.  Assuming ~505
> > > > > records per block, that's ... ~531,000 leaf blocks and ~1100 node blocks
> > > > > for both btrees.  If we invalidate both, that's ~46M of RAM?
> > > > > 
> > > > 
> > > > I was thinking more about transaction reservation than RAM. It may not
> > > 
> > > Hmm.  tr_itruncate is ~650K on my 2TB SSD, assuming 88 bytes per, that's
> > > about ... ~7300 log format items?  Not a lot, maybe it should roll the
> > > transaction every 1000 invalidations or so...
> > > 
> > 
> > I'm not really sure what qualifies as a lot here given that the blocks
> > would need to be in-core, but rolling on some fixed/safe interval sounds
> > reasonable to me.
> > 
> > > > currently be an issue, but it might be worth putting something down in a
> > > > comment to note that this is a single transaction and we expect to not
> > > > have to invalidate more than N (ballpark) blocks in a single go,
> > > > whatever that value happens to be.
> > > > 
> > > > > > > +	error = xrep_roll_ag_trans(sc);
> > > > > > > +	if (error)
> > > > > > > +		return error;
> > > > > > > +
> > > > > > > +	/* Now that we've succeeded, mark the incore state valid again. */
> > > > > > > +	sc->sa.pag->pagf_init = 1;
> > > > > > > +	return 0;
> > > > > > > +}
> > > > > > > +
> > > > > > > +/* Build new free space btrees and dispose of the old one. */
> > > > > > > +STATIC int
> > > > > > > +xrep_abt_rebuild_trees(
> > > > > > > +	struct xfs_scrub	*sc,
> > > > > > > +	struct list_head	*free_extents,
> > > > > > > +	struct xfs_bitmap	*old_allocbt_blocks)
> > > > > > > +{
> > > > > > > +	struct xfs_owner_info	oinfo;
> > > > > > > +	struct xrep_abt_extent	*rae;
> > > > > > > +	struct xrep_abt_extent	*n;
> > > > > > > +	struct xrep_abt_extent	*longest;
> > > > > > > +	int			error;
> > > > > > > +
> > > > > > > +	xfs_rmap_skip_owner_update(&oinfo);
> > > > > > > +
> > > > > > > +	/*
> > > > > > > +	 * Insert the longest free extent in case it's necessary to
> > > > > > > +	 * refresh the AGFL with multiple blocks.  If there is no longest
> > > > > > > +	 * extent, we had exactly the free space we needed; we're done.
> > > > > > > +	 */
> > > > > > 
> > > > > > I'm confused by the last sentence. longest should only be NULL if the
> > > > > > free space list is empty and haven't we already bailed out with -ENOSPC
> > > > > > if that's the case?
> > > > > > 
> > > > > > > +	longest = xrep_abt_get_longest(free_extents);
> > > > > 
> > > > > xrep_abt_rebuild_trees is called after we allocate and initialize two
> > > > > new btree roots in xrep_abt_reset_btrees.  If free_extents is an empty
> > > > > list here, then we found exactly two blocks worth of free space and used
> > > > > them to set up new btree roots.
> > > > > 
> > > > 
> > > > Got it, thanks.
> > > > 
> > > > > > > +	if (!longest)
> > > > > > > +		goto done;
> > > > > > > +	error = xrep_abt_free_extent(sc,
> > > > > > > +			XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno),
> > > > > > > +			longest->len, &oinfo);
> > > > > > > +	list_del(&longest->list);
> > > > > > > +	kmem_free(longest);
> > > > > > > +	if (error)
> > > > > > > +		return error;
> > > > > > > +
> > > > > > > +	/* Insert records into the new btrees. */
> > > > > > > +	list_for_each_entry_safe(rae, n, free_extents, list) {
> > > > > > > +		error = xrep_abt_free_extent(sc,
> > > > > > > +				XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno),
> > > > > > > +				rae->len, &oinfo);
> > > > > > > +		if (error)
> > > > > > > +			return error;
> > > > > > > +		list_del(&rae->list);
> > > > > > > +		kmem_free(rae);
> > > > > > > +	}
> > > > > > 
> > > > > > Ok, at this point we've reset the btree roots and we start freeing the
> > > > > > free ranges that were discovered via the rmapbt analysis. AFAICT, if we
> > > > > > fail or crash at this point, we leave the allocbts in a partially
> > > > > > constructed state. I take it that is Ok with respect to the broader
> > > > > > repair algorithm because we'd essentially start over by inspecting the
> > > > > > rmapbt again on a retry.
> > > > > 
> > > > > > Right.  However, in the crash/shutdown case, you'll end up with the
> > > > > > filesystem in an offline state at some point before you can retry the
> > > > > > scrub, so it's probably faster to run xfs_repair to fix the damage.
> > > > > 
> > > > 
> > > > Can we really assume that if we're already up and running an online
> > > > repair? The filesystem has to be mountable in that case in the first
> > > > place. If we've already reset and started reconstructing the allocation
> > > > btrees then I'd think those transactions would recover just fine on a
> > > > power loss or something (perhaps not in the event of some other
> > > > corruption related shutdown).
> > > 
> > > Right, for the system crash case, whatever transactions committed should
> > > replay just fine, and you can even start up the online repair again, and
> > > if the AG isn't particularly close to ENOSPC then (barring rmap
> > > corruption) it should work just fine.
> > > 
> > > If the fs went down because either (a) repair hit other corruption or
> > > (b) some other thread hit an error in some other part of the filesystem,
> > > then it's not so clear -- in (b) you could probably try again, but for
> > > (a) you'll definitely have to unmount and run xfs_repair.
> > > 
> > 
> > Indeed, there are certainly cases where we simply won't be able to do an
> > online repair. I'm trying to think about scenarios where we should be
> > able to do an online repair, but we lose power or hit some kind of
> > transient error like a memory allocation failure before it completes. It
> > would be nice if the online repair itself didn't contribute (within
> > reason) to the inability to simply try again just because the fs was
> > close to -ENOSPC.
> 
> Agreed.  Most of the, uh, opportunities to hit ENOMEM happen before we
> start modifying on-disk metadata.  If that happens, we just free all the
> memory and bail out having done nothing.
> 
> > For one, I think it's potentially confusing behavior. Second, it might
> > be concerning to regular users who perceive it as an online repair
> > leaving the fs in a worse off state. Us fs devs know that may not really
> > be the case, but I think we're better for addressing it if we can
> > reasonably do so.
> 
> <nod> Further in the future I want to add the ability to offline an AG,
> so the worst that happens is that scrub turns the AG off, repair doesn't
> fix it, and the AG simply stays offline.  That might give us the
> ability to survive cancelling the repair transaction, since if the AG's
> offline already anyway we could just throw away the dirty buffers and
> resurrect the AG later.  I don't know, that's purely speculative.
> 
> > > Perhaps the guideline here is that if the fs goes down more than once
> > > during online repair then unmount it and run xfs_repair.
> > > 
> > 
> > Yep, I think that makes sense if the filesystem or repair itself is
> > tripping over other corruptions that fail to keep it active for the
> > duration of the repair.
> 
> <nod>
> 
> > > > > > The blocks allocated for the btrees that we've begun to construct here
> > > > > > end up mapped in the rmapbt as we go, right? IIUC, that means we don't
> > > > > > necessarily have infinite retries to make sure this completes. IOW,
> > > > > > suppose that a first repair attempt finds just enough free space to
> > > > > > construct new trees, gets far enough along to consume most of that free
> > > > > > space and then crashes. Is it possible that a subsequent repair attempt
> > > > > > includes the btree blocks allocated during the previous failed repair
> > > > > > attempt in the sum of "old btree blocks" and determines we don't have
> > > > > > enough free space to repair?
> > > > > 
> > > > > Yes, that's a risk of running the free space repair.
> > > > > 
> > > > 
> > > > Can we improve on that? For example, are the rmapbt entries for the old
> > > > allocation btree blocks necessary once we commit the btree resets? If
> > > > not, could we remove those entries before we start tree reconstruction?
> > > > 
> > > > Alternatively, could we incorporate use of the old btree blocks? As it
> > > > is, we discover those blocks simply so we can free them at the end.
> > > > Perhaps we could free them sooner or find a more clever means to
> > > > reallocate directly from that in-core list? I guess we have to consider
> > > > whether they were really valid/sane btree blocks, but either way ISTM
> > > > that the old blocks list is essentially invalidated once we reset the
> > > > btrees.
> > > 
> > > Hmm, it's a little tricky to do that -- we could reap the old bnobt and
> > > cntbt blocks (in the old_allocbt_blocks bitmap) first, but if adding a
> > > record causes a btree split we'll pull blocks from the AGFL, and if
> > > there aren't enough blocks in the bnobt to fill the AGFL back up then
> > > fix_freelist won't succeed.  That complication is why it finds the
> > > longest extent in the unclaimed list and pushes that in first, then
> > > works on the rest of the extents.
> > > 
> > 
> > Hmm, but doesn't a btree split require at least one full space btree
> > block per-level? In conjunction, the agfl minimum size requirement grows
> > with the height of the tree, which implies available free space..? I
> > could be missing something, perhaps we have to account for the rmapbt in
> > that case as well? Regardless...
> > 
> > > I suppose one could try to avoid ENOSPC by pushing that longest extent
> > > in first (since we know that won't trigger a split), then reap the old
> > > alloc btree blocks, and then add everything else back in...
> > > 
> > 
> > I think it would be reasonable to seed the btree with the longest record
> > or some fixed number of longest records (~1/2 a root block, for example)
> > before making actual use of the btrees to reap the old blocks. I think
> > then you'd only have a very short window of a single block leak on a
> > poorly timed power loss and repair retry sequence before you start
> > actually freeing originally used space (which in practice, I think
> > solves the problem).
> > 
> > Given that we're starting from empty, I wonder if another option may be
> > to overfill the agfl with old btree blocks or something. The first real
> > free should shift enough blocks back into the btrees to ensure the agfl
> > can be managed from that point forward, right? That may be more work
> > than it's worth though and/or a job for another patch. (FWIW, we also
> > have that NOSHRINK agfl fixup flag for userspace repair.)
> 
> Yes, I'll give that a try tomorrow, now that I've finished porting all
> the 4.19 stuff to xfsprogs. :)
> 
> Looping back to something we discussed earlier in this thread, I'd
> prefer to hold off on converting the list of already-freed extents to
> xfs_bitmap because the same problem exists in all the repair functions
> of having to store a large number of records for the rebuilt btree, and
> maybe there's some way to <cough> use pageable memory for that, since
> the access patterns for that are append, sort, and iterate; for those
> three uses we don't necessarily require all the records to be in memory
> all the time.  For the allocbt repair I expect the free space records to
> be far more numerous than the list of old bnobt/cntbt blocks.
> 

Ok, it's fair enough that we'll probably want to find some kind of
generic, more efficient technique for handling this across the various
applicable repair algorithms.

One other high level thing that crossed my mind with regard to the
general btree reconstruction algorithms is whether we need to build up
this kind of central record list at all. For example, rather than slurp
up the entire list of btree records in-core, sort it and dump it back
out, could we take advantage of the fact that our existing on-disk
structure insertion mechanisms already handle out of order records
(simply stated, an extent free knows how to insert the associated record
at the right place in the space btrees)? For example, suppose we reset
the existing btrees first, then scanned the rmapbt and repopulated the
new btrees as records were discovered?

The obvious problem is that we still have some checks that allow the
whole repair operation to bail out before we determine whether we can
start to rebuild the on-disk btrees. These are things like making sure
we can actually read the associated rmapbt blocks (i.e., no read errors
or verifier failures), basic record sanity checks, etc. But ISTM that
isn't anything we couldn't get around with a multi-pass implementation.
Secondary issues might be things like no longer being able to easily
insert the longest free extent range(s) first (meaning we'd have to
stuff the agfl with old btree blocks or figure out some other approach).

BTW, isn't the typical scrub sequence already multi-pass by virtue of
the xfs_scrub_metadata() implementation? I'm wondering if the ->scrub()
callout could not only detect corruption, but validate whether repair
(if requested) is possible based on the kind of checks that are
currently in the repair side rmapbt walkers. Thoughts? Are there future
changes that are better supported by an in-core tracking structure in
general (assuming we'll eventually replace the linked lists with
something more efficient) as opposed to attempting to optimize out the
need for that tracking at all?

Brian

> --D
> 
> > Brian
> > 
> > > --D
> > > 
> > > > Brian
> > > > 
> > > > > > > +
> > > > > > > +done:
> > > > > > > +	/* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */
> > > > > > > +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> > > > > > > +	return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo,
> > > > > > > +			XFS_AG_RESV_NONE);
> > > > > > > +}
> > > > > > > +
> > > > > > ...
> > > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
> > > > > > > index 0ed68379e551..82f99633a597 100644
> > > > > > > --- a/fs/xfs/xfs_extent_busy.c
> > > > > > > +++ b/fs/xfs/xfs_extent_busy.c
> > > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp(
> > > > > > >  		diff = b1->bno - b2->bno;
> > > > > > >  	return diff;
> > > > > > >  }
> > > > > > > +
> > > > > > > +/* Are there any busy extents in this AG? */
> > > > > > > +bool
> > > > > > > +xfs_extent_busy_list_empty(
> > > > > > > +	struct xfs_perag	*pag)
> > > > > > > +{
> > > > > > > +	spin_lock(&pag->pagb_lock);
> > > > > > > +	if (pag->pagb_tree.rb_node) {
> > > > > > 
> > > > > > RB_EMPTY_ROOT()?
> > > > > 
> > > > > Good suggestion, thank you!
> > > > > 
> > > > > --D
> > > > > 
> > > > > > Brian
> > > > > > 
> > > > > > > +		spin_unlock(&pag->pagb_lock);
> > > > > > > +		return false;
> > > > > > > +	}
> > > > > > > +	spin_unlock(&pag->pagb_lock);
> > > > > > > +	return true;
> > > > > > > +}
> > > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
> > > > > > > index 990ab3891971..2f8c73c712c6 100644
> > > > > > > --- a/fs/xfs/xfs_extent_busy.h
> > > > > > > +++ b/fs/xfs/xfs_extent_busy.h
> > > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
> > > > > > >  	list_sort(NULL, list, xfs_extent_busy_ag_cmp);
> > > > > > >  }
> > > > > > >  
> > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
> > > > > > > +
> > > > > > >  #endif /* __XFS_EXTENT_BUSY_H__ */
> > > > > > > 
> > > > > > > --
> > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > --
> > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > --
> > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > the body of a message to majordomo@vger.kernel.org
> > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > --
> > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > the body of a message to majordomo@vger.kernel.org
> > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Aug. 2, 2018, 7:22 p.m. UTC | #8
On Thu, Aug 02, 2018 at 09:48:24AM -0400, Brian Foster wrote:
> On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote:
> > On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote:
> > > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote:
> > > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote:
> > > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote:
> > > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote:
> > > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote:
> > > > > > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > > > > 
> > > > > > > > Rebuild the free space btrees from the gaps in the rmap btree.
> > > > > > > > 
> > > > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > > > > ---
> > > > > > > >  fs/xfs/Makefile             |    1 
> > > > > > > >  fs/xfs/scrub/alloc.c        |    1 
> > > > > > > >  fs/xfs/scrub/alloc_repair.c |  581 +++++++++++++++++++++++++++++++++++++++++++
> > > > > > > >  fs/xfs/scrub/common.c       |    8 +
> > > > > > > >  fs/xfs/scrub/repair.h       |    2 
> > > > > > > >  fs/xfs/scrub/scrub.c        |    4 
> > > > > > > >  fs/xfs/scrub/trace.h        |    2 
> > > > > > > >  fs/xfs/xfs_extent_busy.c    |   14 +
> > > > > > > >  fs/xfs/xfs_extent_busy.h    |    2 
> > > > > > > >  9 files changed, 610 insertions(+), 5 deletions(-)
> > > > > > > >  create mode 100644 fs/xfs/scrub/alloc_repair.c
> > > > > > > > 
> > > > > > > > 
> > > > > > > ...
> > > > > > > > diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
> > > > > > > > new file mode 100644
> > > > > > > > index 000000000000..b228c2906de2
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/fs/xfs/scrub/alloc_repair.c
> > > > > > > > @@ -0,0 +1,581 @@
> > > ...
> > > > > > > > +
> > > > > > > > +/*
> > > > > > > > + * Make our new freespace btree roots permanent so that we can start freeing
> > > > > > > > + * unused space back into the AG.
> > > > > > > > + */
> > > > > > > > +STATIC int
> > > > > > > > +xrep_abt_commit_new(
> > > > > > > > +	struct xfs_scrub	*sc,
> > > > > > > > +	struct xfs_bitmap	*old_allocbt_blocks,
> > > > > > > > +	int			log_flags)
> > > > > > > > +{
> > > > > > > > +	int			error;
> > > > > > > > +
> > > > > > > > +	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags);
> > > > > > > > +
> > > > > > > > +	/* Invalidate the old freespace btree blocks and commit. */
> > > > > > > > +	error = xrep_invalidate_blocks(sc, old_allocbt_blocks);
> > > > > > > > +	if (error)
> > > > > > > > +		return error;
> > > > > > > 
> > > > > > > It looks like the above invalidation all happens in the same
> > > > > > > transaction. Those aren't logging buffer data or anything, but any idea
> > > > > > > how many log formats we can get away with in this single transaction?
> > > > > > 
> > > > > > Hm... well, on my computer a log format is ~88 bytes.  Assuming 4K
> > > > > > blocks, the max AG size of 1TB, maximum free space fragmentation, and
> > > > > > two btrees, the tree could be up to ~270 million records.  Assuming ~505
> > > > > > records per block, that's ... ~531,000 leaf blocks and ~1100 node blocks
> > > > > > for both btrees.  If we invalidate both, that's ~46M of RAM?
> > > > > > 
> > > > > 
> > > > > I was thinking more about transaction reservation than RAM. It may not
> > > > 
> > > > Hmm.  tr_itruncate is ~650K on my 2TB SSD, assuming 88 bytes per, that's
> > > > about ... ~7300 log format items?  Not a lot, maybe it should roll the
> > > > transaction every 1000 invalidations or so...
> > > > 
> > > 
> > > I'm not really sure what qualifies as a lot here given that the blocks
> > > would need to be in-core, but rolling on some fixed/safe interval sounds
> > > reasonable to me.
> > > 
> > > > > currently be an issue, but it might be worth putting something down in a
> > > > > comment to note that this is a single transaction and we expect to not
> > > > > have to invalidate more than N (ballpark) blocks in a single go,
> > > > > whatever that value happens to be.
> > > > > 
> > > > > > > > +	error = xrep_roll_ag_trans(sc);
> > > > > > > > +	if (error)
> > > > > > > > +		return error;
> > > > > > > > +
> > > > > > > > +	/* Now that we've succeeded, mark the incore state valid again. */
> > > > > > > > +	sc->sa.pag->pagf_init = 1;
> > > > > > > > +	return 0;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +/* Build new free space btrees and dispose of the old one. */
> > > > > > > > +STATIC int
> > > > > > > > +xrep_abt_rebuild_trees(
> > > > > > > > +	struct xfs_scrub	*sc,
> > > > > > > > +	struct list_head	*free_extents,
> > > > > > > > +	struct xfs_bitmap	*old_allocbt_blocks)
> > > > > > > > +{
> > > > > > > > +	struct xfs_owner_info	oinfo;
> > > > > > > > +	struct xrep_abt_extent	*rae;
> > > > > > > > +	struct xrep_abt_extent	*n;
> > > > > > > > +	struct xrep_abt_extent	*longest;
> > > > > > > > +	int			error;
> > > > > > > > +
> > > > > > > > +	xfs_rmap_skip_owner_update(&oinfo);
> > > > > > > > +
> > > > > > > > +	/*
> > > > > > > > +	 * Insert the longest free extent in case it's necessary to
> > > > > > > > +	 * refresh the AGFL with multiple blocks.  If there is no longest
> > > > > > > > +	 * extent, we had exactly the free space we needed; we're done.
> > > > > > > > +	 */
> > > > > > > 
> > > > > > > I'm confused by the last sentence. longest should only be NULL if the
> > > > > > > free space list is empty and haven't we already bailed out with -ENOSPC
> > > > > > > if that's the case?
> > > > > > > 
> > > > > > > > +	longest = xrep_abt_get_longest(free_extents);
> > > > > > 
> > > > > > xrep_abt_rebuild_trees is called after we allocate and initialize two
> > > > > > new btree roots in xrep_abt_reset_btrees.  If free_extents is an empty
> > > > > > list here, then we found exactly two blocks worth of free space and used
> > > > > > them to set up new btree roots.
> > > > > > 
> > > > > 
> > > > > Got it, thanks.
> > > > > 
> > > > > > > > +	if (!longest)
> > > > > > > > +		goto done;
> > > > > > > > +	error = xrep_abt_free_extent(sc,
> > > > > > > > +			XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno),
> > > > > > > > +			longest->len, &oinfo);
> > > > > > > > +	list_del(&longest->list);
> > > > > > > > +	kmem_free(longest);
> > > > > > > > +	if (error)
> > > > > > > > +		return error;
> > > > > > > > +
> > > > > > > > +	/* Insert records into the new btrees. */
> > > > > > > > +	list_for_each_entry_safe(rae, n, free_extents, list) {
> > > > > > > > +		error = xrep_abt_free_extent(sc,
> > > > > > > > +				XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno),
> > > > > > > > +				rae->len, &oinfo);
> > > > > > > > +		if (error)
> > > > > > > > +			return error;
> > > > > > > > +		list_del(&rae->list);
> > > > > > > > +		kmem_free(rae);
> > > > > > > > +	}
> > > > > > > 
> > > > > > > Ok, at this point we've reset the btree roots and we start freeing the
> > > > > > > free ranges that were discovered via the rmapbt analysis. AFAICT, if we
> > > > > > > fail or crash at this point, we leave the allocbts in a partially
> > > > > > > constructed state. I take it that is Ok with respect to the broader
> > > > > > > repair algorithm because we'd essentially start over by inspecting the
> > > > > > > rmapbt again on a retry.
> > > > > > 
> > > > > > > Right.  However, in the crash/shutdown case, you'll end up with the
> > > > > > > filesystem in an offline state at some point before you can retry the
> > > > > > > scrub, so it's probably faster to run xfs_repair to fix the damage.
> > > > > > 
> > > > > 
> > > > > Can we really assume that if we're already up and running an online
> > > > > repair? The filesystem has to be mountable in that case in the first
> > > > > place. If we've already reset and started reconstructing the allocation
> > > > > btrees then I'd think those transactions would recover just fine on a
> > > > > power loss or something (perhaps not in the event of some other
> > > > > corruption related shutdown).
> > > > 
> > > > Right, for the system crash case, whatever transactions committed should
> > > > replay just fine, and you can even start up the online repair again, and
> > > > if the AG isn't particularly close to ENOSPC then (barring rmap
> > > > corruption) it should work just fine.
> > > > 
> > > > If the fs went down because either (a) repair hit other corruption or
> > > > (b) some other thread hit an error in some other part of the filesystem,
> > > > then it's not so clear -- in (b) you could probably try again, but for
> > > > (a) you'll definitely have to unmount and run xfs_repair.
> > > > 
> > > 
> > > Indeed, there are certainly cases where we simply won't be able to do an
> > > online repair. I'm trying to think about scenarios where we should be
> > > able to do an online repair, but we lose power or hit some kind of
> > > transient error like a memory allocation failure before it completes. It
> > > would be nice if the online repair itself didn't contribute (within
> > > reason) to the inability to simply try again just because the fs was
> > > close to -ENOSPC.
> > 
> > Agreed.  Most of the, uh, opportunities to hit ENOMEM happen before we
> > start modifying on-disk metadata.  If that happens, we just free all the
> > memory and bail out having done nothing.
> > 
> > > For one, I think it's potentially confusing behavior. Second, it might
> > > be concerning to regular users who perceive it as an online repair
> > > leaving the fs in a worse off state. Us fs devs know that may not really
> > > be the case, but I think we're better for addressing it if we can
> > > reasonably do so.
> > 
> > <nod> Further in the future I want to add the ability to offline an AG,
> > so the worst that happens is that scrub turns the AG off, repair doesn't
> > fix it, and the AG simply stays offline.  That might give us the
> > ability to survive cancelling the repair transaction, since if the AG's
> > offline already anyway we could just throw away the dirty buffers and
> > resurrect the AG later.  I don't know, that's purely speculative.
> > 
> > > > Perhaps the guideline here is that if the fs goes down more than once
> > > > during online repair then unmount it and run xfs_repair.
> > > > 
> > > 
> > > Yep, I think that makes sense if the filesystem or repair itself is
> > > tripping over other corruptions that fail to keep it active for the
> > > duration of the repair.
> > 
> > <nod>
> > 
> > > > > > > The blocks allocated for the btrees that we've begun to construct here
> > > > > > > end up mapped in the rmapbt as we go, right? IIUC, that means we don't
> > > > > > > necessarily have infinite retries to make sure this completes. IOW,
> > > > > > > suppose that a first repair attempt finds just enough free space to
> > > > > > > construct new trees, gets far enough along to consume most of that free
> > > > > > > space and then crashes. Is it possible that a subsequent repair attempt
> > > > > > > includes the btree blocks allocated during the previous failed repair
> > > > > > > attempt in the sum of "old btree blocks" and determines we don't have
> > > > > > > enough free space to repair?
> > > > > > 
> > > > > > Yes, that's a risk of running the free space repair.
> > > > > > 
> > > > > 
> > > > > Can we improve on that? For example, are the rmapbt entries for the old
> > > > > allocation btree blocks necessary once we commit the btree resets? If
> > > > > not, could we remove those entries before we start tree reconstruction?
> > > > > 
> > > > > Alternatively, could we incorporate use of the old btree blocks? As it
> > > > > is, we discover those blocks simply so we can free them at the end.
> > > > > Perhaps we could free them sooner or find a more clever means to
> > > > > reallocate directly from that in-core list? I guess we have to consider
> > > > > whether they were really valid/sane btree blocks, but either way ISTM
> > > > > that the old blocks list is essentially invalidated once we reset the
> > > > > btrees.
> > > > 
> > > > Hmm, it's a little tricky to do that -- we could reap the old bnobt and
> > > > cntbt blocks (in the old_allocbt_blocks bitmap) first, but if adding a
> > > > record causes a btree split we'll pull blocks from the AGFL, and if
> > > > there aren't enough blocks in the bnobt to fill the AGFL back up then
> > > > fix_freelist won't succeed.  That complication is why it finds the
> > > > longest extent in the unclaimed list and pushes that in first, then
> > > > works on the rest of the extents.
> > > > 
> > > 
> > > Hmm, but doesn't a btree split require at least one full space btree
> > > block per-level? In conjunction, the agfl minimum size requirement grows
> > > with the height of the tree, which implies available free space..? I
> > > could be missing something, perhaps we have to account for the rmapbt in
> > > that case as well? Regardless...
> > > 
> > > > I suppose one could try to avoid ENOSPC by pushing that longest extent
> > > > in first (since we know that won't trigger a split), then reap the old
> > > > alloc btree blocks, and then add everything else back in...
> > > > 
> > > 
> > > I think it would be reasonable to seed the btree with the longest record
> > > or some fixed number of longest records (~1/2 a root block, for example)
> > > before making actual use of the btrees to reap the old blocks. I think
> > > then you'd only have a very short window of a single block leak on a
> > > poorly timed power loss and repair retry sequence before you start
> > > actually freeing originally used space (which in practice, I think
> > > solves the problem).
> > > 
> > > Given that we're starting from empty, I wonder if another option may be
> > > to over fill the agfl with old btree blocks or something. The first real
> > > free should shift enough blocks back into the btrees to ensure the agfl
> > > can be managed from that point forward, right? That may be more work
> > > than it's worth though and/or a job for another patch. (FWIW, we also
> > > have that NOSHRINK agfl fixup flag for userspace repair.)
> > 
> > Yes, I'll give that a try tomorrow, now that I've finished porting all
> > the 4.19 stuff to xfsprogs. :)
> > 
> > Looping back to something we discussed earlier in this thread, I'd
> > prefer to hold off on converting the list of already-freed extents to
> > xfs_bitmap because the same problem exists in all the repair functions
> > of having to store a large number of records for the rebuilt btree, and
> > maybe there's some way to <cough> use pageable memory for that, since
> > the access patterns for that are append, sort, and iterate; for those
> > three uses we don't necessarily require all the records to be in memory
> > all the time.  For the allocbt repair I expect the free space records to
> > be far more numerous than the list of old bnobt/cntbt blocks.
> > 
> 
> Ok, it's fair enough that we'll probably want to find some kind of
> generic, more efficient technique for handling this across the various
> applicable repair algorithms.
> 
> One other high level thing that crossed my mind with regard to the
> general btree reconstruction algorithms is whether we need to build up
> this kind of central record list at all. For example, rather than slurp
> up the entire list of btree records in-core, sort it and dump it back
> out, could we take advantage of the fact that our existing on-disk
> structure insertion mechanisms already handle out of order records
> (simply stated, an extent free knows how to insert the associated record
> at the right place in the space btrees)? For example, suppose we reset
> the existing btrees first, then scanned the rmapbt and repopulated the
> new btrees as records are discovered..?

I tried that in an earlier draft of the bnobt repair function.  The
biggest problem with inserting as we go is dealing with the inevitable
transaction rolls (right now we roll after every record insertion to avoid
playing games with guessing how much reservation is left).  Btree
cursor state can't survive transaction rolls because the transaction
commit releases all the buffers that aren't bhold'en, and we can't bhold
that many buffers across a _defer_finish.

So, that early draft spent a lot of time tearing down and reconstructing
rmapbt cursors since the standard _btree_query_all isn't suited to that
kind of usage.  It was easily twice as slow on a RAM-backed disk just
from the rmap cursor overhead and much more complex, so I rewrote it to
be simpler.  I also have a slight preference for not touching anything
until we're absolutely sure we have all the data we need to repair the
structure.

For other repair functions (like the data/attr fork repairs) we have to
scan all the rmapbts for extents, and I'd prefer to lock those AGs only
for as long as necessary to extract the extents we want.

> The obvious problem is that we still have some checks that allow the
> whole repair operation to bail out before we determine whether we can
> start to rebuild the on-disk btrees. These are things like making sure
> we can actually read the associated rmapbt blocks (i.e., no read errors
> or verifier failures), basic record sanity checks, etc. But ISTM that
> isn't anything we couldn't get around with a multi-pass implementation.
> Secondary issues might be things like no longer being able to easily
> insert the longest free extent range(s) first (meaning we'd have to
> stuff the agfl with old btree blocks or figure out some other approach).

Well, you could scan the rmapbt twice -- once to find the longest
record, then again to do the actual insertion.

> BTW, isn't the typical scrub sequence already multi-pass by virtue of
> the xfs_scrub_metadata() implementation? I'm wondering if the ->scrub()
> callout could not only detect corruption, but validate whether repair
> (if requested) is possible based on the kind of checks that are
> currently in the repair side rmapbt walkers. Thoughts?

Yes, scrub basically validates that for us now, with the notable
exception of the notorious rmapbt scrubber, which doesn't
cross-reference with inode block mappings because that would be a
locking nightmare.

> Are there future
> changes that are better supported by an in-core tracking structure in
> general (assuming we'll eventually replace the linked lists with
> something more efficient) as opposed to attempting to optimize out the
> need for that tracking at all?

Well, I was thinking that we could just allocate a memfd (or a file on
the same xfs once we have AG offlining) and store the records in there.
That saves us the list_head overhead and potentially enables access to a
lot more storage than pinning things in RAM.

--D

> Brian
> 
> > --D
> > 
> > > Brian
> > > 
> > > > --D
> > > > 
> > > > > Brian
> > > > > 
> > > > > > > > +
> > > > > > > > +done:
> > > > > > > > +	/* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */
> > > > > > > > +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> > > > > > > > +	return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo,
> > > > > > > > +			XFS_AG_RESV_NONE);
> > > > > > > > +}
> > > > > > > > +
> > > > > > > ...
> > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
> > > > > > > > index 0ed68379e551..82f99633a597 100644
> > > > > > > > --- a/fs/xfs/xfs_extent_busy.c
> > > > > > > > +++ b/fs/xfs/xfs_extent_busy.c
> > > > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp(
> > > > > > > >  		diff = b1->bno - b2->bno;
> > > > > > > >  	return diff;
> > > > > > > >  }
> > > > > > > > +
> > > > > > > > +/* Are there any busy extents in this AG? */
> > > > > > > > +bool
> > > > > > > > +xfs_extent_busy_list_empty(
> > > > > > > > +	struct xfs_perag	*pag)
> > > > > > > > +{
> > > > > > > > +	spin_lock(&pag->pagb_lock);
> > > > > > > > +	if (pag->pagb_tree.rb_node) {
> > > > > > > 
> > > > > > > RB_EMPTY_ROOT()?
> > > > > > 
> > > > > > Good suggestion, thank you!
> > > > > > 
> > > > > > --D
> > > > > > 
> > > > > > > Brian
> > > > > > > 
> > > > > > > > +		spin_unlock(&pag->pagb_lock);
> > > > > > > > +		return false;
> > > > > > > > +	}
> > > > > > > > +	spin_unlock(&pag->pagb_lock);
> > > > > > > > +	return true;
> > > > > > > > +}
> > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
> > > > > > > > index 990ab3891971..2f8c73c712c6 100644
> > > > > > > > --- a/fs/xfs/xfs_extent_busy.h
> > > > > > > > +++ b/fs/xfs/xfs_extent_busy.h
> > > > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
> > > > > > > >  	list_sort(NULL, list, xfs_extent_busy_ag_cmp);
> > > > > > > >  }
> > > > > > > >  
> > > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
> > > > > > > > +
> > > > > > > >  #endif /* __XFS_EXTENT_BUSY_H__ */
> > > > > > > > 
> > > > > > > > --
> > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > --
> > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > --
> > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > --
> > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > the body of a message to majordomo@vger.kernel.org
> > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Brian Foster Aug. 3, 2018, 10:49 a.m. UTC | #9
On Thu, Aug 02, 2018 at 12:22:05PM -0700, Darrick J. Wong wrote:
> On Thu, Aug 02, 2018 at 09:48:24AM -0400, Brian Foster wrote:
> > On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote:
> > > On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote:
> > > > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote:
> > > > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote:
> > > > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote:
> > > > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote:
> > > > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote:
> > > > > > > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > > > > > 
> > > > > > > > > Rebuild the free space btrees from the gaps in the rmap btree.
> > > > > > > > > 
> > > > > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > > > > > ---
> > > > > > > > >  fs/xfs/Makefile             |    1 
> > > > > > > > >  fs/xfs/scrub/alloc.c        |    1 
> > > > > > > > >  fs/xfs/scrub/alloc_repair.c |  581 +++++++++++++++++++++++++++++++++++++++++++
> > > > > > > > >  fs/xfs/scrub/common.c       |    8 +
> > > > > > > > >  fs/xfs/scrub/repair.h       |    2 
> > > > > > > > >  fs/xfs/scrub/scrub.c        |    4 
> > > > > > > > >  fs/xfs/scrub/trace.h        |    2 
> > > > > > > > >  fs/xfs/xfs_extent_busy.c    |   14 +
> > > > > > > > >  fs/xfs/xfs_extent_busy.h    |    2 
> > > > > > > > >  9 files changed, 610 insertions(+), 5 deletions(-)
> > > > > > > > >  create mode 100644 fs/xfs/scrub/alloc_repair.c
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > ...
> > > > > > > > > diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
> > > > > > > > > new file mode 100644
> > > > > > > > > index 000000000000..b228c2906de2
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/fs/xfs/scrub/alloc_repair.c
> > > > > > > > > @@ -0,0 +1,581 @@
> > > > ...
> > > > > > > > > +
> > > > > > > > > +/*
> > > > > > > > > + * Make our new freespace btree roots permanent so that we can start freeing
> > > > > > > > > + * unused space back into the AG.
> > > > > > > > > + */
> > > > > > > > > +STATIC int
> > > > > > > > > +xrep_abt_commit_new(
> > > > > > > > > +	struct xfs_scrub	*sc,
> > > > > > > > > +	struct xfs_bitmap	*old_allocbt_blocks,
> > > > > > > > > +	int			log_flags)
> > > > > > > > > +{
> > > > > > > > > +	int			error;
> > > > > > > > > +
> > > > > > > > > +	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags);
> > > > > > > > > +
> > > > > > > > > +	/* Invalidate the old freespace btree blocks and commit. */
> > > > > > > > > +	error = xrep_invalidate_blocks(sc, old_allocbt_blocks);
> > > > > > > > > +	if (error)
> > > > > > > > > +		return error;
> > > > > > > > 
> > > > > > > > It looks like the above invalidation all happens in the same
> > > > > > > > transaction. Those aren't logging buffer data or anything, but any idea
> > > > > > > > how many log formats we can get away with in this single transaction?
> > > > > > > 
> > > > > > > Hm... well, on my computer a log format is ~88 bytes.  Assuming 4K
> > > > > > > blocks, the max AG size of 1TB, maximum free space fragmentation, and
> > > > > > > two btrees, the tree could be up to ~270 million records.  Assuming ~505
> > > > > > > records per block, that's ... ~531,000 leaf blocks and ~1100 node blocks
> > > > > > > for both btrees.  If we invalidate both, that's ~46M of RAM?
> > > > > > > 
> > > > > > 
> > > > > > I was thinking more about transaction reservation than RAM. It may not
> > > > > 
> > > > > Hmm.  tr_itruncate is ~650K on my 2TB SSD, assuming 88 bytes per, that's
> > > > > about ... ~7300 log format items?  Not a lot, maybe it should roll the
> > > > > transaction every 1000 invalidations or so...
> > > > > 
> > > > 
> > > > I'm not really sure what categorizes as a lot here given that the blocks
> > > > would need to be in-core, but rolling on some fixed/safe interval sounds
> > > > reasonable to me.
> > > > 
> > > > > > currently be an issue, but it might be worth putting something down in a
> > > > > > comment to note that this is a single transaction and we expect to not
> > > > > > have to invalidate more than N (ballpark) blocks in a single go,
> > > > > > whatever that value happens to be.
> > > > > > 
> > > > > > > > > +	error = xrep_roll_ag_trans(sc);
> > > > > > > > > +	if (error)
> > > > > > > > > +		return error;
> > > > > > > > > +
> > > > > > > > > +	/* Now that we've succeeded, mark the incore state valid again. */
> > > > > > > > > +	sc->sa.pag->pagf_init = 1;
> > > > > > > > > +	return 0;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +/* Build new free space btrees and dispose of the old one. */
> > > > > > > > > +STATIC int
> > > > > > > > > +xrep_abt_rebuild_trees(
> > > > > > > > > +	struct xfs_scrub	*sc,
> > > > > > > > > +	struct list_head	*free_extents,
> > > > > > > > > +	struct xfs_bitmap	*old_allocbt_blocks)
> > > > > > > > > +{
> > > > > > > > > +	struct xfs_owner_info	oinfo;
> > > > > > > > > +	struct xrep_abt_extent	*rae;
> > > > > > > > > +	struct xrep_abt_extent	*n;
> > > > > > > > > +	struct xrep_abt_extent	*longest;
> > > > > > > > > +	int			error;
> > > > > > > > > +
> > > > > > > > > +	xfs_rmap_skip_owner_update(&oinfo);
> > > > > > > > > +
> > > > > > > > > +	/*
> > > > > > > > > +	 * Insert the longest free extent in case it's necessary to
> > > > > > > > > +	 * refresh the AGFL with multiple blocks.  If there is no longest
> > > > > > > > > +	 * extent, we had exactly the free space we needed; we're done.
> > > > > > > > > +	 */
> > > > > > > > 
> > > > > > > > I'm confused by the last sentence. longest should only be NULL if the
> > > > > > > > free space list is empty and haven't we already bailed out with -ENOSPC
> > > > > > > > if that's the case?
> > > > > > > > 
> > > > > > > > > +	longest = xrep_abt_get_longest(free_extents);
> > > > > > > 
> > > > > > > xrep_abt_rebuild_trees is called after we allocate and initialize two
> > > > > > > new btree roots in xrep_abt_reset_btrees.  If free_extents is an empty
> > > > > > > list here, then we found exactly two blocks worth of free space and used
> > > > > > > them to set up new btree roots.
> > > > > > > 
> > > > > > 
> > > > > > Got it, thanks.
> > > > > > 
> > > > > > > > > +	if (!longest)
> > > > > > > > > +		goto done;
> > > > > > > > > +	error = xrep_abt_free_extent(sc,
> > > > > > > > > +			XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno),
> > > > > > > > > +			longest->len, &oinfo);
> > > > > > > > > +	list_del(&longest->list);
> > > > > > > > > +	kmem_free(longest);
> > > > > > > > > +	if (error)
> > > > > > > > > +		return error;
> > > > > > > > > +
> > > > > > > > > +	/* Insert records into the new btrees. */
> > > > > > > > > +	list_for_each_entry_safe(rae, n, free_extents, list) {
> > > > > > > > > +		error = xrep_abt_free_extent(sc,
> > > > > > > > > +				XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno),
> > > > > > > > > +				rae->len, &oinfo);
> > > > > > > > > +		if (error)
> > > > > > > > > +			return error;
> > > > > > > > > +		list_del(&rae->list);
> > > > > > > > > +		kmem_free(rae);
> > > > > > > > > +	}
> > > > > > > > 
> > > > > > > > Ok, at this point we've reset the btree roots and we start freeing the
> > > > > > > > free ranges that were discovered via the rmapbt analysis. AFAICT, if we
> > > > > > > > fail or crash at this point, we leave the allocbts in a partially
> > > > > > > > constructed state. I take it that is Ok with respect to the broader
> > > > > > > > repair algorithm because we'd essentially start over by inspecting the
> > > > > > > > rmapbt again on a retry.
> > > > > > > 
> > > > > > > Right.  Though in the crash/shutdown case, you'll end up with the
> > > > > > > filesystem in an offline state at some point before you can retry the
> > > > > > > scrub, it's probably faster to run xfs_repair to fix the damage.
> > > > > > > 
> > > > > > 
> > > > > > Can we really assume that if we're already up and running an online
> > > > > > repair? The filesystem has to be mountable in that case in the first
> > > > > > place. If we've already reset and started reconstructing the allocation
> > > > > > btrees then I'd think those transactions would recover just fine on a
> > > > > > power loss or something (perhaps not in the event of some other
> > > > > > corruption related shutdown).
> > > > > 
> > > > > Right, for the system crash case, whatever transactions committed should
> > > > > replay just fine, and you can even start up the online repair again, and
> > > > > if the AG isn't particularly close to ENOSPC then (barring rmap
> > > > > corruption) it should work just fine.
> > > > > 
> > > > > If the fs went down because either (a) repair hit other corruption or
> > > > > (b) some other thread hit an error in some other part of the filesystem,
> > > > > then it's not so clear -- in (b) you could probably try again, but for
> > > > > (a) you'll definitely have to unmount and run xfs_repair.
> > > > > 
> > > > 
> > > > Indeed, there are certainly cases where we simply won't be able to do an
> > > > online repair. I'm trying to think about scenarios where we should be
> > > > able to do an online repair, but we lose power or hit some kind of
> > > > transient error like a memory allocation failure before it completes. It
> > > > would be nice if the online repair itself didn't contribute (within
> > > > reason) to the inability to simply try again just because the fs was
> > > > close to -ENOSPC.
> > > 
> > > Agreed.  Most of the, uh, opportunities to hit ENOMEM happen before we
> > > start modifying on-disk metadata.  If that happens, we just free all the
> > > memory and bail out having done nothing.
> > > 
> > > > For one, I think it's potentially confusing behavior. Second, it might
> > > > be concerning to regular users who perceive it as an online repair
> > > > leaving the fs in a worse off state. Us fs devs know that may not really
> > > > be the case, but I think we're better for addressing it if we can
> > > > reasonably do so.
> > > 
> > > <nod> Further in the future I want to add the ability to offline an AG,
> > > so the worst that happens is that scrub turns the AG off, repair doesn't
> > > fix it, and the AG simply stays offline.  That might give us the
> > > ability to survive cancelling the repair transaction, since if the AG's
> > > offline already anyway we could just throw away the dirty buffers and
> > > resurrect the AG later.  I don't know, that's purely speculative.
> > > 
> > > > > Perhaps the guideline here is that if the fs goes down more than once
> > > > > during online repair then unmount it and run xfs_repair.
> > > > > 
> > > > 
> > > > Yep, I think that makes sense if the filesystem or repair itself is
> > > > tripping over other corruptions that fail to keep it active for the
> > > > duration of the repair.
> > > 
> > > <nod>
> > > 
> > > > > > > > The blocks allocated for the btrees that we've begun to construct here
> > > > > > > > end up mapped in the rmapbt as we go, right? IIUC, that means we don't
> > > > > > > > necessarily have infinite retries to make sure this completes. IOW,
> > > > > > > > suppose that a first repair attempt finds just enough free space to
> > > > > > > > construct new trees, gets far enough along to consume most of that free
> > > > > > > > space and then crashes. Is it possible that a subsequent repair attempt
> > > > > > > > includes the btree blocks allocated during the previous failed repair
> > > > > > > > attempt in the sum of "old btree blocks" and determines we don't have
> > > > > > > > enough free space to repair?
> > > > > > > 
> > > > > > > Yes, that's a risk of running the free space repair.
> > > > > > > 
> > > > > > 
> > > > > > Can we improve on that? For example, are the rmapbt entries for the old
> > > > > > allocation btree blocks necessary once we commit the btree resets? If
> > > > > > not, could we remove those entries before we start tree reconstruction?
> > > > > > 
> > > > > > Alternatively, could we incorporate use of the old btree blocks? As it
> > > > > > is, we discover those blocks simply so we can free them at the end.
> > > > > > Perhaps we could free them sooner or find a more clever means to
> > > > > > reallocate directly from that in-core list? I guess we have to consider
> > > > > > whether they were really valid/sane btree blocks, but either way ISTM
> > > > > > that the old blocks list is essentially invalidated once we reset the
> > > > > > btrees.
> > > > > 
> > > > > Hmm, it's a little tricky to do that -- we could reap the old bnobt and
> > > > > cntbt blocks (in the old_allocbt_blocks bitmap) first, but if adding a
> > > > > record causes a btree split we'll pull blocks from the AGFL, and if
> > > > > there aren't enough blocks in the bnobt to fill the AGFL back up then
> > > > > fix_freelist won't succeed.  That complication is why it finds the
> > > > > longest extent in the unclaimed list and pushes that in first, then
> > > > > works on the rest of the extents.
> > > > > 
> > > > 
> > > > Hmm, but doesn't a btree split require at least one full space btree
> > > > block per-level? In conjunction, the agfl minimum size requirement grows
> > > > with the height of the tree, which implies available free space..? I
> > > > could be missing something, perhaps we have to account for the rmapbt in
> > > > that case as well? Regardless...
> > > > 
> > > > > I suppose one could try to avoid ENOSPC by pushing that longest extent
> > > > > in first (since we know that won't trigger a split), then reap the old
> > > > > alloc btree blocks, and then add everything else back in...
> > > > > 
> > > > 
> > > > I think it would be reasonable to seed the btree with the longest record
> > > > or some fixed number of longest records (~1/2 a root block, for example)
> > > > before making actual use of the btrees to reap the old blocks. I think
> > > > then you'd only have a very short window of a single block leak on a
> > > > poorly timed power loss and repair retry sequence before you start
> > > > actually freeing originally used space (which in practice, I think
> > > > solves the problem).
> > > > 
> > > > Given that we're starting from empty, I wonder if another option may be
> > > > to over fill the agfl with old btree blocks or something. The first real
> > > > free should shift enough blocks back into the btrees to ensure the agfl
> > > > can be managed from that point forward, right? That may be more work
> > > > than it's worth though and/or a job for another patch. (FWIW, we also
> > > > have that NOSHRINK agfl fixup flag for userspace repair.)
> > > 
> > > Yes, I'll give that a try tomorrow, now that I've finished porting all
> > > the 4.19 stuff to xfsprogs. :)
> > > 
> > > Looping back to something we discussed earlier in this thread, I'd
> > > prefer to hold off on converting the list of already-freed extents to
> > > xfs_bitmap because the same problem exists in all the repair functions
> > > of having to store a large number of records for the rebuilt btree, and
> > > maybe there's some way to <cough> use pageable memory for that, since
> > > the access patterns for that are append, sort, and iterate; for those
> > > three uses we don't necessarily require all the records to be in memory
> > > all the time.  For the allocbt repair I expect the free space records to
> > > be far more numerous than the list of old bnobt/cntbt blocks.
> > > 
> > 
> > Ok, it's fair enough that we'll probably want to find some kind of
> > generic, more efficient technique for handling this across the various
> > applicable repair algorithms.
> > 
> > One other high level thing that crossed my mind with regard to the
> > general btree reconstruction algorithms is whether we need to build up
> > this kind of central record list at all. For example, rather than slurp
> > up the entire list of btree records in-core, sort it and dump it back
> > out, could we take advantage of the fact that our existing on-disk
> > structure insertion mechanisms already handle out of order records
> > (simply stated, an extent free knows how to insert the associated record
> > at the right place in the space btrees)? For example, suppose we reset
> > the existing btrees first, then scanned the rmapbt and repopulated the
> > new btrees as records are discovered..?
> 
> I tried that in an earlier draft of the bnobt repair function.  The
> biggest problem with inserting as we go is dealing with the inevitable
> transaction rolls (right now we roll after every record insertion to avoid
> playing games with guessing how much reservation is left).  Btree
> cursor state can't survive transaction rolls because the transaction
> commit releases all the buffers that aren't bhold'en, and we can't bhold
> that many buffers across a _defer_finish.
> 

Ok, interesting.

Where do we need to run an xfs_defer_finish() during the reconstruction
sequence, btw? I thought that would only run on final commit as opposed
to intermediate rolls. We could just try and make the automatic buffer
relogging list a dynamic allocation if there are enough held buffers in
the transaction.

> So, that early draft spent a lot of time tearing down and reconstructing
> rmapbt cursors since the standard _btree_query_all isn't suited to that
> kind of usage.  It was easily twice as slow on a RAM-backed disk just
> from the rmap cursor overhead and much more complex, so I rewrote it to
> be simpler.  I also have a slight preference for not touching anything
> until we're absolutely sure we have all the data we need to repair the
> structure.
> 

Yes, I think that is sane in principle. I'm just a bit concerned about
how reliable that xfs_repair-like approach will be in the kernel longer
term, particularly once we start having to deal with large filesystems
and limited or contended memory, etc. We already have xfs_repair users
that need to tweak settings because there isn't enough memory available
to repair the fs. Granted that is for fs-wide repairs and the flipside
is that we know a single AG can only be up to 1TB. It's certainly
possible that putting some persistent backing behind the in-core data is
enough to resolve the problem (and the current approach is certainly
reasonable enough to me for the initial implementation).

bjoin limitations aside, I wonder if a cursor roll mechanism that held
all of the cursor buffers, rolled the transaction and then rejoined all
said buffers would help us get around that. (Not sure I follow the early
prototype behavior, but it sounds like we had to restart the rmapbt
lookup over and over...).

Another caveat with that approach may be that I think we'd need to be
sure that the reconstruction operation doesn't ever need to update the
rmapbt while we're mid walk of the latter. That may be an issue for
inode btree reconstruction, for example, since it looks like inobt block
allocation requires rmapbt updates. We'd probably need some way to share
(or invalidate) a cursor across different contexts to deal with that.

> For other repair functions (like the data/attr fork repairs) we have to
> scan all the rmapbts for extents, and I'd prefer to lock those AGs only
> for as long as necessary to extract the extents we want.
> 
> > The obvious problem is that we still have some checks that allow the
> > whole repair operation to bail out before we determine whether we can
> > start to rebuild the on-disk btrees. These are things like making sure
> > we can actually read the associated rmapbt blocks (i.e., no read errors
> > or verifier failures), basic record sanity checks, etc. But ISTM that
> > isn't anything we couldn't get around with a multi-pass implementation.
> > Secondary issues might be things like no longer being able to easily
> > insert the longest free extent range(s) first (meaning we'd have to
> > stuff the agfl with old btree blocks or figure out some other approach).
> 
> Well, you could scan the rmapbt twice -- once to find the longest
> record, then again to do the actual insertion.
> 

Yep, that's what I meant by multi-pass.

> > BTW, isn't the typical scrub sequence already multi-pass by virtue of
> > the xfs_scrub_metadata() implementation? I'm wondering if the ->scrub()
> > callout could not only detect corruption, but validate whether repair
> > (if requested) is possible based on the kind of checks that are
> > currently in the repair side rmapbt walkers. Thoughts?
> 
> Yes, scrub basically validates that for us now, with the notable
> exception of the notorious rmapbt scrubber, which doesn't
> cross-reference with inode block mappings because that would be a
> locking nightmare.
> 
> > Are there future
> > changes that are better supported by an in-core tracking structure in
> > general (assuming we'll eventually replace the linked lists with
> > something more efficient) as opposed to attempting to optimize out the
> > need for that tracking at all?
> 
> Well, I was thinking that we could just allocate a memfd (or a file on
> the same xfs once we have AG offlining) and store the records in there.
> That saves us the list_head overhead and potentially enables access to a
> lot more storage than pinning things in RAM.
> 

Would using the same fs mean we have to store the repair data in a
separate AG, or somehow locate/use free space in the target AG? I
presume either way we'd have to ensure that AG is either consistent or
locked out from outside I/O. If we have the total record count we can
preallocate the file and hope there is no such other free space
corruption or something that would allow some other task to mess with
our blocks. I'm a little skeptical overall on relying on a corrupted
filesystem to store repair data, but perhaps there are ways to mitigate
the risks.

I'm not familiar with memfd. The manpage suggests it's ram backed, is it
swappable or something? If so, that sounds like a reasonable option provided
the swap space requirement can be made clear to users and the failure
characteristics aren't more severe than for userspace. An online repair
that puts the broader system at risk of OOM as opposed to predictably
failing gracefully may not be the most useful tool.

Brian

> --D
> 
> > Brian
> > 
> > > --D
> > > 
> > > > Brian
> > > > 
> > > > > --D
> > > > > 
> > > > > > Brian
> > > > > > 
> > > > > > > > > +
> > > > > > > > > +done:
> > > > > > > > > +	/* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */
> > > > > > > > > +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> > > > > > > > > +	return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo,
> > > > > > > > > +			XFS_AG_RESV_NONE);
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > ...
> > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
> > > > > > > > > index 0ed68379e551..82f99633a597 100644
> > > > > > > > > --- a/fs/xfs/xfs_extent_busy.c
> > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.c
> > > > > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp(
> > > > > > > > >  		diff = b1->bno - b2->bno;
> > > > > > > > >  	return diff;
> > > > > > > > >  }
> > > > > > > > > +
> > > > > > > > > +/* Are there any busy extents in this AG? */
> > > > > > > > > +bool
> > > > > > > > > +xfs_extent_busy_list_empty(
> > > > > > > > > +	struct xfs_perag	*pag)
> > > > > > > > > +{
> > > > > > > > > +	spin_lock(&pag->pagb_lock);
> > > > > > > > > +	if (pag->pagb_tree.rb_node) {
> > > > > > > > 
> > > > > > > > RB_EMPTY_ROOT()?
> > > > > > > 
> > > > > > > Good suggestion, thank you!
> > > > > > > 
> > > > > > > --D
> > > > > > > 
> > > > > > > > Brian
> > > > > > > > 
> > > > > > > > > +		spin_unlock(&pag->pagb_lock);
> > > > > > > > > +		return false;
> > > > > > > > > +	}
> > > > > > > > > +	spin_unlock(&pag->pagb_lock);
> > > > > > > > > +	return true;
> > > > > > > > > +}
> > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
> > > > > > > > > index 990ab3891971..2f8c73c712c6 100644
> > > > > > > > > --- a/fs/xfs/xfs_extent_busy.h
> > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.h
> > > > > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
> > > > > > > > >  	list_sort(NULL, list, xfs_extent_busy_ag_cmp);
> > > > > > > > >  }
> > > > > > > > >  
> > > > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
> > > > > > > > > +
> > > > > > > > >  #endif /* __XFS_EXTENT_BUSY_H__ */
> > > > > > > > > 
> > > > > > > > > --
> > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > --
> > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > --
> > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > --
> > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > --
> > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > the body of a message to majordomo@vger.kernel.org
> > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Aug. 7, 2018, 11:34 p.m. UTC | #10
On Fri, Aug 03, 2018 at 06:49:40AM -0400, Brian Foster wrote:
> On Thu, Aug 02, 2018 at 12:22:05PM -0700, Darrick J. Wong wrote:
> > On Thu, Aug 02, 2018 at 09:48:24AM -0400, Brian Foster wrote:
> > > On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote:
> > > > On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote:
> > > > > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote:
> > > > > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote:
> > > > > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote:
> > > > > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote:
> > > > > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote:
> > > > > > > > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > > > > > > 
> > > > > > > > > > Rebuild the free space btrees from the gaps in the rmap btree.
> > > > > > > > > > 
> > > > > > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > > > > > > ---
> > > > > > > > > >  fs/xfs/Makefile             |    1 
> > > > > > > > > >  fs/xfs/scrub/alloc.c        |    1 
> > > > > > > > > >  fs/xfs/scrub/alloc_repair.c |  581 +++++++++++++++++++++++++++++++++++++++++++
> > > > > > > > > >  fs/xfs/scrub/common.c       |    8 +
> > > > > > > > > >  fs/xfs/scrub/repair.h       |    2 
> > > > > > > > > >  fs/xfs/scrub/scrub.c        |    4 
> > > > > > > > > >  fs/xfs/scrub/trace.h        |    2 
> > > > > > > > > >  fs/xfs/xfs_extent_busy.c    |   14 +
> > > > > > > > > >  fs/xfs/xfs_extent_busy.h    |    2 
> > > > > > > > > >  9 files changed, 610 insertions(+), 5 deletions(-)
> > > > > > > > > >  create mode 100644 fs/xfs/scrub/alloc_repair.c
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > ...
> > > > > > > > > > diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
> > > > > > > > > > new file mode 100644
> > > > > > > > > > index 000000000000..b228c2906de2
> > > > > > > > > > --- /dev/null
> > > > > > > > > > +++ b/fs/xfs/scrub/alloc_repair.c
> > > > > > > > > > @@ -0,0 +1,581 @@
> > > > > ...
> > > > > > > > > > +
> > > > > > > > > > +/*
> > > > > > > > > > + * Make our new freespace btree roots permanent so that we can start freeing
> > > > > > > > > > + * unused space back into the AG.
> > > > > > > > > > + */
> > > > > > > > > > +STATIC int
> > > > > > > > > > +xrep_abt_commit_new(
> > > > > > > > > > +	struct xfs_scrub	*sc,
> > > > > > > > > > +	struct xfs_bitmap	*old_allocbt_blocks,
> > > > > > > > > > +	int			log_flags)
> > > > > > > > > > +{
> > > > > > > > > > +	int			error;
> > > > > > > > > > +
> > > > > > > > > > +	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags);
> > > > > > > > > > +
> > > > > > > > > > +	/* Invalidate the old freespace btree blocks and commit. */
> > > > > > > > > > +	error = xrep_invalidate_blocks(sc, old_allocbt_blocks);
> > > > > > > > > > +	if (error)
> > > > > > > > > > +		return error;
> > > > > > > > > 
> > > > > > > > > It looks like the above invalidation all happens in the same
> > > > > > > > > transaction. Those aren't logging buffer data or anything, but any idea
> > > > > > > > > how many log formats we can get away with in this single transaction?
> > > > > > > > 
> > > > > > > > Hm... well, on my computer a log format is ~88 bytes.  Assuming 4K
> > > > > > > > blocks, the max AG size of 1TB, maximum free space fragmentation, and
> > > > > > > > two btrees, the tree could be up to ~270 million records.  Assuming ~505
> > > > > > > > records per block, that's ... ~531,000 leaf blocks and ~1100 node blocks
> > > > > > > > for both btrees.  If we invalidate both, that's ~46M of RAM?
> > > > > > > > 
> > > > > > > 
> > > > > > > I was thinking more about transaction reservation than RAM. It may not
> > > > > > 
> > > > > > Hmm.  tr_itruncate is ~650K on my 2TB SSD, assuming 88 bytes per, that's
> > > > > > about ... ~7300 log format items?  Not a lot, maybe it should roll the
> > > > > > transaction every 1000 invalidations or so...
> > > > > > 
> > > > > 
> > > > > I'm not really sure what categorizes as a lot here given that the blocks
> > > > > would need to be in-core, but rolling on some fixed/safe interval sounds
> > > > > reasonable to me.
> > > > > 
> > > > > > > currently be an issue, but it might be worth putting something down in a
> > > > > > > comment to note that this is a single transaction and we expect to not
> > > > > > > have to invalidate more than N (ballpark) blocks in a single go,
> > > > > > > whatever that value happens to be.
> > > > > > > 
> > > > > > > > > > +	error = xrep_roll_ag_trans(sc);
> > > > > > > > > > +	if (error)
> > > > > > > > > > +		return error;
> > > > > > > > > > +
> > > > > > > > > > +	/* Now that we've succeeded, mark the incore state valid again. */
> > > > > > > > > > +	sc->sa.pag->pagf_init = 1;
> > > > > > > > > > +	return 0;
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > > +/* Build new free space btrees and dispose of the old one. */
> > > > > > > > > > +STATIC int
> > > > > > > > > > +xrep_abt_rebuild_trees(
> > > > > > > > > > +	struct xfs_scrub	*sc,
> > > > > > > > > > +	struct list_head	*free_extents,
> > > > > > > > > > +	struct xfs_bitmap	*old_allocbt_blocks)
> > > > > > > > > > +{
> > > > > > > > > > +	struct xfs_owner_info	oinfo;
> > > > > > > > > > +	struct xrep_abt_extent	*rae;
> > > > > > > > > > +	struct xrep_abt_extent	*n;
> > > > > > > > > > +	struct xrep_abt_extent	*longest;
> > > > > > > > > > +	int			error;
> > > > > > > > > > +
> > > > > > > > > > +	xfs_rmap_skip_owner_update(&oinfo);
> > > > > > > > > > +
> > > > > > > > > > +	/*
> > > > > > > > > > +	 * Insert the longest free extent in case it's necessary to
> > > > > > > > > > +	 * refresh the AGFL with multiple blocks.  If there is no longest
> > > > > > > > > > +	 * extent, we had exactly the free space we needed; we're done.
> > > > > > > > > > +	 */
> > > > > > > > > 
> > > > > > > > > I'm confused by the last sentence. longest should only be NULL if the
> > > > > > > > > free space list is empty and haven't we already bailed out with -ENOSPC
> > > > > > > > > if that's the case?
> > > > > > > > > 
> > > > > > > > > > +	longest = xrep_abt_get_longest(free_extents);
> > > > > > > > 
> > > > > > > > xrep_abt_rebuild_trees is called after we allocate and initialize two
> > > > > > > > new btree roots in xrep_abt_reset_btrees.  If free_extents is an empty
> > > > > > > > list here, then we found exactly two blocks worth of free space and used
> > > > > > > > them to set up new btree roots.
> > > > > > > > 
> > > > > > > 
> > > > > > > Got it, thanks.
> > > > > > > 
> > > > > > > > > > +	if (!longest)
> > > > > > > > > > +		goto done;
> > > > > > > > > > +	error = xrep_abt_free_extent(sc,
> > > > > > > > > > +			XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno),
> > > > > > > > > > +			longest->len, &oinfo);
> > > > > > > > > > +	list_del(&longest->list);
> > > > > > > > > > +	kmem_free(longest);
> > > > > > > > > > +	if (error)
> > > > > > > > > > +		return error;
> > > > > > > > > > +
> > > > > > > > > > +	/* Insert records into the new btrees. */
> > > > > > > > > > +	list_for_each_entry_safe(rae, n, free_extents, list) {
> > > > > > > > > > +		error = xrep_abt_free_extent(sc,
> > > > > > > > > > +				XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno),
> > > > > > > > > > +				rae->len, &oinfo);
> > > > > > > > > > +		if (error)
> > > > > > > > > > +			return error;
> > > > > > > > > > +		list_del(&rae->list);
> > > > > > > > > > +		kmem_free(rae);
> > > > > > > > > > +	}
> > > > > > > > > 
> > > > > > > > > Ok, at this point we've reset the btree roots and we start freeing the
> > > > > > > > > free ranges that were discovered via the rmapbt analysis. AFAICT, if we
> > > > > > > > > fail or crash at this point, we leave the allocbts in a partially
> > > > > > > > > constructed state. I take it that is Ok with respect to the broader
> > > > > > > > > repair algorithm because we'd essentially start over by inspecting the
> > > > > > > > > rmapbt again on a retry.
> > > > > > > > 
> > > > > > > > Right.  Though in the crash/shutdown case, you'll end up with the
> > > > > > > > filesystem in an offline state at some point before you can retry the
> > > > > > > > scrub, it's probably faster to run xfs_repair to fix the damage.
> > > > > > > > 
> > > > > > > 
> > > > > > > Can we really assume that if we're already up and running an online
> > > > > > > repair? The filesystem has to be mountable in that case in the first
> > > > > > > place. If we've already reset and started reconstructing the allocation
> > > > > > > btrees then I'd think those transactions would recover just fine on a
> > > > > > > power loss or something (perhaps not in the event of some other
> > > > > > > corruption related shutdown).
> > > > > > 
> > > > > > Right, for the system crash case, whatever transactions committed should
> > > > > > replay just fine, and you can even start up the online repair again, and
> > > > > > if the AG isn't particularly close to ENOSPC then (barring rmap
> > > > > > corruption) it should work just fine.
> > > > > > 
> > > > > > If the fs went down because either (a) repair hit other corruption or
> > > > > > (b) some other thread hit an error in some other part of the filesystem,
> > > > > > then it's not so clear -- in (b) you could probably try again, but for
> > > > > > (a) you'll definitely have to unmount and run xfs_repair.
> > > > > > 
> > > > > 
> > > > > Indeed, there are certainly cases where we simply won't be able to do an
> > > > > online repair. I'm trying to think about scenarios where we should be
> > > > > able to do an online repair, but we lose power or hit some kind of
> > > > > transient error like a memory allocation failure before it completes. It
> > > > > would be nice if the online repair itself didn't contribute (within
> > > > > reason) to the inability to simply try again just because the fs was
> > > > > close to -ENOSPC.
> > > > 
> > > > Agreed.  Most of the, uh, opportunities to hit ENOMEM happen before we
> > > > start modifying on-disk metadata.  If that happens, we just free all the
> > > > memory and bail out having done nothing.
> > > > 
> > > > > For one, I think it's potentially confusing behavior. Second, it might
> > > > > be concerning to regular users who perceive it as an online repair
> > > > > leaving the fs in a worse off state. Us fs devs know that may not really
> > > > > be the case, but I think we're better for addressing it if we can
> > > > > reasonably do so.
> > > > 
> > > > <nod> Further in the future I want to add the ability to offline an AG,
> > > > so the worst that happens is that scrub turns the AG off, repair doesn't
> > > > fix it, and the AG simply stays offline.  That might give us the
> > > > ability to survive cancelling the repair transaction, since if the AG's
> > > > offline already anyway we could just throw away the dirty buffers and
> > > > resurrect the AG later.  I don't know, that's purely speculative.
> > > > 
> > > > > > Perhaps the guideline here is that if the fs goes down more than once
> > > > > > during online repair then unmount it and run xfs_repair.
> > > > > > 
> > > > > 
> > > > > Yep, I think that makes sense if the filesystem or repair itself is
> > > > > tripping over other corruptions that fail to keep it active for the
> > > > > duration of the repair.
> > > > 
> > > > <nod>
> > > > 
> > > > > > > > > The blocks allocated for the btrees that we've begun to construct here
> > > > > > > > > end up mapped in the rmapbt as we go, right? IIUC, that means we don't
> > > > > > > > > necessarily have infinite retries to make sure this completes. IOW,
> > > > > > > > > suppose that a first repair attempt finds just enough free space to
> > > > > > > > > construct new trees, gets far enough along to consume most of that free
> > > > > > > > > space and then crashes. Is it possible that a subsequent repair attempt
> > > > > > > > > includes the btree blocks allocated during the previous failed repair
> > > > > > > > > attempt in the sum of "old btree blocks" and determines we don't have
> > > > > > > > > enough free space to repair?
> > > > > > > > 
> > > > > > > > Yes, that's a risk of running the free space repair.
> > > > > > > > 
> > > > > > > 
> > > > > > > Can we improve on that? For example, are the rmapbt entries for the old
> > > > > > > allocation btree blocks necessary once we commit the btree resets? If
> > > > > > > not, could we remove those entries before we start tree reconstruction?
> > > > > > > 
> > > > > > > Alternatively, could we incorporate use of the old btree blocks? As it
> > > > > > > is, we discover those blocks simply so we can free them at the end.
> > > > > > > Perhaps we could free them sooner or find a more clever means to
> > > > > > > reallocate directly from that in-core list? I guess we have to consider
> > > > > > > whether they were really valid/sane btree blocks, but either way ISTM
> > > > > > > that the old blocks list is essentially invalidated once we reset the
> > > > > > > btrees.
> > > > > > 
> > > > > > Hmm, it's a little tricky to do that -- we could reap the old bnobt and
> > > > > > cntbt blocks (in the old_allocbt_blocks bitmap) first, but if adding a
> > > > > > record causes a btree split we'll pull blocks from the AGFL, and if
> > > > > > there aren't enough blocks in the bnobt to fill the AGFL back up then
> > > > > > fix_freelist won't succeed.  That complication is why it finds the
> > > > > > longest extent in the unclaimed list and pushes that in first, then
> > > > > > works on the rest of the extents.
> > > > > > 
> > > > > 
> > > > > Hmm, but doesn't a btree split require at least one full space btree
> > > > > block per-level? In conjunction, the agfl minimum size requirement grows
> > > > > with the height of the tree, which implies available free space..? I
> > > > > could be missing something, perhaps we have to account for the rmapbt in
> > > > > that case as well? Regardless...
> > > > > 
> > > > > > I suppose one could try to avoid ENOSPC by pushing that longest extent
> > > > > > in first (since we know that won't trigger a split), then reap the old
> > > > > > alloc btree blocks, and then add everything else back in...
> > > > > > 
> > > > > 
> > > > > I think it would be reasonable to seed the btree with the longest record
> > > > > or some fixed number of longest records (~1/2 a root block, for example)
> > > > > before making actual use of the btrees to reap the old blocks. I think
> > > > > then you'd only have a very short window of a single block leak on a
> > > > > poorly timed power loss and repair retry sequence before you start
> > > > > actually freeing originally used space (which in practice, I think
> > > > > solves the problem).
> > > > > 
> > > > > Given that we're starting from empty, I wonder if another option may be
> > > > > to over fill the agfl with old btree blocks or something. The first real
> > > > > free should shift enough blocks back into the btrees to ensure the agfl
> > > > > can be managed from that point forward, right? That may be more work
> > > > > than it's worth though and/or a job for another patch. (FWIW, we also
> > > > > have that NOSHRINK agfl fixup flag for userspace repair.)
> > > > 
> > > > Yes, I'll give that a try tomorrow, now that I've finished porting all
> > > > the 4.19 stuff to xfsprogs. :)
> > > > 
> > > > Looping back to something we discussed earlier in this thread, I'd
> > > > prefer to hold off on converting the list of already-freed extents to
> > > > xfs_bitmap because the same problem exists in all the repair functions
> > > > of having to store a large number of records for the rebuilt btree, and
> > > > maybe there's some way to <cough> use pageable memory for that, since
> > > > the access patterns for that are append, sort, and iterate; for those
> > > > three uses we don't necessarily require all the records to be in memory
> > > > all the time.  For the allocbt repair I expect the free space records to
> > > > be far more numerous than the list of old bnobt/cntbt blocks.
> > > > 
> > > 
> > > Ok, it's fair enough that we'll probably want to find some kind of
> > > generic, more efficient technique for handling this across the various
> > > applicable repair algorithms.
> > > 
> > > One other high level thing that crossed my mind with regard to the
> > > general btree reconstruction algorithms is whether we need to build up
> > > this kind of central record list at all. For example, rather than slurp
> > > up the entire list of btree records in-core, sort it and dump it back
> > > out, could we take advantage of the fact that our existing on-disk
> > > structure insertion mechanisms already handle out of order records
> > > (simply stated, an extent free knows how to insert the associated record
> > > at the right place in the space btrees)? For example, suppose we reset
> > > the existing btrees first, then scanned the rmapbt and repopulated the
> > > new btrees as records are discovered..?
> > 
> > I tried that in an earlier draft of the bnobt repair function.  The
> > biggest problem with inserting as we go is dealing with the inevitable
> > transaction rolls (right now we roll after every record insertion to avoid
> > playing games with guessing how much reservation is left).  Btree
> > cursor state can't survive transaction rolls because the transaction
> > commit releases all the buffers that aren't bhold'en, and we can't bhold
> > that many buffers across a _defer_finish.
> > 
> 
> Ok, interesting.
> 
> Where do we need to run an xfs_defer_finish() during the reconstruction
> sequence, btw?

Not here, as I'm sure you were thinking. :)  For the AG btrees
themselves it's sufficient to roll the transaction.  I suppose we could
simply have a xfs_btree_bhold function that would bhold every buffer so
that a cursor could survive a roll.

Inode fork reconstruction is going to require _defer_finish, however.

> I thought that would only run on final commit as opposed to
> intermediate rolls.

We could let the deferred items sit around until final commit, but I
think I'd prefer to process them as soon as possible since iirc deferred
items pin the log until they're finished.  I would hope that userspace
isn't banging on the log while repair runs, but it's certainly possible.

> We could just try and make the automatic buffer relogging list a
> dynamic allocation if there are enough held buffers in the
> transaction.

Hmm.  Might be worth pursuing...

> > So, that early draft spent a lot of time tearing down and reconstructing
> > rmapbt cursors since the standard _btree_query_all isn't suited to that
> > kind of usage.  It was easily twice as slow on a RAM-backed disk just
> > from the rmap cursor overhead and much more complex, so I rewrote it to
> > be simpler.  I also have a slight preference for not touching anything
> > until we're absolutely sure we have all the data we need to repair the
> > structure.
> > 
> 
> Yes, I think that is sane in principle. I'm just a bit concerned about
> how reliable that xfs_repair-like approach will be in the kernel longer
> term, particularly once we start having to deal with large filesystems
> and limited or contended memory, etc. We already have xfs_repair users
> that need to tweak settings because there isn't enough memory available
> to repair the fs. Granted that is for fs-wide repairs and the flipside
> is that we know a single AG can only be up to 1TB. It's certainly
> possible that putting some persistent backing behind the in-core data is
> enough to resolve the problem (and the current approach is certainly
> reasonable enough to me for the initial implementation).
> 
> bjoin limitations aside, I wonder if a cursor roll mechanism that held
> all of the cursor buffers, rolled the transaction and then rejoined all
> said buffers would help us get around that. (Not sure I follow the early
> prototype behavior, but it sounds like we had to restart the rmapbt
> lookup over and over...).

Correct.

> Another caveat with that approach may be that I think we'd need to be
> sure that the reconstruction operation doesn't ever need to update the
> rmapbt while we're mid walk of the latter.

<nod> Looking even farther back in my notes, that was also an issue --
fixing the free list causes blocks to go on or off the agfl, which
causes rmapbt updates, which meant that the only way I could get
in-place updates to work was to re-lookup where we were in the btree and
also try to deal with any rmapbt entries that might have crept in as
a result of the record insertion.

Getting the concurrency right for each repair function looked like a
difficult problem to solve, but amassing all the records elsewhere and
rebuilding was easy to understand.

> That may be an issue for inode btree reconstruction, for example,
> since it looks like inobt block allocation requires rmapbt updates.
> We'd probably need some way to share (or invalidate) a cursor across
> different contexts to deal with that.

I might pursue that strategy if we ever hit the point where we can't
find space to store the records (see below).  Another option could be to
divert all deferred items for an AG, build a replacement btree in new
space, then finish all the deferred items... but that's starting to get
into offlineable AGs, which is its own project that I want to tackle
later.

(Not that much later, just not this cycle.)

> > For other repair functions (like the data/attr fork repairs) we have to
> > scan all the rmapbts for extents, and I'd prefer to lock those AGs only
> > for as long as necessary to extract the extents we want.
> > 
> > > The obvious problem is that we still have some checks that allow the
> > > whole repair operation to bail out before we determine whether we can
> > > start to rebuild the on-disk btrees. These are things like making sure
> > > we can actually read the associated rmapbt blocks (i.e., no read errors
> > > or verifier failures), basic record sanity checks, etc. But ISTM that
> > > isn't anything we couldn't get around with a multi-pass implementation.
> > > Secondary issues might be things like no longer being able to easily
> > > insert the longest free extent range(s) first (meaning we'd have to
> > > stuff the agfl with old btree blocks or figure out some other approach).
> > 
> > Well, you could scan the rmapbt twice -- once to find the longest
> > record, then again to do the actual insertion.
> > 
> 
> Yep, that's what I meant by multi-pass.
> 
> > > BTW, isn't the typical scrub sequence already multi-pass by virtue of
> > > the xfs_scrub_metadata() implementation? I'm wondering if the ->scrub()
> > > callout could not only detect corruption, but validate whether repair
> > > (if requested) is possible based on the kind of checks that are
> > > currently in the repair side rmapbt walkers. Thoughts?
> > 
> > Yes, scrub basically validates that for us now, with the notable
> > exception of the notorious rmapbt scrubber, which doesn't
> > cross-reference with inode block mappings because that would be a
> > locking nightmare.
> > 
> > > Are there future
> > > changes that are better supported by an in-core tracking structure in
> > > general (assuming we'll eventually replace the linked lists with
> > > something more efficient) as opposed to attempting to optimize out the
> > > need for that tracking at all?
> > 
> > Well, I was thinking that we could just allocate a memfd (or a file on
> > the same xfs once we have AG offlining) and store the records in there.
> > That saves us the list_head overhead and potentially enables access to a
> > lot more storage than pinning things in RAM.
> > 
> 
> Would using the same fs mean we have to store the repair data in a
> separate AG, or somehow locate/use free space in the target AG?

As part of building an "offline AG" feature we'd presumably have to
teach the allocators to avoid the offline AGs for allocations, which
would make it so that we could host the repair data files in the same
XFS that's being fixed.  That seems a little risky to me, but the disk
is probably larger than mem+swap.

> presume either way we'd have to ensure that AG is either consistent or
> locked out from outside I/O. If we have the total record count we can

We usually don't, but for the btrees that have their own record/blocks
counters we might be able to guess a number, fallocate it, and see if
that doesn't ENOSPC.

> preallocate the file and hope there is no such other free space
> corruption or something that would allow some other task to mess with
> our blocks. I'm a little skeptical overall on relying on a corrupted
> filesystem to store repair data, but perhaps there are ways to mitigate
> the risks.

Store it elsewhere?  /home for root repairs, /root for any other
repair... though if we're going to do that, why not just add a swap file
temporarily?

> I'm not familiar with memfd. The manpage suggests it's ram backed, is it
> swappable or something?

It's supposed to be.  The quick test I ran (allocate a memfd, write 1GB
of junk to it on a VM with 400M of RAM) seemed to push about 980MB into
the swap file.

> If so, that sounds like a reasonable option provided the swap space
> requirement can be made clear to users

We can document it.  I don't think it's any worse than xfs_repair being
able to use up all the memory + swap... and since we're probably only
going to be repairing one thing at a time, most likely scrub won't need
as much memory.

> and the failure characteristics aren't more severe than for userspace.
> An online repair that puts the broader system at risk of OOM as
> opposed to predictably failing gracefully may not be the most useful
> tool.

Agreed.  One huge downside of memfd seems to be the lack of a mechanism
for the vm to push back on us if we successfully write all we need to
the memfd but then other processes need some memory.  Obviously, if the
memfd write itself comes up short or fails then we dump the memfd and
error back to userspace.  We might simply have to free array memory
while we iterate the records to minimize the time spent at peak memory
usage.

--D

> 
> Brian
> 
> > --D
> > 
> > > Brian
> > > 
> > > > --D
> > > > 
> > > > > Brian
> > > > > 
> > > > > > --D
> > > > > > 
> > > > > > > Brian
> > > > > > > 
> > > > > > > > > > +
> > > > > > > > > > +done:
> > > > > > > > > > +	/* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */
> > > > > > > > > > +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> > > > > > > > > > +	return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo,
> > > > > > > > > > +			XFS_AG_RESV_NONE);
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > ...
> > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > index 0ed68379e551..82f99633a597 100644
> > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp(
> > > > > > > > > >  		diff = b1->bno - b2->bno;
> > > > > > > > > >  	return diff;
> > > > > > > > > >  }
> > > > > > > > > > +
> > > > > > > > > > +/* Are there any busy extents in this AG? */
> > > > > > > > > > +bool
> > > > > > > > > > +xfs_extent_busy_list_empty(
> > > > > > > > > > +	struct xfs_perag	*pag)
> > > > > > > > > > +{
> > > > > > > > > > +	spin_lock(&pag->pagb_lock);
> > > > > > > > > > +	if (pag->pagb_tree.rb_node) {
> > > > > > > > > 
> > > > > > > > > RB_EMPTY_ROOT()?
> > > > > > > > 
> > > > > > > > Good suggestion, thank you!
> > > > > > > > 
> > > > > > > > --D
> > > > > > > > 
> > > > > > > > > Brian
> > > > > > > > > 
> > > > > > > > > > +		spin_unlock(&pag->pagb_lock);
> > > > > > > > > > +		return false;
> > > > > > > > > > +	}
> > > > > > > > > > +	spin_unlock(&pag->pagb_lock);
> > > > > > > > > > +	return true;
> > > > > > > > > > +}
> > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > index 990ab3891971..2f8c73c712c6 100644
> > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
> > > > > > > > > >  	list_sort(NULL, list, xfs_extent_busy_ag_cmp);
> > > > > > > > > >  }
> > > > > > > > > >  
> > > > > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
> > > > > > > > > > +
> > > > > > > > > >  #endif /* __XFS_EXTENT_BUSY_H__ */
> > > > > > > > > > 
> > > > > > > > > > --
> > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > --
> > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > --
> > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > --
> > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > --
> > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > the body of a message to majordomo@vger.kernel.org
> > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > --
> > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > the body of a message to majordomo@vger.kernel.org
> > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Brian Foster Aug. 8, 2018, 12:29 p.m. UTC | #11
On Tue, Aug 07, 2018 at 04:34:58PM -0700, Darrick J. Wong wrote:
> On Fri, Aug 03, 2018 at 06:49:40AM -0400, Brian Foster wrote:
> > On Thu, Aug 02, 2018 at 12:22:05PM -0700, Darrick J. Wong wrote:
> > > On Thu, Aug 02, 2018 at 09:48:24AM -0400, Brian Foster wrote:
> > > > On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote:
> > > > > On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote:
> > > > > > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote:
> > > > > > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote:
> > > > > > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote:
> > > > > > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote:
> > > > > > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote:
> > > > > > > > > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > > > > > > > 
> > > > > > > > > > > Rebuild the free space btrees from the gaps in the rmap btree.
> > > > > > > > > > > 
> > > > > > > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > > > > > > > ---
> > > > > > > > > > >  fs/xfs/Makefile             |    1 
> > > > > > > > > > >  fs/xfs/scrub/alloc.c        |    1 
> > > > > > > > > > >  fs/xfs/scrub/alloc_repair.c |  581 +++++++++++++++++++++++++++++++++++++++++++
> > > > > > > > > > >  fs/xfs/scrub/common.c       |    8 +
> > > > > > > > > > >  fs/xfs/scrub/repair.h       |    2 
> > > > > > > > > > >  fs/xfs/scrub/scrub.c        |    4 
> > > > > > > > > > >  fs/xfs/scrub/trace.h        |    2 
> > > > > > > > > > >  fs/xfs/xfs_extent_busy.c    |   14 +
> > > > > > > > > > >  fs/xfs/xfs_extent_busy.h    |    2 
> > > > > > > > > > >  9 files changed, 610 insertions(+), 5 deletions(-)
> > > > > > > > > > >  create mode 100644 fs/xfs/scrub/alloc_repair.c
> > > > > > > > > > > 
> > > > > > > > > > > 
> > > > > > > > > > ...
> > > > > > > > > > > diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
> > > > > > > > > > > new file mode 100644
> > > > > > > > > > > index 000000000000..b228c2906de2
> > > > > > > > > > > --- /dev/null
> > > > > > > > > > > +++ b/fs/xfs/scrub/alloc_repair.c
> > > > > > > > > > > @@ -0,0 +1,581 @@
> > > > > > ...
> > > > > > > > > > > +
> > > > > > > > > > > +/*
> > > > > > > > > > > + * Make our new freespace btree roots permanent so that we can start freeing
> > > > > > > > > > > + * unused space back into the AG.
> > > > > > > > > > > + */
> > > > > > > > > > > +STATIC int
> > > > > > > > > > > +xrep_abt_commit_new(
> > > > > > > > > > > +	struct xfs_scrub	*sc,
> > > > > > > > > > > +	struct xfs_bitmap	*old_allocbt_blocks,
> > > > > > > > > > > +	int			log_flags)
> > > > > > > > > > > +{
> > > > > > > > > > > +	int			error;
> > > > > > > > > > > +
> > > > > > > > > > > +	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags);
> > > > > > > > > > > +
> > > > > > > > > > > +	/* Invalidate the old freespace btree blocks and commit. */
> > > > > > > > > > > +	error = xrep_invalidate_blocks(sc, old_allocbt_blocks);
> > > > > > > > > > > +	if (error)
> > > > > > > > > > > +		return error;
> > > > > > > > > > 
> > > > > > > > > > It looks like the above invalidation all happens in the same
> > > > > > > > > > transaction. Those aren't logging buffer data or anything, but any idea
> > > > > > > > > > how many log formats we can get away with in this single transaction?
> > > > > > > > > 
> > > > > > > > > Hm... well, on my computer a log format is ~88 bytes.  Assuming 4K
> > > > > > > > > blocks, the max AG size of 1TB, maximum free space fragmentation, and
> > > > > > > > > two btrees, the tree could be up to ~270 million records.  Assuming ~505
> > > > > > > > > records per block, that's ... ~531,000 leaf blocks and ~1100 node blocks
> > > > > > > > > for both btrees.  If we invalidate both, that's ~46M of RAM?
> > > > > > > > > 
> > > > > > > > 
> > > > > > > > I was thinking more about transaction reservation than RAM. It may not
> > > > > > > 
> > > > > > > Hmm.  tr_itruncate is ~650K on my 2TB SSD, assuming 88 bytes per, that's
> > > > > > > about ... ~7300 log format items?  Not a lot, maybe it should roll the
> > > > > > > transaction every 1000 invalidations or so...
> > > > > > > 
> > > > > > 
> > > > > > I'm not really sure what categorizes as a lot here given that the blocks
> > > > > > would need to be in-core, but rolling on some fixed/safe interval sounds
> > > > > > reasonable to me.
> > > > > > 
> > > > > > > > currently be an issue, but it might be worth putting something down in a
> > > > > > > > comment to note that this is a single transaction and we expect to not
> > > > > > > > have to invalidate more than N (ballpark) blocks in a single go,
> > > > > > > > whatever that value happens to be.
> > > > > > > > 
> > > > > > > > > > > +	error = xrep_roll_ag_trans(sc);
> > > > > > > > > > > +	if (error)
> > > > > > > > > > > +		return error;
> > > > > > > > > > > +
> > > > > > > > > > > +	/* Now that we've succeeded, mark the incore state valid again. */
> > > > > > > > > > > +	sc->sa.pag->pagf_init = 1;
> > > > > > > > > > > +	return 0;
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > > +/* Build new free space btrees and dispose of the old one. */
> > > > > > > > > > > +STATIC int
> > > > > > > > > > > +xrep_abt_rebuild_trees(
> > > > > > > > > > > +	struct xfs_scrub	*sc,
> > > > > > > > > > > +	struct list_head	*free_extents,
> > > > > > > > > > > +	struct xfs_bitmap	*old_allocbt_blocks)
> > > > > > > > > > > +{
> > > > > > > > > > > +	struct xfs_owner_info	oinfo;
> > > > > > > > > > > +	struct xrep_abt_extent	*rae;
> > > > > > > > > > > +	struct xrep_abt_extent	*n;
> > > > > > > > > > > +	struct xrep_abt_extent	*longest;
> > > > > > > > > > > +	int			error;
> > > > > > > > > > > +
> > > > > > > > > > > +	xfs_rmap_skip_owner_update(&oinfo);
> > > > > > > > > > > +
> > > > > > > > > > > +	/*
> > > > > > > > > > > +	 * Insert the longest free extent in case it's necessary to
> > > > > > > > > > > +	 * refresh the AGFL with multiple blocks.  If there is no longest
> > > > > > > > > > > +	 * extent, we had exactly the free space we needed; we're done.
> > > > > > > > > > > +	 */
> > > > > > > > > > 
> > > > > > > > > > I'm confused by the last sentence. longest should only be NULL if the
> > > > > > > > > > free space list is empty and haven't we already bailed out with -ENOSPC
> > > > > > > > > > if that's the case?
> > > > > > > > > > 
> > > > > > > > > > > +	longest = xrep_abt_get_longest(free_extents);
> > > > > > > > > 
> > > > > > > > > xrep_abt_rebuild_trees is called after we allocate and initialize two
> > > > > > > > > new btree roots in xrep_abt_reset_btrees.  If free_extents is an empty
> > > > > > > > > list here, then we found exactly two blocks worth of free space and used
> > > > > > > > > them to set up new btree roots.
> > > > > > > > > 
> > > > > > > > 
> > > > > > > > Got it, thanks.
> > > > > > > > 
> > > > > > > > > > > +	if (!longest)
> > > > > > > > > > > +		goto done;
> > > > > > > > > > > +	error = xrep_abt_free_extent(sc,
> > > > > > > > > > > +			XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno),
> > > > > > > > > > > +			longest->len, &oinfo);
> > > > > > > > > > > +	list_del(&longest->list);
> > > > > > > > > > > +	kmem_free(longest);
> > > > > > > > > > > +	if (error)
> > > > > > > > > > > +		return error;
> > > > > > > > > > > +
> > > > > > > > > > > +	/* Insert records into the new btrees. */
> > > > > > > > > > > +	list_for_each_entry_safe(rae, n, free_extents, list) {
> > > > > > > > > > > +		error = xrep_abt_free_extent(sc,
> > > > > > > > > > > +				XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno),
> > > > > > > > > > > +				rae->len, &oinfo);
> > > > > > > > > > > +		if (error)
> > > > > > > > > > > +			return error;
> > > > > > > > > > > +		list_del(&rae->list);
> > > > > > > > > > > +		kmem_free(rae);
> > > > > > > > > > > +	}
> > > > > > > > > > 
> > > > > > > > > > Ok, at this point we've reset the btree roots and we start freeing the
> > > > > > > > > > free ranges that were discovered via the rmapbt analysis. AFAICT, if we
> > > > > > > > > > fail or crash at this point, we leave the allocbts in a partially
> > > > > > > > > > constructed state. I take it that is Ok with respect to the broader
> > > > > > > > > > repair algorithm because we'd essentially start over by inspecting the
> > > > > > > > > > rmapbt again on a retry.
> > > > > > > > > 
> > > > > > > > > Right.  Though in the crash/shutdown case, you'll end up with the
> > > > > > > > > filesystem in an offline state at some point before you can retry the
> > > > > > > > > scrub, it's probably faster to run xfs_repair to fix the damage.
> > > > > > > > > 
> > > > > > > > 
> > > > > > > > Can we really assume that if we're already up and running an online
> > > > > > > > repair? The filesystem has to be mountable in that case in the first
> > > > > > > > place. If we've already reset and started reconstructing the allocation
> > > > > > > > btrees then I'd think those transactions would recover just fine on a
> > > > > > > > power loss or something (perhaps not in the event of some other
> > > > > > > > corruption related shutdown).
> > > > > > > 
> > > > > > > Right, for the system crash case, whatever transactions committed should
> > > > > > > replay just fine, and you can even start up the online repair again, and
> > > > > > > if the AG isn't particularly close to ENOSPC then (barring rmap
> > > > > > > corruption) it should work just fine.
> > > > > > > 
> > > > > > > If the fs went down because either (a) repair hit other corruption or
> > > > > > > (b) some other thread hit an error in some other part of the filesystem,
> > > > > > > then it's not so clear -- in (b) you could probably try again, but for
> > > > > > > (a) you'll definitely have to unmount and run xfs_repair.
> > > > > > > 
> > > > > > 
> > > > > > Indeed, there are certainly cases where we simply won't be able to do an
> > > > > > online repair. I'm trying to think about scenarios where we should be
> > > > > > able to do an online repair, but we lose power or hit some kind of
> > > > > > transient error like a memory allocation failure before it completes. It
> > > > > > would be nice if the online repair itself didn't contribute (within
> > > > > > reason) to the inability to simply try again just because the fs was
> > > > > > close to -ENOSPC.
> > > > > 
> > > > > Agreed.  Most of the, uh, opportunities to hit ENOMEM happen before we
> > > > > start modifying on-disk metadata.  If that happens, we just free all the
> > > > > memory and bail out having done nothing.
> > > > > 
> > > > > > For one, I think it's potentially confusing behavior. Second, it might
> > > > > > be concerning to regular users who perceive it as an online repair
> > > > > > leaving the fs in a worse off state. Us fs devs know that may not really
> > > > > > be the case, but I think we're better for addressing it if we can
> > > > > > reasonably do so.
> > > > > 
> > > > > <nod> Further in the future I want to add the ability to offline an AG,
> > > > > so the worst that happens is that scrub turns the AG off, repair doesn't
> > > > > fix it, and the AG simply stays offline.  That might give us the
> > > > > ability to survive cancelling the repair transaction, since if the AG's
> > > > > offline already anyway we could just throw away the dirty buffers and
> > > > > resurrect the AG later.  I don't know, that's purely speculative.
> > > > > 
> > > > > > > Perhaps the guideline here is that if the fs goes down more than once
> > > > > > > during online repair then unmount it and run xfs_repair.
> > > > > > > 
> > > > > > 
> > > > > > Yep, I think that makes sense if the filesystem or repair itself is
> > > > > > tripping over other corruptions that fail to keep it active for the
> > > > > > duration of the repair.
> > > > > 
> > > > > <nod>
> > > > > 
> > > > > > > > > > The blocks allocated for the btrees that we've begun to construct here
> > > > > > > > > > end up mapped in the rmapbt as we go, right? IIUC, that means we don't
> > > > > > > > > > necessarily have infinite retries to make sure this completes. IOW,
> > > > > > > > > > suppose that a first repair attempt finds just enough free space to
> > > > > > > > > > construct new trees, gets far enough along to consume most of that free
> > > > > > > > > > space and then crashes. Is it possible that a subsequent repair attempt
> > > > > > > > > > includes the btree blocks allocated during the previous failed repair
> > > > > > > > > > attempt in the sum of "old btree blocks" and determines we don't have
> > > > > > > > > > enough free space to repair?
> > > > > > > > > 
> > > > > > > > > Yes, that's a risk of running the free space repair.
> > > > > > > > > 
> > > > > > > > 
> > > > > > > > Can we improve on that? For example, are the rmapbt entries for the old
> > > > > > > > allocation btree blocks necessary once we commit the btree resets? If
> > > > > > > > not, could we remove those entries before we start tree reconstruction?
> > > > > > > > 
> > > > > > > > Alternatively, could we incorporate use of the old btree blocks? As it
> > > > > > > > is, we discover those blocks simply so we can free them at the end.
> > > > > > > > Perhaps we could free them sooner or find a more clever means to
> > > > > > > > reallocate directly from that in-core list? I guess we have to consider
> > > > > > > > whether they were really valid/sane btree blocks, but either way ISTM
> > > > > > > > that the old blocks list is essentially invalidated once we reset the
> > > > > > > > btrees.
> > > > > > > 
> > > > > > > Hmm, it's a little tricky to do that -- we could reap the old bnobt and
> > > > > > > cntbt blocks (in the old_allocbt_blocks bitmap) first, but if adding a
> > > > > > > record causes a btree split we'll pull blocks from the AGFL, and if
> > > > > > > there aren't enough blocks in the bnobt to fill the AGFL back up then
> > > > > > > fix_freelist won't succeed.  That complication is why it finds the
> > > > > > > longest extent in the unclaimed list and pushes that in first, then
> > > > > > > works on the rest of the extents.
> > > > > > > 
> > > > > > 
> > > > > > Hmm, but doesn't a btree split require at least one full space btree
> > > > > > block per-level? In conjunction, the agfl minimum size requirement grows
> > > > > > with the height of the tree, which implies available free space..? I
> > > > > > could be missing something, perhaps we have to account for the rmapbt in
> > > > > > that case as well? Regardless...
> > > > > > 
> > > > > > > I suppose one could try to avoid ENOSPC by pushing that longest extent
> > > > > > > in first (since we know that won't trigger a split), then reap the old
> > > > > > > alloc btree blocks, and then add everything else back in...
> > > > > > > 
> > > > > > 
> > > > > > I think it would be reasonable to seed the btree with the longest record
> > > > > > or some fixed number of longest records (~1/2 a root block, for example)
> > > > > > before making actual use of the btrees to reap the old blocks. I think
> > > > > > then you'd only have a very short window of a single block leak on a
> > > > > > poorly timed power loss and repair retry sequence before you start
> > > > > > actually freeing originally used space (which in practice, I think
> > > > > > solves the problem).
> > > > > > 
> > > > > > Given that we're starting from empty, I wonder if another option may be
> > > > > > to over fill the agfl with old btree blocks or something. The first real
> > > > > > free should shift enough blocks back into the btrees to ensure the agfl
> > > > > > can be managed from that point forward, right? That may be more work
> > > > > > than it's worth though and/or a job for another patch. (FWIW, we also
> > > > > > have that NOSHRINK agfl fixup flag for userspace repair.)
> > > > > 
> > > > > Yes, I'll give that a try tomorrow, now that I've finished porting all
> > > > > the 4.19 stuff to xfsprogs. :)
> > > > > 
> > > > > Looping back to something we discussed earlier in this thread, I'd
> > > > > prefer to hold off on converting the list of already-freed extents to
> > > > > xfs_bitmap because the same problem exists in all the repair functions
> > > > > of having to store a large number of records for the rebuilt btree, and
> > > > > maybe there's some way to <cough> use pageable memory for that, since
> > > > > the access patterns for that are append, sort, and iterate; for those
> > > > > three uses we don't necessarily require all the records to be in memory
> > > > > all the time.  For the allocbt repair I expect the free space records to
> > > > > be far more numerous than the list of old bnobt/cntbt blocks.
> > > > > 
> > > > 
> > > > Ok, it's fair enough that we'll probably want to find some kind of
> > > > generic, more efficient technique for handling this across the various
> > > > applicable repair algorithms.
> > > > 
> > > > One other high level thing that crossed my mind with regard to the
> > > > general btree reconstruction algorithms is whether we need to build up
> > > > this kind of central record list at all. For example, rather than slurp
> > > > up the entire list of btree records in-core, sort it and dump it back
> > > > out, could we take advantage of the fact that our existing on-disk
> > > > structure insertion mechanisms already handle out of order records
> > > > (simply stated, an extent free knows how to insert the associated record
> > > > at the right place in the space btrees)? For example, suppose we reset
> > > > the existing btrees first, then scanned the rmapbt and repopulated the
> > > > new btrees as records are discovered..?
> > > 
> > > I tried that in an earlier draft of the bnobt repair function.  The
> > > biggest problem with inserting as we go is dealing with the inevitable
> > > transaction rolls (right now we roll after every record insertion to avoid
> > > playing games with guessing how much reservation is left).  Btree
> > > cursor state can't survive transaction rolls because the transaction
> > > commit releases all the buffers that aren't bhold'en, and we can't bhold
> > > that many buffers across a _defer_finish.
> > > 
> > 
> > Ok, interesting.
> > 
> > Where do we need to run an xfs_defer_finish() during the reconstruction
> > sequence, btw?
> 
> Not here, as I'm sure you were thinking. :)  For the AG btrees
> themselves it's sufficient to roll the transaction.  I suppose we could
> simply have a xfs_btree_bhold function that would bhold every buffer so
> that a cursor could survive a roll.
> 
> Inode fork reconstruction is going to require _defer_finish, however.
> 

Ok, just wasn't sure if I missed something in the bits I've looked
through so far..

> > I thought that would only run on final commit as opposed to
> > intermediate rolls.
> 
> We could let the deferred items sit around until final commit, but I
> think I'd prefer to process them as soon as possible since iirc deferred
> items pin the log until they're finished.  I would hope that userspace
> isn't banging on the log while repair runs, but it's certainly possible.
> 

I was just surmising in general, not necessarily suggesting we change
behavior.

> > We could just try and make the automatic buffer relogging list a
> > dynamic allocation if there are enough held buffers in the
> > transaction.
> 
> Hmm.  Might be worth pursuing...
> 
> > > So, that early draft spent a lot of time tearing down and reconstructing
> > > rmapbt cursors since the standard _btree_query_all isn't suited to that
> > > kind of usage.  It was easily twice as slow on a RAM-backed disk just
> > > from the rmap cursor overhead and much more complex, so I rewrote it to
> > > be simpler.  I also have a slight preference for not touching anything
> > > until we're absolutely sure we have all the data we need to repair the
> > > structure.
> > > 
> > 
> > Yes, I think that is sane in principle. I'm just a bit concerned about
> > how reliable that xfs_repair-like approach will be in the kernel longer
> > term, particularly once we start having to deal with large filesystems
> > and limited or contended memory, etc. We already have xfs_repair users
> > that need to tweak settings because there isn't enough memory available
> > to repair the fs. Granted that is for fs-wide repairs and the flipside
> > is that we know a single AG can only be up to 1TB. It's certainly
> > possible that putting some persistent backing behind the in-core data is
> > enough to resolve the problem (and the current approach is certainly
> > reasonable enough to me for the initial implementation).
> > 
> > bjoin limitations aside, I wonder if a cursor roll mechanism that held
> > all of the cursor buffers, rolled the transaction and then rejoined all
> > said buffers would help us get around that. (Not sure I follow the early
> > prototype behavior, but it sounds like we had to restart the rmapbt
> > lookup over and over...).
> 
> Correct.
> 
> > Another caveat with that approach may be that I think we'd need to be
> > sure that the reconstruction operation doesn't ever need to update the
> > rmapbt while we're mid walk of the latter.
> 
> <nod> Looking even farther back in my notes, that was also an issue --
> fixing the free list causes blocks to go on or off the agfl, which
> causes rmapbt updates, which meant that the only way I could get
> in-place updates to work was to re-lookup where we were in the btree and
> also try to deal with any rmapbt entries that might have crept in as a
> result of the record insertion.
> 
> Getting the concurrency right for each repair function looked like a
> difficult problem to solve, but amassing all the records elsewhere and
> rebuilding was easy to understand.
> 

Yeah. This all points to this kind of strategy being too complex to be
worth the prospective benefits in the short term. Clearly we have
several, potentially tricky roadblocks to work through before this can
be made feasible. Thanks for the background, it's still useful to have
this context to compare with whatever we may have to do to support a
reclaimable memory approach.

> > That may be an issue for inode btree reconstruction, for example,
> > since it looks like inobt block allocation requires rmapbt updates.
> > We'd probably need some way to share (or invalidate) a cursor across
> > different contexts to deal with that.
> 
> I might pursue that strategy if we ever hit the point where we can't
> find space to store the records (see below).  Another option could be to
> divert all deferred items for an AG, build a replacement btree in new
> space, then finish all the deferred items... but that's starting to get
> into offlineable AGs, which is its own project that I want to tackle
> later.
> 
> (Not that much later, just not this cycle.)
> 

*nod*

> > > For other repair functions (like the data/attr fork repairs) we have to
> > > scan all the rmapbts for extents, and I'd prefer to lock those AGs only
> > > for as long as necessary to extract the extents we want.
> > > 
> > > > The obvious problem is that we still have some checks that allow the
> > > > whole repair operation to bail out before we determine whether we can
> > > > start to rebuild the on-disk btrees. These are things like making sure
> > > > we can actually read the associated rmapbt blocks (i.e., no read errors
> > > > or verifier failures), basic record sanity checks, etc. But ISTM that
> > > > isn't anything we couldn't get around with a multi-pass implementation.
> > > > Secondary issues might be things like no longer being able to easily
> > > > insert the longest free extent range(s) first (meaning we'd have to
> > > > stuff the agfl with old btree blocks or figure out some other approach).
> > > 
> > > Well, you could scan the rmapbt twice -- once to find the longest
> > > record, then again to do the actual insertion.
> > > 
> > 
> > Yep, that's what I meant by multi-pass.
> > 
> > > > BTW, isn't the typical scrub sequence already multi-pass by virtue of
> > > > the xfs_scrub_metadata() implementation? I'm wondering if the ->scrub()
> > > > callout could not only detect corruption, but validate whether repair
> > > > (if requested) is possible based on the kind of checks that are
> > > > currently in the repair side rmapbt walkers. Thoughts?
> > > 
> > > Yes, scrub basically validates that for us now, with the notable
> > > exception of the notorious rmapbt scrubber, which doesn't
> > > cross-reference with inode block mappings because that would be a
> > > locking nightmare.
> > > 
> > > > Are there future
> > > > changes that are better supported by an in-core tracking structure in
> > > > general (assuming we'll eventually replace the linked lists with
> > > > something more efficient) as opposed to attempting to optimize out the
> > > > need for that tracking at all?
> > > 
> > > Well, I was thinking that we could just allocate a memfd (or a file on
> > > the same xfs once we have AG offlining) and store the records in there.
> > > That saves us the list_head overhead and potentially enables access to a
> > > lot more storage than pinning things in RAM.
> > > 
> > 
> > Would using the same fs mean we have to store the repair data in a
> > separate AG, or somehow locate/use free space in the target AG?
> 
> As part of building an "offline AG" feature we'd presumably have to
> teach the allocators to avoid the offline AGs for allocations, which
> would make it so that we could host the repair data files in the same
> XFS that's being fixed.  That seems a little risky to me, but the disk
> is probably larger than mem+swap.
> 

Got it, so we'd use the remaining space in the fs outside of the target
AG. ISTM that still presumes the rest of the fs is coherent, but I
suppose the offline AG thing helps us with that. We'd just have to make
sure we've shut down all currently corrupted AGs before we start to
repair a particular corrupted one, and then hope there's still enough
free space in the fs to proceed.

That makes more sense, but I still agree that it seems risky in general.
Technical risk aside, there are also usability concerns in that the local
free space requirement is another bit of non-determinism around the
ability to online repair vs. having to punt to xfs_repair, or if the
repair consumes whatever free space remains in the fs to the detriment
of whatever workload the user presumably wanted to keep the fs online
for, etc.

> > presume either way we'd have to ensure that AG is either consistent or
> > locked out from outside I/O. If we have the total record count we can
> 
> We usually don't, but for the btrees that have their own record/blocks
> counters we might be able to guess a number, fallocate it, and see if
> that doesn't ENOSPC.
> 
> > preallocate the file and hope there is no such other free space
> > corruption or something that would allow some other task to mess with
> > our blocks. I'm a little skeptical overall on relying on a corrupted
> > filesystem to store repair data, but perhaps there are ways to mitigate
> > the risks.
> 
> Store it elsewhere?  /home for root repairs, /root for any other
> repair... though if we're going to do that, why not just add a swap file
> temporarily?
> 

Indeed. The thought crossed my mind about whether we could do something
like have an internal/isolated swap file for dedicated XFS allocations
to avoid contention with the traditional swap. Userspace could somehow
set it up or communicate to the kernel. I have no idea how realistic
that is though or if there's a better interface for that kind of thing
(i.e., file backed kmem cache?). What _seems_ beneficial about that
approach is we get (potentially external) persistent backing and memory
reclaim ability with the traditional memory allocation model.

ISTM that if we used a regular file, we'd need to deal with the
traditional file interface somehow or another (file read/pagecache
lookup -> record ??). We could repurpose some existing mechanism like
the directory code or quota inode mechanism to use xfs buffers for that
purpose, but I think that would require us to always use an internal
inode. Allowing userspace to pass an fd/file passes that consideration
on to the user, which might be more flexible. We could always warn about
additional limitations if that fd happens to be based on the target fs.

> > I'm not familiar with memfd. The manpage suggests it's ram backed, is it
> > swappable or something?
> 
> It's supposed to be.  The quick test I ran (allocate a memfd, write 1GB
> of junk to it on a VM with 400M of RAM) seemed to push about 980MB into
> the swap file.
> 

Ok.

> > If so, that sounds a reasonable option provided the swap space
> > requirement can be made clear to users
> 
> We can document it.  I don't think it's any worse than xfs_repair being
> able to use up all the memory + swap... and since we're probably only
> going to be repairing one thing at a time, most likely scrub won't need
> as much memory.
> 

Right, but as noted below, my concerns with the xfs_repair comparison
are that 1.) the kernel generally has more of a limit on anonymous
memory allocations than userspace (i.e., not swappable AFAIU?) and 2.)
it's not clear how the system will behave, from a failure perspective,
if the kernel effectively runs it out of memory.

IOW, xfs_repair can run the system out of memory but for the most part
that ends up being a simple problem for the system: OOM kill the bloated
xfs_repair process. For an online repair in a similar situation, I have
no idea what's going to happen. The hope is that the online repair hits
-ENOMEM and unwinds, but ISTM we'd still be at risk of other subsystems
running into memory allocation problems, filling up swap, the OOM killer
going after unrelated processes, etc.

What if, for example, the OOM killer starts picking off processes in
service to a running online repair that immediately consumes freed up
memory until the system is borked? I don't know how likely that is or if
it really ends up much different from the analogous xfs_repair
situation. My only point right now is that failure scenario is something
we should explore for any solution we ultimately consider because it may
be an unexpected use case of the underlying mechanism. (By contrast,
just using a cached file seems a natural fit from that
perspective.)

> > and the failure characteristics aren't more severe than for userspace.
> > An online repair that puts the broader system at risk of OOM as
> > opposed to predictably failing gracefully may not be the most useful
> > tool.
> 
> Agreed.  One huge downside of memfd seems to be the lack of a mechanism
> for the vm to push back on us if we successfully write all we need to
> the memfd but then other processes need some memory.  Obviously, if the
> memfd write itself comes up short or fails then we dump the memfd and
> error back to userspace.  We might simply have to free array memory
> while we iterate the records to minimize the time spent at peak memory
> usage.
> 

Hm, yeah. Some kind of fixed/relative size in-core memory pool approach
may simplify things because we could allocate it up front and know right
away whether we just don't have enough memory available to repair.

Brian

> --D
> 
> > 
> > Brian
> > 
> > > --D
> > > 
> > > > Brian
> > > > 
> > > > > --D
> > > > > 
> > > > > > Brian
> > > > > > 
> > > > > > > --D
> > > > > > > 
> > > > > > > > Brian
> > > > > > > > 
> > > > > > > > > > > +
> > > > > > > > > > > +done:
> > > > > > > > > > > +	/* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */
> > > > > > > > > > > +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> > > > > > > > > > > +	return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo,
> > > > > > > > > > > +			XFS_AG_RESV_NONE);
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > ...
> > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > index 0ed68379e551..82f99633a597 100644
> > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp(
> > > > > > > > > > >  		diff = b1->bno - b2->bno;
> > > > > > > > > > >  	return diff;
> > > > > > > > > > >  }
> > > > > > > > > > > +
> > > > > > > > > > > +/* Are there any busy extents in this AG? */
> > > > > > > > > > > +bool
> > > > > > > > > > > +xfs_extent_busy_list_empty(
> > > > > > > > > > > +	struct xfs_perag	*pag)
> > > > > > > > > > > +{
> > > > > > > > > > > +	spin_lock(&pag->pagb_lock);
> > > > > > > > > > > +	if (pag->pagb_tree.rb_node) {
> > > > > > > > > > 
> > > > > > > > > > RB_EMPTY_ROOT()?
> > > > > > > > > 
> > > > > > > > > Good suggestion, thank you!
> > > > > > > > > 
> > > > > > > > > --D
> > > > > > > > > 
> > > > > > > > > > Brian
> > > > > > > > > > 
> > > > > > > > > > > +		spin_unlock(&pag->pagb_lock);
> > > > > > > > > > > +		return false;
> > > > > > > > > > > +	}
> > > > > > > > > > > +	spin_unlock(&pag->pagb_lock);
> > > > > > > > > > > +	return true;
> > > > > > > > > > > +}
> > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > index 990ab3891971..2f8c73c712c6 100644
> > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
> > > > > > > > > > >  	list_sort(NULL, list, xfs_extent_busy_ag_cmp);
> > > > > > > > > > >  }
> > > > > > > > > > >  
> > > > > > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
> > > > > > > > > > > +
> > > > > > > > > > >  #endif /* __XFS_EXTENT_BUSY_H__ */
> > > > > > > > > > > 
> > > > > > > > > > > --
> > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > --
> > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > --
> > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > --
> > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > --
> > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > --
> > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > the body of a message to majordomo@vger.kernel.org
> > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > --
> > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > the body of a message to majordomo@vger.kernel.org
> > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Aug. 8, 2018, 10:42 p.m. UTC | #12
On Wed, Aug 08, 2018 at 08:29:54AM -0400, Brian Foster wrote:
> On Tue, Aug 07, 2018 at 04:34:58PM -0700, Darrick J. Wong wrote:
> > On Fri, Aug 03, 2018 at 06:49:40AM -0400, Brian Foster wrote:
> > > On Thu, Aug 02, 2018 at 12:22:05PM -0700, Darrick J. Wong wrote:
> > > > On Thu, Aug 02, 2018 at 09:48:24AM -0400, Brian Foster wrote:
> > > > > On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote:
> > > > > > On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote:
> > > > > > > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote:
> > > > > > > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote:
> > > > > > > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote:
> > > > > > > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote:
> > > > > > > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote:
> > > > > > > > > > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > > > > > > > > 
> > > > > > > > > > > > Rebuild the free space btrees from the gaps in the rmap btree.
> > > > > > > > > > > > 
> > > > > > > > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > > > > > > > > ---
> > > > > > > > > > > >  fs/xfs/Makefile             |    1 
> > > > > > > > > > > >  fs/xfs/scrub/alloc.c        |    1 
> > > > > > > > > > > >  fs/xfs/scrub/alloc_repair.c |  581 +++++++++++++++++++++++++++++++++++++++++++
> > > > > > > > > > > >  fs/xfs/scrub/common.c       |    8 +
> > > > > > > > > > > >  fs/xfs/scrub/repair.h       |    2 
> > > > > > > > > > > >  fs/xfs/scrub/scrub.c        |    4 
> > > > > > > > > > > >  fs/xfs/scrub/trace.h        |    2 
> > > > > > > > > > > >  fs/xfs/xfs_extent_busy.c    |   14 +
> > > > > > > > > > > >  fs/xfs/xfs_extent_busy.h    |    2 
> > > > > > > > > > > >  9 files changed, 610 insertions(+), 5 deletions(-)
> > > > > > > > > > > >  create mode 100644 fs/xfs/scrub/alloc_repair.c
> > > > > > > > > > > > 
> > > > > > > > > > > > 
> > > > > > > > > > > ...
> > > > > > > > > > > > diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
> > > > > > > > > > > > new file mode 100644
> > > > > > > > > > > > index 000000000000..b228c2906de2
> > > > > > > > > > > > --- /dev/null
> > > > > > > > > > > > +++ b/fs/xfs/scrub/alloc_repair.c
> > > > > > > > > > > > @@ -0,0 +1,581 @@
> > > > > > > ...
> > > > > > > > > > > > +
> > > > > > > > > > > > +/*
> > > > > > > > > > > > + * Make our new freespace btree roots permanent so that we can start freeing
> > > > > > > > > > > > + * unused space back into the AG.
> > > > > > > > > > > > + */
> > > > > > > > > > > > +STATIC int
> > > > > > > > > > > > +xrep_abt_commit_new(
> > > > > > > > > > > > +	struct xfs_scrub	*sc,
> > > > > > > > > > > > +	struct xfs_bitmap	*old_allocbt_blocks,
> > > > > > > > > > > > +	int			log_flags)
> > > > > > > > > > > > +{
> > > > > > > > > > > > +	int			error;
> > > > > > > > > > > > +
> > > > > > > > > > > > +	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags);
> > > > > > > > > > > > +
> > > > > > > > > > > > +	/* Invalidate the old freespace btree blocks and commit. */
> > > > > > > > > > > > +	error = xrep_invalidate_blocks(sc, old_allocbt_blocks);
> > > > > > > > > > > > +	if (error)
> > > > > > > > > > > > +		return error;
> > > > > > > > > > > 
> > > > > > > > > > > It looks like the above invalidation all happens in the same
> > > > > > > > > > > transaction. Those aren't logging buffer data or anything, but any idea
> > > > > > > > > > > how many log formats we can get away with in this single transaction?
> > > > > > > > > > 
> > > > > > > > > > Hm... well, on my computer a log format is ~88 bytes.  Assuming 4K
> > > > > > > > > > blocks, the max AG size of 1TB, maximum free space fragmentation, and
> > > > > > > > > > two btrees, the tree could be up to ~270 million records.  Assuming ~505
> > > > > > > > > > records per block, that's ... ~531,000 leaf blocks and ~1100 node blocks
> > > > > > > > > > for both btrees.  If we invalidate both, that's ~46M of RAM?
> > > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > I was thinking more about transaction reservation than RAM. It may not
> > > > > > > > 
> > > > > > > > Hmm.  tr_itruncate is ~650K on my 2TB SSD, assuming 88 bytes per, that's
> > > > > > > > about ... ~7300 log format items?  Not a lot, maybe it should roll the
> > > > > > > > transaction every 1000 invalidations or so...
> > > > > > > > 
> > > > > > > 
> > > > > > > I'm not really sure what qualifies as a lot here given that the blocks
> > > > > > > would need to be in-core, but rolling on some fixed/safe interval sounds
> > > > > > > reasonable to me.
> > > > > > > 
> > > > > > > > > currently be an issue, but it might be worth putting something down in a
> > > > > > > > > comment to note that this is a single transaction and we expect to not
> > > > > > > > > have to invalidate more than N (ballpark) blocks in a single go,
> > > > > > > > > whatever that value happens to be.
> > > > > > > > > 
> > > > > > > > > > > > +	error = xrep_roll_ag_trans(sc);
> > > > > > > > > > > > +	if (error)
> > > > > > > > > > > > +		return error;
> > > > > > > > > > > > +
> > > > > > > > > > > > +	/* Now that we've succeeded, mark the incore state valid again. */
> > > > > > > > > > > > +	sc->sa.pag->pagf_init = 1;
> > > > > > > > > > > > +	return 0;
> > > > > > > > > > > > +}
> > > > > > > > > > > > +
> > > > > > > > > > > > +/* Build new free space btrees and dispose of the old one. */
> > > > > > > > > > > > +STATIC int
> > > > > > > > > > > > +xrep_abt_rebuild_trees(
> > > > > > > > > > > > +	struct xfs_scrub	*sc,
> > > > > > > > > > > > +	struct list_head	*free_extents,
> > > > > > > > > > > > +	struct xfs_bitmap	*old_allocbt_blocks)
> > > > > > > > > > > > +{
> > > > > > > > > > > > +	struct xfs_owner_info	oinfo;
> > > > > > > > > > > > +	struct xrep_abt_extent	*rae;
> > > > > > > > > > > > +	struct xrep_abt_extent	*n;
> > > > > > > > > > > > +	struct xrep_abt_extent	*longest;
> > > > > > > > > > > > +	int			error;
> > > > > > > > > > > > +
> > > > > > > > > > > > +	xfs_rmap_skip_owner_update(&oinfo);
> > > > > > > > > > > > +
> > > > > > > > > > > > +	/*
> > > > > > > > > > > > +	 * Insert the longest free extent in case it's necessary to
> > > > > > > > > > > > +	 * refresh the AGFL with multiple blocks.  If there is no longest
> > > > > > > > > > > > +	 * extent, we had exactly the free space we needed; we're done.
> > > > > > > > > > > > +	 */
> > > > > > > > > > > 
> > > > > > > > > > > I'm confused by the last sentence. longest should only be NULL if the
> > > > > > > > > > > free space list is empty and haven't we already bailed out with -ENOSPC
> > > > > > > > > > > if that's the case?
> > > > > > > > > > > 
> > > > > > > > > > > > +	longest = xrep_abt_get_longest(free_extents);
> > > > > > > > > > 
> > > > > > > > > > xrep_abt_rebuild_trees is called after we allocate and initialize two
> > > > > > > > > > new btree roots in xrep_abt_reset_btrees.  If free_extents is an empty
> > > > > > > > > > list here, then we found exactly two blocks worth of free space and used
> > > > > > > > > > them to set up new btree roots.
> > > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > Got it, thanks.
> > > > > > > > > 
> > > > > > > > > > > > +	if (!longest)
> > > > > > > > > > > > +		goto done;
> > > > > > > > > > > > +	error = xrep_abt_free_extent(sc,
> > > > > > > > > > > > +			XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno),
> > > > > > > > > > > > +			longest->len, &oinfo);
> > > > > > > > > > > > +	list_del(&longest->list);
> > > > > > > > > > > > +	kmem_free(longest);
> > > > > > > > > > > > +	if (error)
> > > > > > > > > > > > +		return error;
> > > > > > > > > > > > +
> > > > > > > > > > > > +	/* Insert records into the new btrees. */
> > > > > > > > > > > > +	list_for_each_entry_safe(rae, n, free_extents, list) {
> > > > > > > > > > > > +		error = xrep_abt_free_extent(sc,
> > > > > > > > > > > > +				XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno),
> > > > > > > > > > > > +				rae->len, &oinfo);
> > > > > > > > > > > > +		if (error)
> > > > > > > > > > > > +			return error;
> > > > > > > > > > > > +		list_del(&rae->list);
> > > > > > > > > > > > +		kmem_free(rae);
> > > > > > > > > > > > +	}
> > > > > > > > > > > 
> > > > > > > > > > > Ok, at this point we've reset the btree roots and we start freeing the
> > > > > > > > > > > free ranges that were discovered via the rmapbt analysis. AFAICT, if we
> > > > > > > > > > > fail or crash at this point, we leave the allocbts in a partially
> > > > > > > > > > > constructed state. I take it that is Ok with respect to the broader
> > > > > > > > > > > repair algorithm because we'd essentially start over by inspecting the
> > > > > > > > > > > rmapbt again on a retry.
> > > > > > > > > > 
> > > > > > > > > > Right.  Though in the crash/shutdown case, you'll end up with the
> > > > > > > > > > filesystem in an offline state at some point before you can retry the
> > > > > > > > > > scrub, so it's probably faster to run xfs_repair to fix the damage.
> > > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > Can we really assume that if we're already up and running an online
> > > > > > > > > repair? The filesystem has to be mountable in that case in the first
> > > > > > > > > place. If we've already reset and started reconstructing the allocation
> > > > > > > > > btrees then I'd think those transactions would recover just fine on a
> > > > > > > > > power loss or something (perhaps not in the event of some other
> > > > > > > > > corruption related shutdown).
> > > > > > > > 
> > > > > > > > Right, for the system crash case, whatever transactions committed should
> > > > > > > > replay just fine, and you can even start up the online repair again, and
> > > > > > > > if the AG isn't particularly close to ENOSPC then (barring rmap
> > > > > > > > corruption) it should work just fine.
> > > > > > > > 
> > > > > > > > If the fs went down because either (a) repair hit other corruption or
> > > > > > > > (b) some other thread hit an error in some other part of the filesystem,
> > > > > > > > then it's not so clear -- in (b) you could probably try again, but for
> > > > > > > > (a) you'll definitely have to unmount and run xfs_repair.
> > > > > > > > 
> > > > > > > 
> > > > > > > Indeed, there are certainly cases where we simply won't be able to do an
> > > > > > > online repair. I'm trying to think about scenarios where we should be
> > > > > > > able to do an online repair, but we lose power or hit some kind of
> > > > > > > transient error like a memory allocation failure before it completes. It
> > > > > > > would be nice if the online repair itself didn't contribute (within
> > > > > > > reason) to the inability to simply try again just because the fs was
> > > > > > > close to -ENOSPC.
> > > > > > 
> > > > > > Agreed.  Most of the, uh, opportunities to hit ENOMEM happen before we
> > > > > > start modifying on-disk metadata.  If that happens, we just free all the
> > > > > > memory and bail out having done nothing.
> > > > > > 
> > > > > > > For one, I think it's potentially confusing behavior. Second, it might
> > > > > > > be concerning to regular users who perceive it as an online repair
> > > > > > > leaving the fs in a worse off state. Us fs devs know that may not really
> > > > > > > be the case, but I think we're better for addressing it if we can
> > > > > > > reasonably do so.
> > > > > > 
> > > > > > <nod> Further in the future I want to add the ability to offline an AG,
> > > > > > so the worst that happens is that scrub turns the AG off, repair doesn't
> > > > > > fix it, and the AG simply stays offline.  That might give us the
> > > > > > ability to survive cancelling the repair transaction, since if the AG's
> > > > > > offline already anyway we could just throw away the dirty buffers and
> > > > > > resurrect the AG later.  I don't know, that's purely speculative.
> > > > > > 
> > > > > > > > Perhaps the guideline here is that if the fs goes down more than once
> > > > > > > > during online repair then unmount it and run xfs_repair.
> > > > > > > > 
> > > > > > > 
> > > > > > > Yep, I think that makes sense if the filesystem or repair itself is
> > > > > > > tripping over other corruptions that fail to keep it active for the
> > > > > > > duration of the repair.
> > > > > > 
> > > > > > <nod>
> > > > > > 
> > > > > > > > > > > The blocks allocated for the btrees that we've begun to construct here
> > > > > > > > > > > end up mapped in the rmapbt as we go, right? IIUC, that means we don't
> > > > > > > > > > > necessarily have infinite retries to make sure this completes. IOW,
> > > > > > > > > > > suppose that a first repair attempt finds just enough free space to
> > > > > > > > > > > construct new trees, gets far enough along to consume most of that free
> > > > > > > > > > > space and then crashes. Is it possible that a subsequent repair attempt
> > > > > > > > > > > includes the btree blocks allocated during the previous failed repair
> > > > > > > > > > > attempt in the sum of "old btree blocks" and determines we don't have
> > > > > > > > > > > enough free space to repair?
> > > > > > > > > > 
> > > > > > > > > > Yes, that's a risk of running the free space repair.
> > > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > Can we improve on that? For example, are the rmapbt entries for the old
> > > > > > > > > allocation btree blocks necessary once we commit the btree resets? If
> > > > > > > > > not, could we remove those entries before we start tree reconstruction?
> > > > > > > > > 
> > > > > > > > > Alternatively, could we incorporate use of the old btree blocks? As it
> > > > > > > > > is, we discover those blocks simply so we can free them at the end.
> > > > > > > > > Perhaps we could free them sooner or find a more clever means to
> > > > > > > > > reallocate directly from that in-core list? I guess we have to consider
> > > > > > > > > whether they were really valid/sane btree blocks, but either way ISTM
> > > > > > > > > that the old blocks list is essentially invalidated once we reset the
> > > > > > > > > btrees.
> > > > > > > > 
> > > > > > > > Hmm, it's a little tricky to do that -- we could reap the old bnobt and
> > > > > > > > cntbt blocks (in the old_allocbt_blocks bitmap) first, but if adding a
> > > > > > > > record causes a btree split we'll pull blocks from the AGFL, and if
> > > > > > > > there aren't enough blocks in the bnobt to fill the AGFL back up then
> > > > > > > > fix_freelist won't succeed.  That complication is why it finds the
> > > > > > > > longest extent in the unclaimed list and pushes that in first, then
> > > > > > > > works on the rest of the extents.
> > > > > > > > 
> > > > > > > 
> > > > > > > Hmm, but doesn't a btree split require at least one full space btree
> > > > > > > block per-level? In conjunction, the agfl minimum size requirement grows
> > > > > > > with the height of the tree, which implies available free space..? I
> > > > > > > could be missing something, perhaps we have to account for the rmapbt in
> > > > > > > that case as well? Regardless...
> > > > > > > 
> > > > > > > > I suppose one could try to avoid ENOSPC by pushing that longest extent
> > > > > > > > in first (since we know that won't trigger a split), then reap the old
> > > > > > > > alloc btree blocks, and then add everything else back in...
> > > > > > > > 
> > > > > > > 
> > > > > > > I think it would be reasonable to seed the btree with the longest record
> > > > > > > or some fixed number of longest records (~1/2 a root block, for example)
> > > > > > > before making actual use of the btrees to reap the old blocks. I think
> > > > > > > then you'd only have a very short window of a single block leak on a
> > > > > > > poorly timed power loss and repair retry sequence before you start
> > > > > > > actually freeing originally used space (which in practice, I think
> > > > > > > solves the problem).
> > > > > > > 
> > > > > > > Given that we're starting from empty, I wonder if another option may be
> > > > > > > to over fill the agfl with old btree blocks or something. The first real
> > > > > > > free should shift enough blocks back into the btrees to ensure the agfl
> > > > > > > can be managed from that point forward, right? That may be more work
> > > > > > > than it's worth though and/or a job for another patch. (FWIW, we also
> > > > > > > have that NOSHRINK agfl fixup flag for userspace repair.)
> > > > > > 
> > > > > > Yes, I'll give that a try tomorrow, now that I've finished porting all
> > > > > > the 4.19 stuff to xfsprogs. :)
> > > > > > 
> > > > > > Looping back to something we discussed earlier in this thread, I'd
> > > > > > prefer to hold off on converting the list of already-freed extents to
> > > > > > xfs_bitmap because the same problem exists in all the repair functions
> > > > > > of having to store a large number of records for the rebuilt btree, and
> > > > > > maybe there's some way to <cough> use pageable memory for that, since
> > > > > > the access patterns for that are append, sort, and iterate; for those
> > > > > > three uses we don't necessarily require all the records to be in memory
> > > > > > all the time.  For the allocbt repair I expect the free space records to
> > > > > > be far more numerous than the list of old bnobt/cntbt blocks.
> > > > > > 
> > > > > 
> > > > > Ok, it's fair enough that we'll probably want to find some kind of
> > > > > generic, more efficient technique for handling this across the various
> > > > > applicable repair algorithms.
> > > > > 
> > > > > One other high level thing that crossed my mind with regard to the
> > > > > general btree reconstruction algorithms is whether we need to build up
> > > > > this kind of central record list at all. For example, rather than slurp
> > > > > up the entire list of btree records in-core, sort it and dump it back
> > > > > out, could we take advantage of the fact that our existing on-disk
> > > > > structure insertion mechanisms already handle out of order records
> > > > > (simply stated, an extent free knows how to insert the associated record
> > > > > at the right place in the space btrees)? For example, suppose we reset
> > > > > the existing btrees first, then scanned the rmapbt and repopulated the
> > > > > new btrees as records are discovered..?
> > > > 
> > > > I tried that in an earlier draft of the bnobt repair function.  The
> > > > biggest problem with inserting as we go is dealing with the inevitable
> > > > transaction rolls (right now we roll after every record insertion to avoid
> > > > playing games with guessing how much reservation is left).  Btree
> > > > cursor state can't survive transaction rolls because the transaction
> > > > commit releases all the buffers that aren't bhold'en, and we can't bhold
> > > > that many buffers across a _defer_finish.
> > > > 
> > > 
> > > Ok, interesting.
> > > 
> > > Where do we need to run an xfs_defer_finish() during the reconstruction
> > > sequence, btw?
> > 
> > Not here, as I'm sure you were thinking. :)  For the AG btrees
> > themselves it's sufficient to roll the transaction.  I suppose we could
> > simply have a xfs_btree_bhold function that would bhold every buffer so
> > that a cursor could survive a roll.
> > 
> > Inode fork reconstruction is going to require _defer_finish, however.
> > 
> 
> Ok, just wasn't sure if I missed something in the bits I've looked
> through so far..
> 
> > > I thought that would only run on final commit as opposed to
> > > intermediate rolls.
> > 
> > We could let the deferred items sit around until final commit, but I
> > think I'd prefer to process them as soon as possible since iirc deferred
> > items pin the log until they're finished.  I would hope that userspace
> > isn't banging on the log while repair runs, but it's certainly possible.
> > 
> 
> I was just surmising in general, not necessarily suggesting we change
> behavior.

Oh, ok.  Sorry, I misinterpreted you. :)

> > > We could just try and make the automatic buffer relogging list a
> > > dynamic allocation if there are enough held buffers in the
> > > transaction.
> > 
> > Hmm.  Might be worth pursuing...
> > 
> > > > So, that early draft spent a lot of time tearing down and reconstructing
> > > > rmapbt cursors since the standard _btree_query_all isn't suited to that
> > > > kind of usage.  It was easily twice as slow on a RAM-backed disk just
> > > > from the rmap cursor overhead and much more complex, so I rewrote it to
> > > > be simpler.  I also have a slight preference for not touching anything
> > > > until we're absolutely sure we have all the data we need to repair the
> > > > structure.
> > > > 
> > > 
> > > Yes, I think that is sane in principle. I'm just a bit concerned about
> > > how reliable that xfs_repair-like approach will be in the kernel longer
> > > term, particularly once we start having to deal with large filesystems
> > > and limited or contended memory, etc. We already have xfs_repair users
> > > that need to tweak settings because there isn't enough memory available
> > > to repair the fs. Granted that is for fs-wide repairs and the flipside
> > > is that we know a single AG can only be up to 1TB. It's certainly
> > > possible that putting some persistent backing behind the in-core data is
> > > enough to resolve the problem (and the current approach is certainly
> > > reasonable enough to me for the initial implementation).
> > > 
> > > bjoin limitations aside, I wonder if a cursor roll mechanism that held
> > > all of the cursor buffers, rolled the transaction and then rejoined all
> > > said buffers would help us get around that. (Not sure I follow the early
> > > prototype behavior, but it sounds like we had to restart the rmapbt
> > > lookup over and over...).
> > 
> > Correct.
> > 
> > > Another caveat with that approach may be that I think we'd need to be
> > > sure that the reconstruction operation doesn't ever need to update the
> > > rmapbt while we're mid walk of the latter.
> > 
> > <nod> Looking even farther back in my notes, that was also an issue --
> > fixing the free list causes blocks to go on or off the agfl, which
> > causes rmapbt updates, which meant that the only way I could get
> > in-place updates to work was to re-lookup where we were in the btree and
> > also try to deal with any rmapbt entries that might have crept in as
> > a result of the record insertion.
> > 
> > Getting the concurrency right for each repair function looked like a
> > difficult problem to solve, but amassing all the records elsewhere and
> > rebuilding was easy to understand.
> > 
> 
> Yeah. This all points to this kind of strategy being too complex to be
> worth the prospective benefits in the short term. Clearly we have
> several, potentially tricky roadblocks to work through before this can
> be made feasible. Thanks for the background, it's still useful to have
> this context to compare with whatever we may have to do to support a
> reclaimable memory approach.

<nod>  Reclaimable memfd "memory" isn't too difficult, we can call
kernel_read and kernel_write, though lockdep gets pretty mad about xfs
taking sb_start_write (on the memfd filesystem) at the same time it has
sb_start_write on the xfs (not to mention the stack usage) so I had to
throw in the extra twist of delegating the actual file io to a workqueue
item (a la xfs_btree_split).

> > > That may be an issue for inode btree reconstruction, for example,
> > > since it looks like inobt block allocation requires rmapbt updates.
> > > We'd probably need some way to share (or invalidate) a cursor across
> > > different contexts to deal with that.
> > 
> > I might pursue that strategy if we ever hit the point where we can't
> > find space to store the records (see below).  Another option could be to
> > divert all deferred items for an AG, build a replacement btree in new
> > space, then finish all the deferred items... but that's starting to get
> > into offlineable AGs, which is its own project that I want to tackle
> > later.
> > 
> > (Not that much later, just not this cycle.)
> > 
> 
> *nod*
> 
> > > > For other repair functions (like the data/attr fork repairs) we have to
> > > > scan all the rmapbts for extents, and I'd prefer to lock those AGs only
> > > > for as long as necessary to extract the extents we want.
> > > > 
> > > > > The obvious problem is that we still have some checks that allow the
> > > > > whole repair operation to bail out before we determine whether we can
> > > > > start to rebuild the on-disk btrees. These are things like making sure
> > > > > we can actually read the associated rmapbt blocks (i.e., no read errors
> > > > > or verifier failures), basic record sanity checks, etc. But ISTM that
> > > > > isn't anything we couldn't get around with a multi-pass implementation.
> > > > > Secondary issues might be things like no longer being able to easily
> > > > > insert the longest free extent range(s) first (meaning we'd have to
> > > > > stuff the agfl with old btree blocks or figure out some other approach).
> > > > 
> > > > Well, you could scan the rmapbt twice -- once to find the longest
> > > > record, then again to do the actual insertion.
> > > > 
> > > 
> > > Yep, that's what I meant by multi-pass.
> > > 
> > > > > BTW, isn't the typical scrub sequence already multi-pass by virtue of
> > > > > the xfs_scrub_metadata() implementation? I'm wondering if the ->scrub()
> > > > > callout could not only detect corruption, but validate whether repair
> > > > > (if requested) is possible based on the kind of checks that are
> > > > > currently in the repair side rmapbt walkers. Thoughts?
> > > > 
> > > > Yes, scrub basically validates that for us now, with the notable
> > > > exception of the notorious rmapbt scrubber, which doesn't
> > > > cross-reference with inode block mappings because that would be a
> > > > locking nightmare.
> > > > 
> > > > > Are there future
> > > > > changes that are better supported by an in-core tracking structure in
> > > > > general (assuming we'll eventually replace the linked lists with
> > > > > something more efficient) as opposed to attempting to optimize out the
> > > > > need for that tracking at all?
> > > > 
> > > > Well, I was thinking that we could just allocate a memfd (or a file on
> > > > the same xfs once we have AG offlining) and store the records in there.
> > > > That saves us the list_head overhead and potentially enables access to a
> > > > lot more storage than pinning things in RAM.
> > > > 
> > > 
> > > Would using the same fs mean we have to store the repair data in a
> > > separate AG, or somehow locate/use free space in the target AG?
> > 
> > As part of building an "offline AG" feature we'd presumably have to
> > teach the allocators to avoid the offline AGs for allocations, which
> > would make it so that we could host the repair data files in the same
> > XFS that's being fixed.  That seems a little risky to me, but the disk
> > is probably larger than mem+swap.
> > 
> 
> Got it, so we'd use the remaining space in the fs outside of the target
> AG. ISTM that still presumes the rest of the fs is coherent, but I
> suppose the offline AG thing helps us with that. We'd just have to make
> sure we've shut down all currently corrupted AGs before we start to
> repair a particular corrupted one, and then hope there's still enough
> free space in the fs to proceed.

That's a pretty big hope. :)  I think for now 

> That makes more sense, but I still agree that it seems risky in general.
> Technical risk aside, there's also usability concerns in that the local
> free space requirement is another bit of non-determinism

I don't think it's non-deterministic, it's just hard for the filesystem
to communicate to the user/admin ahead of time.  Roughly speaking, we
need to have about as much disk space for the new btree as we had
allocated for the old one.

As far as memory requirements go, in last week's revising of the patches
I compressed the in-memory record structs down about as far as possible;
with the removal of the list heads, the memory requirements drop by
30-60%.  We require the same amount of memory as would be needed to
store all of the records in the leaf nodes, and no more, and we can use
swap space to do it.

> around the ability to online repair vs. having to punt to xfs_repair,
> or if the repair consumes whatever free space remains in the fs to the
> detriment of whatever workload the user presumably wanted to keep the
> fs online for, etc.

I've occasionally thought that future xfs_scrub could ask the kernel to
estimate how much disk and memory it will need for the repair (and
whether the disk space requirement is fs-scope or AG-scope); then it
could forego a repair action and recommend xfs_repair if running the
online repair would take the system below some configurable threshold.

> > > presume either way we'd have to ensure that AG is either consistent or
> > > locked out from outside I/O. If we have the total record count we can
> > 
> > We usually don't, but for the btrees that have their own record/blocks
> > counters we might be able to guess a number, fallocate it, and see if
> > that doesn't ENOSPC.
> > 
> > > preallocate the file and hope there is no such other free space
> > > corruption or something that would allow some other task to mess with
> > > our blocks. I'm a little skeptical overall on relying on a corrupted
> > > filesystem to store repair data, but perhaps there are ways to mitigate
> > > the risks.
> > 
> > Store it elsewhere?  /home for root repairs, /root for any other
> > repair... though if we're going to do that, why not just add a swap file
> > temporarily?
> > 
> 
> Indeed. The thought crossed my mind about whether we could do something
> like have an internal/isolated swap file for dedicated XFS allocations
> to avoid contention with the traditional swap.

Heh, I think e2fsck has some feature like that where you can pass it a
swap file.  No idea how much good that does on modern systems where
there's one huge partition... :)

> Userspace could somehow set it up or communicate to the kernel. I have
> no idea how realistic that is though or if there's a better interface
> for that kind of thing (i.e., file backed kmem cache?).

I looked, and there aren't any other mechanisms for unpinned kernel
memory allocations.

> What _seems_ beneficial about that approach is we get (potentially
> external) persistent backing and memory reclaim ability with the
> traditional memory allocation model.
>
> ISTM that if we used a regular file, we'd need to deal with the
> traditional file interface somehow or another (file read/pagecache
> lookup -> record ??).

Yes, that's all neatly wrapped up in kernel_read() and kernel_write() so
all we need is a (struct file *).

> We could repurpose some existing mechanism like the directory code or
> quota inode mechanism to use xfs buffers for that purpose, but I think
> that would require us to always use an internal inode. Allowing
> userspace to pass an fd/file passes that consideration on to the user,
> which might be more flexible. We could always warn about additional
> limitations if that fd happens to be based on the target fs.

<nod> A second advantage of the struct file/kernel_{read,write} approach
is that if we ever decide to let userspace pass in a fd, it's trivial
to feed that struct file to the kernel io routines instead of a memfd
one.

> > > I'm not familiar with memfd. The manpage suggests it's ram backed, is it
> > > swappable or something?
> > 
> > It's supposed to be.  The quick test I ran (allocate a memfd, write 1GB
> > of junk to it on a VM with 400M of RAM) seemed to push about 980MB into
> > the swap file.
> > 
> 
> Ok.
> 
> > > If so, that sounds a reasonable option provided the swap space
> > > requirement can be made clear to users
> > 
> > We can document it.  I don't think it's any worse than xfs_repair being
> > able to use up all the memory + swap... and since we're probably only
> > going to be repairing one thing at a time, most likely scrub won't need
> > as much memory.
> > 
> 
> Right, but as noted below, my concerns with the xfs_repair comparison
> are that 1.) the kernel generally has more of a limit on anonymous
> memory allocations than userspace (i.e., not swappable AFAIU?) and 2.)
> it's not clear how effectively running the system out of memory via the
> kernel will behave from a failure perspective.
> 
> IOW, xfs_repair can run the system out of memory but for the most part
> that ends up being a simple problem for the system: OOM kill the bloated
> xfs_repair process. For an online repair in a similar situation, I have
> no idea what's going to happen.

Back in the days of the huge linked lists the oom killer would target
other processes because it doesn't know that the online repair thread is
sitting on a ton of pinned kernel memory...

> The hope is that the online repair hits -ENOMEM and unwinds, but ISTM
> we'd still be at risk of other subsystems running into memory
> allocation problems, filling up swap, the OOM killer going after
> unrelated processes, etc.  What if, for example, the OOM killer starts
> picking off processes in service to a running online repair that
> immediately consumes freed up memory until the system is borked?

Yeah.  One thing we /could/ do is register an oom notifier that would
urge any running repair threads to bail out if they can.  It seems to me
that the oom killer blocks on the oom_notify_list chain, so our handler
could wait until at least one thread exits before returning.

> I don't know how likely that is or if it really ends up much different
> from the analogous xfs_repair situation. My only point right now is
> that failure scenario is something we should explore for any solution
> we ultimately consider because it may be an unexpected use case of the
> underlying mechanism.

Ideally, online repair would always be the victim since we know we have
a reasonable fallback.  At least for memfd, however, I think the only
clues we have to decide the question "is this memfd getting in the way
of other threads?" is either seeing ENOMEM, short writes, or getting
kicked by an oom notification.  Maybe that'll be enough?

> (To the contrary, just using a cached file seems a natural fit from
> that perspective.)

Same here.

> > > and the failure characteristics aren't more severe than for userspace.
> > > An online repair that puts the broader system at risk of OOM as
> > > opposed to predictably failing gracefully may not be the most useful
> > > tool.
> > 
> > Agreed.  One huge downside of memfd seems to be the lack of a mechanism
> > for the vm to push back on us if we successfully write all we need to
> > the memfd but then other processes need some memory.  Obviously, if the
> > memfd write itself comes up short or fails then we dump the memfd and
> > error back to userspace.  We might simply have to free array memory
> > while we iterate the records to minimize the time spent at peak memory
> > usage.
> > 
> 
> Hm, yeah. Some kind of fixed/relative size in-core memory pool approach
> may simplify things because we could allocate it up front and know right
> away whether we just don't have enough memory available to repair.

Hmm.  Apparently we actually /can/ call fallocate on memfd to grab all
the pages at once, provided we have some guesstimate beforehand of how
much space we think we'll need.

So long as my earlier statement about the memory requirements being no
more than the size of the btree leaves is actually true (I haven't
rigorously tried to prove it), we need about (xrep_calc_ag_resblks() *
blocksize) worth of space in the memfd file.  Maybe we ask for 1.5x
that and if we don't get it, we kill the memfd and exit.

--D

> 
> Brian
> 
> > --D
> > 
> > > 
> > > Brian
> > > 
> > > > --D
> > > > 
> > > > > Brian
> > > > > 
> > > > > > --D
> > > > > > 
> > > > > > > Brian
> > > > > > > 
> > > > > > > > --D
> > > > > > > > 
> > > > > > > > > Brian
> > > > > > > > > 
> > > > > > > > > > > > +
> > > > > > > > > > > > +done:
> > > > > > > > > > > > +	/* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */
> > > > > > > > > > > > +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> > > > > > > > > > > > +	return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo,
> > > > > > > > > > > > +			XFS_AG_RESV_NONE);
> > > > > > > > > > > > +}
> > > > > > > > > > > > +
> > > > > > > > > > > ...
> > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > > index 0ed68379e551..82f99633a597 100644
> > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp(
> > > > > > > > > > > >  		diff = b1->bno - b2->bno;
> > > > > > > > > > > >  	return diff;
> > > > > > > > > > > >  }
> > > > > > > > > > > > +
> > > > > > > > > > > > +/* Are there any busy extents in this AG? */
> > > > > > > > > > > > +bool
> > > > > > > > > > > > +xfs_extent_busy_list_empty(
> > > > > > > > > > > > +	struct xfs_perag	*pag)
> > > > > > > > > > > > +{
> > > > > > > > > > > > +	spin_lock(&pag->pagb_lock);
> > > > > > > > > > > > +	if (pag->pagb_tree.rb_node) {
> > > > > > > > > > > 
> > > > > > > > > > > RB_EMPTY_ROOT()?
> > > > > > > > > > 
> > > > > > > > > > Good suggestion, thank you!
> > > > > > > > > > 
> > > > > > > > > > --D
> > > > > > > > > > 
> > > > > > > > > > > Brian
> > > > > > > > > > > 
> > > > > > > > > > > > +		spin_unlock(&pag->pagb_lock);
> > > > > > > > > > > > +		return false;
> > > > > > > > > > > > +	}
> > > > > > > > > > > > +	spin_unlock(&pag->pagb_lock);
> > > > > > > > > > > > +	return true;
> > > > > > > > > > > > +}
> > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > > index 990ab3891971..2f8c73c712c6 100644
> > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
> > > > > > > > > > > >  	list_sort(NULL, list, xfs_extent_busy_ag_cmp);
> > > > > > > > > > > >  }
> > > > > > > > > > > >  
> > > > > > > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
> > > > > > > > > > > > +
> > > > > > > > > > > >  #endif /* __XFS_EXTENT_BUSY_H__ */
> > > > > > > > > > > > 
> > > > > > > > > > > > --
> > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > --
> > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > --
> > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > --
> > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > --
> > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > --
> > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > --
> > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > the body of a message to majordomo@vger.kernel.org
> > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > --
> > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > the body of a message to majordomo@vger.kernel.org
> > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Brian Foster Aug. 9, 2018, noon UTC | #13
On Wed, Aug 08, 2018 at 03:42:32PM -0700, Darrick J. Wong wrote:
> On Wed, Aug 08, 2018 at 08:29:54AM -0400, Brian Foster wrote:
> > On Tue, Aug 07, 2018 at 04:34:58PM -0700, Darrick J. Wong wrote:
> > > On Fri, Aug 03, 2018 at 06:49:40AM -0400, Brian Foster wrote:
> > > > On Thu, Aug 02, 2018 at 12:22:05PM -0700, Darrick J. Wong wrote:
> > > > > On Thu, Aug 02, 2018 at 09:48:24AM -0400, Brian Foster wrote:
> > > > > > On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote:
> > > > > > > On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote:
> > > > > > > > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote:
> > > > > > > > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote:
> > > > > > > > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote:
> > > > > > > > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote:
> > > > > > > > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote:
...
> > > > > So, that early draft spent a lot of time tearing down and reconstructing
> > > > > rmapbt cursors since the standard _btree_query_all isn't suited to that
> > > > > kind of usage.  It was easily twice as slow on a RAM-backed disk just
> > > > > from the rmap cursor overhead and much more complex, so I rewrote it to
> > > > > be simpler.  I also have a slight preference for not touching anything
> > > > > until we're absolutely sure we have all the data we need to repair the
> > > > > structure.
> > > > > 
> > > > 
> > > > Yes, I think that is sane in principle. I'm just a bit concerned about
> > > > how reliable that xfs_repair-like approach will be in the kernel longer
> > > > term, particularly once we start having to deal with large filesystems
> > > > and limited or contended memory, etc. We already have xfs_repair users
> > > > that need to tweak settings because there isn't enough memory available
> > > > to repair the fs. Granted that is for fs-wide repairs and the flipside
> > > > is that we know a single AG can only be up to 1TB. It's certainly
> > > > possible that putting some persistent backing behind the in-core data is
> > > > enough to resolve the problem (and the current approach is certainly
> > > > reasonable enough to me for the initial implementation).
> > > > 
> > > > bjoin limitations aside, I wonder if a cursor roll mechanism that held
> > > > all of the cursor buffers, rolled the transaction and then rejoined all
> > > > said buffers would help us get around that. (Not sure I follow the early
> > > > prototype behavior, but it sounds like we had to restart the rmapbt
> > > > lookup over and over...).
> > > 
> > > Correct.
> > > 
> > > > Another caveat with that approach may be that I think we'd need to be
> > > > sure that the reconstruction operation doesn't ever need to update the
> > > > rmapbt while we're mid walk of the latter.
> > > 
> > > <nod> Looking even farther back in my notes, that was also an issue --
> > > fixing the free list causes blocks to go on or off the agfl, which
> > > causes rmapbt updates, which meant that the only way I could get
> > > in-place updates to work was to re-lookup where we were in the btree and
> > > also try to deal with any rmapbt entries that might have crept in as
> > > result of the record insertion.
> > > 
> > > Getting the concurrency right for each repair function looked like a
> > > difficult problem to solve, but amassing all the records elsewhere and
> > > rebuilding was easy to understand.
> > > 
> > 
> > Yeah. This all points to this kind of strategy being too complex to be
> > worth the prospective benefits in the short term. Clearly we have
> > several, potentially tricky roadblocks to work through before this can
> > be made feasible. Thanks for the background, it's still useful to have
> > this context to compare with whatever we may have to do to support a
> > reclaimable memory approach.
> 
> <nod>  Reclaimable memfd "memory" isn't too difficult, we can call
> kernel_read and kernel_write, though lockdep gets pretty mad about xfs
> taking sb_start_write (on the memfd filesystem) at the same time it has
> sb_start_write on the xfs (not to mention the stack usage) so I had to
> throw in the extra twist of delegating the actual file io to a workqueue
> item (a la xfs_btree_split).
> 

Ok, I'm more curious what the surrounding code looks like around
managing the underlying file pages. Now that I think of it, the primary
usage was to dump everything into the file and read it back
sequentially, so perhaps this really isn't that difficult to deal with
since the file content is presumably fixed size data structures. (Hmm,
was there a sort in there somewhere as well?).

> > > > That may be an issue for inode btree reconstruction, for example,
> > > > since it looks like inobt block allocation requires rmapbt updates.
> > > > We'd probably need some way to share (or invalidate) a cursor across
> > > > different contexts to deal with that.
> > > 
> > > I might pursue that strategy if we ever hit the point where we can't
> > > find space to store the records (see below).  Another option could be to
> > > divert all deferred items for an AG, build a replacement btree in new
> > > space, then finish all the deferred items... but that's starting to get
> > > into offlineable AGs, which is its own project that I want to tackle
> > > later.
> > > 
> > > (Not that much later, just not this cycle.)
> > > 
> > 
> > *nod*
> > 
> > > > > For other repair functions (like the data/attr fork repairs) we have to
> > > > > scan all the rmapbts for extents, and I'd prefer to lock those AGs only
> > > > > for as long as necessary to extract the extents we want.
> > > > > 
> > > > > > The obvious problem is that we still have some checks that allow the
> > > > > > whole repair operation to bail out before we determine whether we can
> > > > > > start to rebuild the on-disk btrees. These are things like making sure
> > > > > > we can actually read the associated rmapbt blocks (i.e., no read errors
> > > > > > or verifier failures), basic record sanity checks, etc. But ISTM that
> > > > > > isn't anything we couldn't get around with a multi-pass implementation.
> > > > > > Secondary issues might be things like no longer being able to easily
> > > > > > insert the longest free extent range(s) first (meaning we'd have to
> > > > > > stuff the agfl with old btree blocks or figure out some other approach).
> > > > > 
> > > > > Well, you could scan the rmapbt twice -- once to find the longest
> > > > > record, then again to do the actual insertion.
> > > > > 
> > > > 
> > > > Yep, that's what I meant by multi-pass.
> > > > 
> > > > > > BTW, isn't the typical scrub sequence already multi-pass by virtue of
> > > > > > the xfs_scrub_metadata() implementation? I'm wondering if the ->scrub()
> > > > > > callout could not only detect corruption, but validate whether repair
> > > > > > (if requested) is possible based on the kind of checks that are
> > > > > > currently in the repair side rmapbt walkers. Thoughts?
> > > > > 
> > > > > Yes, scrub basically validates that for us now, with the notable
> > > > > exception of the notorious rmapbt scrubber, which doesn't
> > > > > cross-reference with inode block mappings because that would be a
> > > > > locking nightmare.
> > > > > 
> > > > > > Are there future
> > > > > > changes that are better supported by an in-core tracking structure in
> > > > > > general (assuming we'll eventually replace the linked lists with
> > > > > > something more efficient) as opposed to attempting to optimize out the
> > > > > > need for that tracking at all?
> > > > > 
> > > > > Well, I was thinking that we could just allocate a memfd (or a file on
> > > > > the same xfs once we have AG offlining) and store the records in there.
> > > > > That saves us the list_head overhead and potentially enables access to a
> > > > > lot more storage than pinning things in RAM.
> > > > > 
> > > > 
> > > > Would using the same fs mean we have to store the repair data in a
> > > > separate AG, or somehow locate/use free space in the target AG?
> > > 
> > > As part of building an "offline AG" feature we'd presumably have to
> > > teach the allocators to avoid the offline AGs for allocations, which
> > > would make it so that we could host the repair data files in the same
> > > XFS that's being fixed.  That seems a little risky to me, but the disk
> > > is probably larger than mem+swap.
> > > 
> > 
> > Got it, so we'd use the remaining space in the fs outside of the target
> > AG. ISTM that still presumes the rest of the fs is coherent, but I
> > suppose the offline AG thing helps us with that. We'd just have to make
> > sure we've shut down all currently corrupted AGs before we start to
> > repair a particular corrupted one, and then hope there's still enough
> > free space in the fs to proceed.
> 
> That's a pretty big hope. :)  I think for now 
> 
> > That makes more sense, but I still agree that it seems risky in general.
> > Technical risk aside, there's also usability concerns in that the local
> > free space requirement is another bit of non-determinism
> 
> I don't think it's non-deterministic, it's just hard for the filesystem
> to communicate to the user/admin ahead of time.  Roughly speaking, we
> need to have about as much disk space for the new btree as we had
> allocated for the old one.
> 

Right, maybe non-deterministic is not the best term. What I mean is that
it's not clear to the user why a particular filesystem may not be able
to run a repair (e.g., if it has plenty of reported free space but
enough AGs may be shut down due to corruption). So in certain scenarios
an unrelated corruption or particular ordering of AG repairs could be
the difference between whether an online repair succeeds or defers to
offline repair on the otherwise same filesystem. 

> As far as memory requirements go, in last week's revising of the patches
> I compressed the in-memory record structs down about as far as possible;
> with the removal of the list heads, the memory requirements drop by
> 30-60%.  We require the same amount of memory as would be needed to
> store all of the records in the leaf nodes, and no more, and we can use
> swap space to do it.
> 

Nice. When looking at the existing structures it looked like a worst
case (1TB AG, every other 1k block allocated) could require up to
10-12GB RAM (but I could have easily messed that up). That's not insane
on its own, it's just the question of allocating that much memory in the
kernel. Slimming that down and pushing it into something swappable
doesn't _sound_ too overbearing. I'm not really sure what default distro
swap sizes are these days (some % of RAM?), but it shouldn't be that
hard to find ~10GB of disk space somewhere to facilitate a repair.

> > around the ability to online repair vs. having to punt to xfs_repair,
> > or if the repair consumes whatever free space remains in the fs to the
> > detriment of whatever workload the user presumably wanted to keep the
> > fs online for, etc.
> 
> I've occasionally thought that future xfs_scrub could ask the kernel to
> estimate how much disk and memory it will need for the repair (and
> whether the disk space requirement is fs-scope or AG-scope); then it
> could forego a repair action and recommend xfs_repair if running the
> online repair would take the system below some configurable threshold.
> 

I think something like that would improve usability once we nail down
the core mechanism.

> > > > presume either way we'd have to ensure that AG is either consistent or
> > > > locked out from outside I/O. If we have the total record count we can
> > > 
> > > We usually don't, but for the btrees that have their own record/blocks
> > > counters we might be able to guess a number, fallocate it, and see if
> > > that doesn't ENOSPC.
> > > 
> > > > preallocate the file and hope there is no such other free space
> > > > corruption or something that would allow some other task to mess with
> > > > our blocks. I'm a little skeptical overall on relying on a corrupted
> > > > filesystem to store repair data, but perhaps there are ways to mitigate
> > > > the risks.
> > > 
> > > Store it elsewhere?  /home for root repairs, /root for any other
> > > repair... though if we're going to do that, why not just add a swap file
> > > temporarily?
> > > 
> > 
> > Indeed. The thought crossed my mind about whether we could do something
> > like have an internal/isolated swap file for dedicated XFS allocations
> > to avoid contention with the traditional swap.
> 
> Heh, I think e2fsck has some feature like that where you can pass it a
> swap file.  No idea how much good that does on modern systems where
> there's one huge partition... :)
> 

Interesting. Couldn't you always create an additional swap file, run the
repair then kill it off when it's no longer needed?

> > Userspace could somehow set it up or communicate to the kernel. I have
> > no idea how realistic that is though or if there's a better interface
> > for that kind of thing (i.e., file backed kmem cache?).
> 
> I looked, and there aren't any other mechanisms for unpinned kernel
> memory allocations.
> 

Ok, it looks like swap or traditional files it is then. ;P

> > What _seems_ beneficial about that approach is we get (potentially
> > external) persistent backing and memory reclaim ability with the
> > traditional memory allocation model.
> >
> > ISTM that if we used a regular file, we'd need to deal with the
> > traditional file interface somehow or another (file read/pagecache
> > lookup -> record ??).
> 
> Yes, that's all neatly wrapped up in kernel_read() and kernel_write() so
> all we need is a (struct file *).
> 
> > We could repurpose some existing mechanism like the directory code or
> > quota inode mechanism to use xfs buffers for that purpose, but I think
> > that would require us to always use an internal inode. Allowing
> > userspace to pass an fd/file passes that consideration on to the user,
> > which might be more flexible. We could always warn about additional
> > limitations if that fd happens to be based on the target fs.
> 
> <nod> A second advantage of the struct file/kernel_{read,write} approach
> is that if we ever decide to let userspace pass in a fd, it's trivial
> to feed that struct file to the kernel io routines instead of a memfd
> one.
> 

Yeah, I like this flexibility. In fact, I'm wondering why we wouldn't do
something like this anyways. Could/should xfs_scrub be responsible for
allocating a memfd and passing along the fd? Another advantage of doing
that is whatever logic we may need to clean up old repair files or
whatever is pushed to userspace.

> > > > I'm not familiar with memfd. The manpage suggests it's ram backed, is it
> > > > swappable or something?
> > > 
> > > It's supposed to be.  The quick test I ran (allocate a memfd, write 1GB
> > > of junk to it on a VM with 400M of RAM) seemed to push about 980MB into
> > > the swap file.
> > > 
> > 
> > Ok.
> > 
> > > > If so, that sounds a reasonable option provided the swap space
> > > > requirement can be made clear to users
> > > 
> > > We can document it.  I don't think it's any worse than xfs_repair being
> > > able to use up all the memory + swap... and since we're probably only
> > > going to be repairing one thing at a time, most likely scrub won't need
> > > as much memory.
> > > 
> > 
> > Right, but as noted below, my concerns with the xfs_repair comparison
> > are that 1.) the kernel generally has more of a limit on anonymous
> > memory allocations than userspace (i.e., not swappable AFAIU?) and 2.)
> > it's not clear how effectively running the system out of memory via the
> > kernel will behave from a failure perspective.
> > 
> > IOW, xfs_repair can run the system out of memory but for the most part
> > that ends up being a simple problem for the system: OOM kill the bloated
> > xfs_repair process. For an online repair in a similar situation, I have
> > no idea what's going to happen.
> 
> Back in the days of the huge linked lists the oom killer would target
> other processes because it doesn't know that the online repair thread is
> sitting on a ton of pinned kernel memory...
> 

Makes sense, kind of what I'd expect...

> > The hope is that the online repair hits -ENOMEM and unwinds, but ISTM
> > we'd still be at risk of other subsystems running into memory
> > allocation problems, filling up swap, the OOM killer going after
> > unrelated processes, etc.  What if, for example, the OOM killer starts
> > picking off processes in service to a running online repair that
> > immediately consumes freed up memory until the system is borked?
> 
> Yeah.  One thing we /could/ do is register an oom notifier that would
> urge any running repair threads to bail out if they can.  It seems to me
> that the oom killer blocks on the oom_notify_list chain, so our handler
> could wait until at least one thread exits before returning.
> 

Ok, something like that could be useful. I agree that we probably don't
need to go that far until the mechanism is nailed down and testing shows
that OOM is a problem.

> > I don't know how likely that is or if it really ends up much different
> > from the analogous xfs_repair situation. My only point right now is
> > that failure scenario is something we should explore for any solution
> > we ultimately consider because it may be an unexpected use case of the
> > underlying mechanism.
> 
> Ideally, online repair would always be the victim since we know we have
> a reasonable fallback.  At least for memfd, however, I think the only
> clues we have to decide the question "is this memfd getting in the way
> of other threads?" is either seeing ENOMEM, short writes, or getting
> kicked by an oom notification.  Maybe that'll be enough?
> 

Hm, yeah. It may be challenging to track memfd usage as such. If
userspace has access to the fd on an OOM notification or whatever, it
might be able to do more accurate analysis based on an fstat() or
something.

Related question... is the online repair sequence currently
interruptible, if xfs_scrub receives a fatal signal while pulling in
entries during an allocbt scan for example?

> > (To the contrary, just using a cached file seems a natural fit from
> > that perspective.)
> 
> Same here.
> 
> > > > and the failure characteristics aren't more severe than for userspace.
> > > > An online repair that puts the broader system at risk of OOM as
> > > > opposed to predictably failing gracefully may not be the most useful
> > > > tool.
> > > 
> > > Agreed.  One huge downside of memfd seems to be the lack of a mechanism
> > > for the vm to push back on us if we successfully write all we need to
> > > the memfd but then other processes need some memory.  Obviously, if the
> > > memfd write itself comes up short or fails then we dump the memfd and
> > > error back to userspace.  We might simply have to free array memory
> > > while we iterate the records to minimize the time spent at peak memory
> > > usage.
> > > 
> > 
> > Hm, yeah. Some kind of fixed/relative size in-core memory pool approach
> > may simplify things because we could allocate it up front and know right
> > away whether we just don't have enough memory available to repair.
> 
> Hmm.  Apparently we actually /can/ call fallocate on memfd to grab all
> the pages at once, provided we have some guesstimate beforehand of how
> much space we think we'll need.
> 
> So long as my earlier statement about the memory requirements being no
> more than the size of the btree leaves is actually true (I haven't
> rigorously tried to prove it), we need about (xrep_calc_ag_resblks() *
> blocksize) worth of space in the memfd file.  Maybe we ask for 1.5x
> that and if we don't get it, we kill the memfd and exit.
> 

Indeed. It would be nice if we could do all of the file management bits
in userspace.

Brian

> --D
> 
> > 
> > Brian
> > 
> > > --D
> > > 
> > > > 
> > > > Brian
> > > > 
> > > > > --D
> > > > > 
> > > > > > Brian
> > > > > > 
> > > > > > > --D
> > > > > > > 
> > > > > > > > Brian
> > > > > > > > 
> > > > > > > > > --D
> > > > > > > > > 
> > > > > > > > > > Brian
> > > > > > > > > > 
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +done:
> > > > > > > > > > > > > +	/* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */
> > > > > > > > > > > > > +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> > > > > > > > > > > > > +	return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo,
> > > > > > > > > > > > > +			XFS_AG_RESV_NONE);
> > > > > > > > > > > > > +}
> > > > > > > > > > > > > +
> > > > > > > > > > > > ...
> > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > > > index 0ed68379e551..82f99633a597 100644
> > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp(
> > > > > > > > > > > > >  		diff = b1->bno - b2->bno;
> > > > > > > > > > > > >  	return diff;
> > > > > > > > > > > > >  }
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +/* Are there any busy extents in this AG? */
> > > > > > > > > > > > > +bool
> > > > > > > > > > > > > +xfs_extent_busy_list_empty(
> > > > > > > > > > > > > +	struct xfs_perag	*pag)
> > > > > > > > > > > > > +{
> > > > > > > > > > > > > +	spin_lock(&pag->pagb_lock);
> > > > > > > > > > > > > +	if (pag->pagb_tree.rb_node) {
> > > > > > > > > > > > 
> > > > > > > > > > > > RB_EMPTY_ROOT()?
> > > > > > > > > > > 
> > > > > > > > > > > Good suggestion, thank you!
> > > > > > > > > > > 
> > > > > > > > > > > --D
> > > > > > > > > > > 
> > > > > > > > > > > > Brian
> > > > > > > > > > > > 
> > > > > > > > > > > > > +		spin_unlock(&pag->pagb_lock);
> > > > > > > > > > > > > +		return false;
> > > > > > > > > > > > > +	}
> > > > > > > > > > > > > +	spin_unlock(&pag->pagb_lock);
> > > > > > > > > > > > > +	return true;
> > > > > > > > > > > > > +}
> > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > > > index 990ab3891971..2f8c73c712c6 100644
> > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
> > > > > > > > > > > > >  	list_sort(NULL, list, xfs_extent_busy_ag_cmp);
> > > > > > > > > > > > >  }
> > > > > > > > > > > > >  
> > > > > > > > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
> > > > > > > > > > > > > +
> > > > > > > > > > > > >  #endif /* __XFS_EXTENT_BUSY_H__ */
> > > > > > > > > > > > > 
> > > > > > > > > > > > > --
> > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > --
> > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > --
> > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > --
> > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > --
> > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > --
> > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > --
> > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > --
> > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > the body of a message to majordomo@vger.kernel.org
> > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > --
> > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > the body of a message to majordomo@vger.kernel.org
> > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Aug. 9, 2018, 3:59 p.m. UTC | #14
On Thu, Aug 09, 2018 at 08:00:28AM -0400, Brian Foster wrote:
> On Wed, Aug 08, 2018 at 03:42:32PM -0700, Darrick J. Wong wrote:
> > On Wed, Aug 08, 2018 at 08:29:54AM -0400, Brian Foster wrote:
> > > On Tue, Aug 07, 2018 at 04:34:58PM -0700, Darrick J. Wong wrote:
> > > > On Fri, Aug 03, 2018 at 06:49:40AM -0400, Brian Foster wrote:
> > > > > On Thu, Aug 02, 2018 at 12:22:05PM -0700, Darrick J. Wong wrote:
> > > > > > On Thu, Aug 02, 2018 at 09:48:24AM -0400, Brian Foster wrote:
> > > > > > > On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote:
> > > > > > > > On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote:
> > > > > > > > > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote:
> > > > > > > > > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote:
> > > > > > > > > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote:
> > > > > > > > > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote:
> > > > > > > > > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote:
> ...
> > > > > > So, that early draft spent a lot of time tearing down and reconstructing
> > > > > > rmapbt cursors since the standard _btree_query_all isn't suited to that
> > > > > > kind of usage.  It was easily twice as slow on a RAM-backed disk just
> > > > > > from the rmap cursor overhead and much more complex, so I rewrote it to
> > > > > > be simpler.  I also have a slight preference for not touching anything
> > > > > > until we're absolutely sure we have all the data we need to repair the
> > > > > > structure.
> > > > > > 
> > > > > 
> > > > > Yes, I think that is sane in principle. I'm just a bit concerned about
> > > > > how reliable that xfs_repair-like approach will be in the kernel longer
> > > > > term, particularly once we start having to deal with large filesystems
> > > > > and limited or contended memory, etc. We already have xfs_repair users
> > > > > that need to tweak settings because there isn't enough memory available
> > > > > to repair the fs. Granted that is for fs-wide repairs and the flipside
> > > > > is that we know a single AG can only be up to 1TB. It's certainly
> > > > > possible that putting some persistent backing behind the in-core data is
> > > > > enough to resolve the problem (and the current approach is certainly
> > > > > reasonable enough to me for the initial implementation).
> > > > > 
> > > > > bjoin limitations aside, I wonder if a cursor roll mechanism that held
> > > > > all of the cursor buffers, rolled the transaction and then rejoined all
> > > > > said buffers would help us get around that. (Not sure I follow the early
> > > > > prototype behavior, but it sounds like we had to restart the rmapbt
> > > > > lookup over and over...).
> > > > 
> > > > Correct.
> > > > 
> > > > > Another caveat with that approach may be that I think we'd need to be
> > > > > sure that the reconstruction operation doesn't ever need to update the
> > > > > rmapbt while we're mid walk of the latter.
> > > > 
> > > > <nod> Looking even farther back in my notes, that was also an issue --
> > > > fixing the free list causes blocks to go on or off the agfl, which
> > > > causes rmapbt updates, which meant that the only way I could get
> > > > in-place updates to work was to re-lookup where we were in the btree and
> > > > also try to deal with any rmapbt entries that might have crept in as
> > > > a result of the record insertion.
> > > > 
> > > > Getting the concurrency right for each repair function looked like a
> > > > difficult problem to solve, but amassing all the records elsewhere and
> > > > rebuilding was easy to understand.
> > > > 
> > > 
> > > Yeah. This all points to this kind of strategy being too complex to be
> > > worth the prospective benefits in the short term. Clearly we have
> > > several, potentially tricky roadblocks to work through before this can
> > > be made feasible. Thanks for the background, it's still useful to have
> > > this context to compare with whatever we may have to do to support a
> > > reclaimable memory approach.
> > 
> > <nod>  Reclaimable memfd "memory" isn't too difficult, we can call
> > kernel_read and kernel_write, though lockdep gets pretty mad about xfs
> > taking sb_start_write (on the memfd filesystem) at the same time it has
> > sb_start_write on the xfs (not to mention the stack usage) so I had to
> > throw in the extra twist of delegating the actual file io to a workqueue
> > item (a la xfs_btree_split).
> > 
> 
> Ok, I'm more curious what the surrounding code looks like around
> managing the underlying file pages. Now that I think of it, the primary
> usage was to dump everything into the file and read it back
> sequentially,

Yep.  Simplified, the code is more or less:

array_init(array)
{
	array->filp = shmem_file_create(...);
}

array_destroy(array)
{
	fput(array->filp);
}

array_set(array, nr, ptr)
{
	kernel_write(array->filp, ptr, array->obj_size, nr * array->obj_size);
}

array_get(array, nr, ptr)
{
	kernel_read(array->filp, ptr, array->obj_size, nr * array->obj_size);
}

That's leaving out all the bookkeeping and other weird details to show
pseudocode versions of the file manipulation calls.

I did end up playing a bit of sleight-of-hand with the file io, however
-- all the io is deferred to a workqueue for the dual purpose of
avoiding stack overflows in the memfd file's io paths and to avoid some
sort of deadlock in the page fault handler of the memfd write.  I didn't
investigate the deadlock too deeply, as solving the first problem seemed
to make the second go away.

> so perhaps this really isn't that difficult to deal with since the
> file content is presumably fixed size data structures.

Correct.  There is one user that needs variable-sized records (the
extended attribute repair) for which I've constructed the 'xblob' data
structure which stores blobs in a second memfd and returns the file
offset of a blob as a magic cookie that is recorded in the (fixed size)
attr keys.

Presumably the future directory rebuilder will use xblob too.

> (Hmm, was there a sort in there somewhere as well?).

Yes.  I spent a couple of days implementing a hybrid quicksort/insertion
sort that won't blow out the call stack.

> > > > > That may be an issue for inode btree reconstruction, for example,
> > > > > since it looks like inobt block allocation requires rmapbt updates.
> > > > > We'd probably need some way to share (or invalidate) a cursor across
> > > > > different contexts to deal with that.
> > > > 
> > > > I might pursue that strategy if we ever hit the point where we can't
> > > > find space to store the records (see below).  Another option could be to
> > > > divert all deferred items for an AG, build a replacement btree in new
> > > > space, then finish all the deferred items... but that's starting to get
> > > > into offlineable AGs, which is its own project that I want to tackle
> > > > later.
> > > > 
> > > > (Not that much later, just not this cycle.)
> > > > 
> > > 
> > > *nod*
> > > 
> > > > > > For other repair functions (like the data/attr fork repairs) we have to
> > > > > > scan all the rmapbts for extents, and I'd prefer to lock those AGs only
> > > > > > for as long as necessary to extract the extents we want.
> > > > > > 
> > > > > > > The obvious problem is that we still have some checks that allow the
> > > > > > > whole repair operation to bail out before we determine whether we can
> > > > > > > start to rebuild the on-disk btrees. These are things like making sure
> > > > > > > we can actually read the associated rmapbt blocks (i.e., no read errors
> > > > > > > or verifier failures), basic record sanity checks, etc. But ISTM that
> > > > > > > isn't anything we couldn't get around with a multi-pass implementation.
> > > > > > > Secondary issues might be things like no longer being able to easily
> > > > > > > insert the longest free extent range(s) first (meaning we'd have to
> > > > > > > stuff the agfl with old btree blocks or figure out some other approach).
> > > > > > 
> > > > > > Well, you could scan the rmapbt twice -- once to find the longest
> > > > > > record, then again to do the actual insertion.
> > > > > > 
> > > > > 
> > > > > Yep, that's what I meant by multi-pass.
> > > > > 
> > > > > > > BTW, isn't the typical scrub sequence already multi-pass by virtue of
> > > > > > > the xfs_scrub_metadata() implementation? I'm wondering if the ->scrub()
> > > > > > > callout could not only detect corruption, but validate whether repair
> > > > > > > (if requested) is possible based on the kind of checks that are
> > > > > > > > currently in the repair side rmapbt walkers. Thoughts?
> > > > > > 
> > > > > > Yes, scrub basically validates that for us now, with the notable
> > > > > > exception of the notorious rmapbt scrubber, which doesn't
> > > > > > cross-reference with inode block mappings because that would be a
> > > > > > locking nightmare.
> > > > > > 
> > > > > > > Are there future
> > > > > > > changes that are better supported by an in-core tracking structure in
> > > > > > > general (assuming we'll eventually replace the linked lists with
> > > > > > > something more efficient) as opposed to attempting to optimize out the
> > > > > > > need for that tracking at all?
> > > > > > 
> > > > > > Well, I was thinking that we could just allocate a memfd (or a file on
> > > > > > the same xfs once we have AG offlining) and store the records in there.
> > > > > > That saves us the list_head overhead and potentially enables access to a
> > > > > > lot more storage than pinning things in RAM.
> > > > > > 
> > > > > 
> > > > > Would using the same fs mean we have to store the repair data in a
> > > > > separate AG, or somehow locate/use free space in the target AG?
> > > > 
> > > > As part of building an "offline AG" feature we'd presumably have to
> > > > teach the allocators to avoid the offline AGs for allocations, which
> > > > would make it so that we could host the repair data files in the same
> > > > XFS that's being fixed.  That seems a little risky to me, but the disk
> > > > is probably larger than mem+swap.
> > > > 
> > > 
> > > Got it, so we'd use the remaining space in the fs outside of the target
> > > AG. ISTM that still presumes the rest of the fs is coherent, but I
> > > suppose the offline AG thing helps us with that. We'd just have to make
> > > sure we've shut down all currently corrupted AGs before we start to
> > > repair a particular corrupted one, and then hope there's still enough
> > > free space in the fs to proceed.
> > 
> > That's a pretty big hope. :)  I think for now 
> > 
> > > That makes more sense, but I still agree that it seems risky in general.
> > > Technical risk aside, there's also usability concerns in that the local
> > > free space requirement is another bit of non-determinism
> > 
> > I don't think it's non-deterministic, it's just hard for the filesystem
> > to communicate to the user/admin ahead of time.  Roughly speaking, we
> > need to have about as much disk space for the new btree as we had
> > allocated for the old one.
> > 
> 
> Right, maybe non-deterministic is not the best term. What I mean is that
> it's not clear to the user why a particular filesystem may not be able
> to run a repair (e.g., if it has plenty of reported free space but
> enough AGs may be shut down due to corruption). So in certain scenarios
> an unrelated corruption or particular ordering of AG repairs could be
> the difference between whether an online repair succeeds or defers to
> offline repair on the otherwise same filesystem. 

<nod>

> > As far as memory requirements go, in last week's revising of the patches
> > I compressed the in-memory record structs down about as far as possible;
> > with the removal of the list heads, the memory requirements drop by
> > 30-60%.  We require the same amount of memory as would be needed to
> > store all of the records in the leaf nodes, and no more, and we can use
> > swap space to do it.
> > 
> 
> Nice. When looking at the existing structures it looked like a worst
> case (1TB AG, every other 1k block allocated) could require up to
> 10-12GB RAM (but I could have easily messed that up).

Sounds about right.  1TB AG = 268 million 4k blocks

bnobt: 8-byte records, or ~2.2GB of memory
inobt: 16-byte records, or ~4.3GB of memory
refcountbt: 12-byte records, or ~3.2GB of memory
rmapbt: 24-byte records, or ~6.4GB of memory

Multiply by 4 for a 1k block filesystem, divide by 16 for a 64k block fs.

Note that if the AG is full and heavily shared then the rmapbt
requirements can exceed that, but that's a known property of rmap in
general.

> That's not insane on its own, it's just the question of allocating
> that much memory in the kernel. Slimming that down and pushing it into
> something swappable doesn't _sound_ too overbearing. I'm not really
> sure what default distro swap sizes are these days (some % of RAM?),

I think so?  I think RH/Centos/OL default to the size of RAM + 2GB
nowadays, and Ubuntu seems to do RAM+sqrt(RAM)?

> but it shouldn't be that hard to find ~10GB of disk space somewhere to
> facilitate a repair.
>
> > > around the ability to online repair vs. having to punt to xfs_repair,
> > > or if the repair consumes whatever free space remains in the fs to the
> > > detriment of whatever workload the user presumably wanted to keep the
> > > fs online for, etc.
> > 
> > I've occasionally thought that future xfs_scrub could ask the kernel to
> > estimate how much disk and memory it will need for the repair (and
> > whether the disk space requirement is fs-scope or AG-scope); then it
> > could forego a repair action and recommend xfs_repair if running the
> > online repair would take the system below some configurable threshold.
> > 
> 
> I think something like that would improve usability once we nail down
> the core mechanism.

Ok, I'll put it on my list of things to do.

> > > > > presume either way we'd have to ensure that AG is either consistent or
> > > > > locked out from outside I/O. If we have the total record count we can
> > > > 
> > > > We usually don't, but for the btrees that have their own record/blocks
> > > > counters we might be able to guess a number, fallocate it, and see if
> > > > that doesn't ENOSPC.
> > > > 
> > > > > preallocate the file and hope there is no such other free space
> > > > > corruption or something that would allow some other task to mess with
> > > > > our blocks. I'm a little skeptical overall on relying on a corrupted
> > > > > filesystem to store repair data, but perhaps there are ways to mitigate
> > > > > the risks.
> > > > 
> > > > Store it elsewhere?  /home for root repairs, /root for any other
> > > > repair... though if we're going to do that, why not just add a swap file
> > > > temporarily?
> > > > 
> > > 
> > > Indeed. The thought crossed my mind about whether we could do something
> > > like have an internal/isolated swap file for dedicated XFS allocations
> > > to avoid contention with the traditional swap.
> > 
> > Heh, I think e2fsck has some feature like that where you can pass it a
> > swap file.  No idea how much good that does on modern systems where
> > there's one huge partition... :)
> > 
> 
> Interesting. Couldn't you always create an additional swap file, run the
> repair then kill it off when it's no longer needed?

Yes, though as I think you said in an earlier reply, it would be nice to
have our own private swap file instead of risking some other process
taking it.

> > > Userspace could somehow set it up or communicate to the kernel. I have
> > > no idea how realistic that is though or if there's a better interface
> > > for that kind of thing (i.e., file backed kmem cache?).
> > 
> > I looked, and there aren't any other mechanisms for unpinned kernel
> > memory allocations.
> > 
> 
> Ok, it looks like swap or traditional files it is then. ;P
> 
> > > What _seems_ beneficial about that approach is we get (potentially
> > > external) persistent backing and memory reclaim ability with the
> > > traditional memory allocation model.
> > >
> > > ISTM that if we used a regular file, we'd need to deal with the
> > > traditional file interface somehow or another (file read/pagecache
> > > lookup -> record ??).
> > 
> > Yes, that's all neatly wrapped up in kernel_read() and kernel_write() so
> > all we need is a (struct file *).
> > 
> > > We could repurpose some existing mechanism like the directory code or
> > > quota inode mechanism to use xfs buffers for that purpose, but I think
> > > that would require us to always use an internal inode. Allowing
> > > userspace to pass an fd/file passes that consideration on to the user,
> > > which might be more flexible. We could always warn about additional
> > > limitations if that fd happens to be based on the target fs.
> > 
> > <nod> A second advantage of the struct file/kernel_{read,write} approach
> > is that if we ever decide to let userspace pass in a fd, it's trivial
> > to feed that struct file to the kernel io routines instead of a memfd
> > one.
> > 
> 
> Yeah, I like this flexibility. In fact, I'm wondering why we wouldn't do
> something like this anyways. Could/should xfs_scrub be responsible for
> allocating a memfd and passing along the fd? Another advantage of doing
> that is whatever logic we may need to clean up old repair files or
> whatever is pushed to userspace.

There are two ways we could do this -- one is to have the kernel manage
the memfd creation internally (like my patches do now); the other is for
xfs_scrub to pass in creat(O_TMPFILE).

When repair fputs the file (or fdputs the fd if we switch to using
that), the kernel will perform the usual deletion of the zero-linkcount
zero-refcount file.  We get all the "cleanup" for free by closing the
file.

One other potential complication is that a couple of the repair
functions need two memfds.  The extended attribute repair creates a
fixed-record array for attr keys and an xblob to hold names and values;
each structure gets its own memfd.  The refcount repair creates two
fixed-record arrays, one for refcount records and another to act as a
stack of rmaps to compute reference counts.

(In theory the xbitmap could also be converted to use the fixed record
array, but in practice they haven't (yet) become large enough to warrant
it, and there's currently no way to insert or delete records from the
middle of the array.)

> > > > > I'm not familiar with memfd. The manpage suggests it's ram backed, is it
> > > > > swappable or something?
> > > > 
> > > > It's supposed to be.  The quick test I ran (allocate a memfd, write 1GB
> > > > of junk to it on a VM with 400M of RAM) seemed to push about 980MB into
> > > > the swap file.
> > > > 
> > > 
> > > Ok.
> > > 
> > > > > If so, that sounds a reasonable option provided the swap space
> > > > > requirement can be made clear to users
> > > > 
> > > > We can document it.  I don't think it's any worse than xfs_repair being
> > > > able to use up all the memory + swap... and since we're probably only
> > > > going to be repairing one thing at a time, most likely scrub won't need
> > > > as much memory.
> > > > 
> > > 
> > > Right, but as noted below, my concerns with the xfs_repair comparison
> > > are that 1.) the kernel generally has more of a limit on anonymous
> > > memory allocations than userspace (i.e., not swappable AFAIU?) and 2.)
> > > it's not clear how effectively running the system out of memory via the
> > > kernel will behave from a failure perspective.
> > > 
> > > IOW, xfs_repair can run the system out of memory but for the most part
> > > that ends up being a simple problem for the system: OOM kill the bloated
> > > xfs_repair process. For an online repair in a similar situation, I have
> > > no idea what's going to happen.
> > 
> > Back in the days of the huge linked lists the oom killer would target
> > other processes because it doesn't know that the online repair thread is
> > sitting on a ton of pinned kernel memory...
> > 
> 
> Makes sense, kind of what I'd expect...
> 
> > > The hope is that the online repair hits -ENOMEM and unwinds, but ISTM
> > > we'd still be at risk of other subsystems running into memory
> > > allocation problems, filling up swap, the OOM killer going after
> > > unrelated processes, etc.  What if, for example, the OOM killer starts
> > > picking off processes in service to a running online repair that
> > > immediately consumes freed up memory until the system is borked?
> > 
> > Yeah.  One thing we /could/ do is register an oom notifier that would
> > urge any running repair threads to bail out if they can.  It seems to me
> > that the oom killer blocks on the oom_notify_list chain, so our handler
> > could wait until at least one thread exits before returning.
> > 
> 
> Ok, something like that could be useful. I agree that we probably don't
> need to go that far until the mechanism is nailed down and testing shows
> that OOM is a problem.

It already is a problem on my contrived "2TB hardlink/reflink farm fs" +
"400M of RAM and no swap" scenario.  Granted, pretty much every other
xfs utility also blows out on that so I'm not sure how hard I really
need to try...

> > > I don't know how likely that is or if it really ends up much different
> > > from the analogous xfs_repair situation. My only point right now is
> > > that failure scenario is something we should explore for any solution
> > > we ultimately consider because it may be an unexpected use case of the
> > > underlying mechanism.
> > 
> > Ideally, online repair would always be the victim since we know we have
> > a reasonable fallback.  At least for memfd, however, I think the only
> > clues we have to decide the question "is this memfd getting in the way
> > of other threads?" is either seeing ENOMEM, short writes, or getting
> > kicked by an oom notification.  Maybe that'll be enough?
> > 
> 
> Hm, yeah. It may be challenging to track memfd usage as such. If
> userspace has access to the fd on an OOM notification or whatever, it
> might be able to do more accurate analysis based on an fstat() or
> something.
> 
> Related question... is the online repair sequence currently
> interruptible, if xfs_scrub receives a fatal signal while pulling in
> entries during an allocbt scan for example?

It's interruptible (fatal signals only) during the scan phase, but once
it starts logging metadata updates it will run all the way to
completion.

> > > (To the contrary, just using a cached file seems a natural fit from
> > > that perspective.)
> > 
> > Same here.
> > 
> > > > > and the failure characteristics aren't more severe than for userspace.
> > > > > An online repair that puts the broader system at risk of OOM as
> > > > > opposed to predictably failing gracefully may not be the most useful
> > > > > tool.
> > > > 
> > > > Agreed.  One huge downside of memfd seems to be the lack of a mechanism
> > > > for the vm to push back on us if we successfully write all we need to
> > > > the memfd but then other processes need some memory.  Obviously, if the
> > > > memfd write itself comes up short or fails then we dump the memfd and
> > > > error back to userspace.  We might simply have to free array memory
> > > > while we iterate the records to minimize the time spent at peak memory
> > > > usage.
> > > > 
> > > 
> > > Hm, yeah. Some kind of fixed/relative size in-core memory pool approach
> > > may simplify things because we could allocate it up front and know right
> > > away whether we just don't have enough memory available to repair.
> > 
> > Hmm.  Apparently we actually /can/ call fallocate on memfd to grab all
> > the pages at once, provided we have some guesstimate beforehand of how
> > much space we think we'll need.
> > 
> > So long as my earlier statement about the memory requirements being no
> > more than the size of the btree leaves is actually true (I haven't
> > rigorously tried to prove it), we need about (xrep_calc_ag_resblks() *
> > blocksize) worth of space in the memfd file.  Maybe we ask for 1.5x
> > that and if we don't get it, we kill the memfd and exit.
> > 
> 
> Indeed. It would be nice if we could do all of the file management bits
> in userspace.

Agreed, though no file management would be even better. :)

--D

> Brian
> 
> > --D
> > 
> > > 
> > > Brian
> > > 
> > > > --D
> > > > 
> > > > > 
> > > > > Brian
> > > > > 
> > > > > > --D
> > > > > > 
> > > > > > > Brian
> > > > > > > 
> > > > > > > > --D
> > > > > > > > 
> > > > > > > > > Brian
> > > > > > > > > 
> > > > > > > > > > --D
> > > > > > > > > > 
> > > > > > > > > > > Brian
> > > > > > > > > > > 
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +done:
> > > > > > > > > > > > > > +	/* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */
> > > > > > > > > > > > > > +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> > > > > > > > > > > > > > +	return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo,
> > > > > > > > > > > > > > +			XFS_AG_RESV_NONE);
> > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > ...
> > > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > > > > index 0ed68379e551..82f99633a597 100644
> > > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp(
> > > > > > > > > > > > > >  		diff = b1->bno - b2->bno;
> > > > > > > > > > > > > >  	return diff;
> > > > > > > > > > > > > >  }
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +/* Are there any busy extents in this AG? */
> > > > > > > > > > > > > > +bool
> > > > > > > > > > > > > > +xfs_extent_busy_list_empty(
> > > > > > > > > > > > > > +	struct xfs_perag	*pag)
> > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > +	spin_lock(&pag->pagb_lock);
> > > > > > > > > > > > > > +	if (pag->pagb_tree.rb_node) {
> > > > > > > > > > > > > 
> > > > > > > > > > > > > RB_EMPTY_ROOT()?
> > > > > > > > > > > > 
> > > > > > > > > > > > Good suggestion, thank you!
> > > > > > > > > > > > 
> > > > > > > > > > > > --D
> > > > > > > > > > > > 
> > > > > > > > > > > > > Brian
> > > > > > > > > > > > > 
> > > > > > > > > > > > > > +		spin_unlock(&pag->pagb_lock);
> > > > > > > > > > > > > > +		return false;
> > > > > > > > > > > > > > +	}
> > > > > > > > > > > > > > +	spin_unlock(&pag->pagb_lock);
> > > > > > > > > > > > > > +	return true;
> > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > > > > index 990ab3891971..2f8c73c712c6 100644
> > > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
> > > > > > > > > > > > > >  	list_sort(NULL, list, xfs_extent_busy_ag_cmp);
> > > > > > > > > > > > > >  }
> > > > > > > > > > > > > >  
> > > > > > > > > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > >  #endif /* __XFS_EXTENT_BUSY_H__ */
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > --
> > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > > --
> > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > --
> > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > --
> > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > --
> > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > --
> > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > --
> > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > --
> > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > --
> > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > the body of a message to majordomo@vger.kernel.org
> > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > --
> > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > the body of a message to majordomo@vger.kernel.org
> > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Brian Foster Aug. 10, 2018, 10:33 a.m. UTC | #15
On Thu, Aug 09, 2018 at 08:59:59AM -0700, Darrick J. Wong wrote:
> On Thu, Aug 09, 2018 at 08:00:28AM -0400, Brian Foster wrote:
> > On Wed, Aug 08, 2018 at 03:42:32PM -0700, Darrick J. Wong wrote:
> > > On Wed, Aug 08, 2018 at 08:29:54AM -0400, Brian Foster wrote:
> > > > On Tue, Aug 07, 2018 at 04:34:58PM -0700, Darrick J. Wong wrote:
> > > > > On Fri, Aug 03, 2018 at 06:49:40AM -0400, Brian Foster wrote:
> > > > > > On Thu, Aug 02, 2018 at 12:22:05PM -0700, Darrick J. Wong wrote:
> > > > > > > On Thu, Aug 02, 2018 at 09:48:24AM -0400, Brian Foster wrote:
> > > > > > > > On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote:
> > > > > > > > > On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote:
> > > > > > > > > > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote:
> > > > > > > > > > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote:
> > > > > > > > > > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote:
> > > > > > > > > > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote:
> > > > > > > > > > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote:
...
> > > > What _seems_ beneficial about that approach is we get (potentially
> > > > external) persistent backing and memory reclaim ability with the
> > > > traditional memory allocation model.
> > > >
> > > > ISTM that if we used a regular file, we'd need to deal with the
> > > > traditional file interface somehow or another (file read/pagecache
> > > > lookup -> record ??).
> > > 
> > > Yes, that's all neatly wrapped up in kernel_read() and kernel_write() so
> > > all we need is a (struct file *).
> > > 
> > > > We could repurpose some existing mechanism like the directory code or
> > > > quota inode mechanism to use xfs buffers for that purpose, but I think
> > > > that would require us to always use an internal inode. Allowing
> > > > userspace to pass an fd/file passes that consideration on to the user,
> > > > which might be more flexible. We could always warn about additional
> > > > limitations if that fd happens to be based on the target fs.
> > > 
> > > <nod> A second advantage of the struct file/kernel_{read,write} approach
> > > is that if we ever decide to let userspace pass in a fd, it's trivial
> > > to feed that struct file to the kernel io routines instead of a memfd
> > > one.
> > > 
> > 
> > Yeah, I like this flexibility. In fact, I'm wondering why we wouldn't do
> > something like this anyways. Could/should xfs_scrub be responsible for
> > allocating a memfd and passing along the fd? Another advantage of doing
> > that is whatever logic we may need to clean up old repair files or
> > whatever is pushed to userspace.
> 
> There are two ways we could do this -- one is to have the kernel manage
> the memfd creation internally (like my patches do now); the other is for
> xfs_scrub to pass in creat(O_TMPFILE).
> 
> When repair fputs the file (or fdputs the fd if we switch to using
> that), the kernel will perform the usual deletion of the zero-linkcount
> zero-refcount file.  We get all the "cleanup" for free by closing the
> file.
> 

Ok. FWIW, the latter approach where xfs_scrub creates a file and passes
the fd along to the kernel seems preferable to me, but perhaps others
have different opinions. We could accept a pathname from the user to
create the file or otherwise attempt to allocate a memfd by default and
pass that along.

> One other potential complication is that a couple of the repair
> functions need two memfds.  The extended attribute repair creates a
> fixed-record array for attr keys and an xblob to hold names and values;
> each structure gets its own memfd.  The refcount repair creates two
> fixed-record arrays, one for refcount records and another to act as a
> stack of rmaps to compute reference counts.
> 

Hmm, I guess there's nothing stopping scrub from passing in two fds.
Maybe it would make more sense for the userspace option to be a path
basename or directory where scrub is allowed to create whatever scratch
files it needs.

That aside, is there any reason the repair mechanism couldn't emulate
multiple files with a single fd via a magic offset delimiter or
something? E.g., "file 1" starts at offset 0, "file 2" starts at offset
1TB, etc. (1TB is probably overkill, but you get the idea..).

Brian

> (In theory the xbitmap could also be converted to use the fixed record
> array, but in practice they haven't (yet) become large enough to warrant
> it, and there's currently no way to insert or delete records from the
> middle of the array.)
> 
> > > > > > I'm not familiar with memfd. The manpage suggests it's ram backed, is it
> > > > > > swappable or something?
> > > > > 
> > > > > It's supposed to be.  The quick test I ran (allocate a memfd, write 1GB
> > > > > of junk to it on a VM with 400M of RAM) seemed to push about 980MB into
> > > > > the swap file.
> > > > > 
> > > > 
> > > > Ok.
> > > > 
> > > > > > If so, that sounds a reasonable option provided the swap space
> > > > > > requirement can be made clear to users
> > > > > 
> > > > > We can document it.  I don't think it's any worse than xfs_repair being
> > > > > able to use up all the memory + swap... and since we're probably only
> > > > > going to be repairing one thing at a time, most likely scrub won't need
> > > > > as much memory.
> > > > > 
> > > > 
> > > > Right, but as noted below, my concerns with the xfs_repair comparison
> > > > are that 1.) the kernel generally has more of a limit on anonymous
> > > > memory allocations than userspace (i.e., not swappable AFAIU?) and 2.)
> > > > it's not clear how effectively running the system out of memory via the
> > > > kernel will behave from a failure perspective.
> > > > 
> > > > IOW, xfs_repair can run the system out of memory but for the most part
> > > > that ends up being a simple problem for the system: OOM kill the bloated
> > > > xfs_repair process. For an online repair in a similar situation, I have
> > > > no idea what's going to happen.
> > > 
> > > Back in the days of the huge linked lists the oom killer would target
> > > other processes because it doesn't know that the online repair thread is
> > > sitting on a ton of pinned kernel memory...
> > > 
> > 
> > Makes sense, kind of what I'd expect...
> > 
> > > > The hope is that the online repair hits -ENOMEM and unwinds, but ISTM
> > > > we'd still be at risk of other subsystems running into memory
> > > > allocation problems, filling up swap, the OOM killer going after
> > > > unrelated processes, etc.  What if, for example, the OOM killer starts
> > > > picking off processes in service to a running online repair that
> > > > immediately consumes freed up memory until the system is borked?
> > > 
> > > Yeah.  One thing we /could/ do is register an oom notifier that would
> > > urge any running repair threads to bail out if they can.  It seems to me
> > > that the oom killer blocks on the oom_notify_list chain, so our handler
> > > could wait until at least one thread exits before returning.
> > > 
> > 
> > Ok, something like that could be useful. I agree that we probably don't
> > need to go that far until the mechanism is nailed down and testing shows
> > that OOM is a problem.
> 
> It already is a problem on my contrived "2TB hardlink/reflink farm fs" +
> "400M of RAM and no swap" scenario.  Granted, pretty much every other
> xfs utility also blows out on that so I'm not sure how hard I really
> need to try...
> 
> > > > I don't know how likely that is or if it really ends up much different
> > > > from the analogous xfs_repair situation. My only point right now is
> > > > that failure scenario is something we should explore for any solution
> > > > we ultimately consider because it may be an unexpected use case of the
> > > > underlying mechanism.
> > > 
> > > Ideally, online repair would always be the victim since we know we have
> > > a reasonable fallback.  At least for memfd, however, I think the only
> > > clues we have to decide the question "is this memfd getting in the way
> > > of other threads?" is either seeing ENOMEM, short writes, or getting
> > > kicked by an oom notification.  Maybe that'll be enough?
> > > 
> > 
> > Hm, yeah. It may be challenging to track memfd usage as such. If
> > userspace has access to the fd on an OOM notification or whatever, it
> > might be able to do more accurate analysis based on an fstat() or
> > something.
> > 
> > Related question... is the online repair sequence currently
> > interruptible, if xfs_scrub receives a fatal signal while pulling in
> > entries during an allocbt scan for example?
> 
> It's interruptible (fatal signals only) during the scan phase, but once
> it starts logging metadata updates it will run all the way to
> completion.
> 
> > > > (To the contrary, just using a cached file seems a natural fit from
> > > > that perspective.)
> > > 
> > > Same here.
> > > 
> > > > > > and the failure characteristics aren't more severe than for userspace.
> > > > > > An online repair that puts the broader system at risk of OOM as
> > > > > > opposed to predictably failing gracefully may not be the most useful
> > > > > > tool.
> > > > > 
> > > > > Agreed.  One huge downside of memfd seems to be the lack of a mechanism
> > > > > for the vm to push back on us if we successfully write all we need to
> > > > > the memfd but then other processes need some memory.  Obviously, if the
> > > > > memfd write itself comes up short or fails then we dump the memfd and
> > > > > error back to userspace.  We might simply have to free array memory
> > > > > while we iterate the records to minimize the time spent at peak memory
> > > > > usage.
> > > > > 
> > > > 
> > > > Hm, yeah. Some kind of fixed/relative size in-core memory pool approach
> > > > may simplify things because we could allocate it up front and know right
> > > > away whether we just don't have enough memory available to repair.
> > > 
> > > Hmm.  Apparently we actually /can/ call fallocate on memfd to grab all
> > > the pages at once, provided we have some guesstimate beforehand of how
> > > much space we think we'll need.
> > > 
> > > So long as my earlier statement about the memory requirements being no
> > > more than the size of the btree leaves is actually true (I haven't
> > > rigorously tried to prove it), we need about (xrep_calc_ag_resblks() *
> > > blocksize) worth of space in the memfd file.  Maybe we ask for 1.5x
> > > that and if we don't get it, we kill the memfd and exit.
> > > 
> > 
> > Indeed. It would be nice if we could do all of the file management bits
> > in userspace.
> 
> Agreed, though no file management would be even better. :)
> 
> --D
> 
> > Brian
> > 
> > > --D
> > > 
> > > > 
> > > > Brian
> > > > 
> > > > > --D
> > > > > 
> > > > > > 
> > > > > > Brian
> > > > > > 
> > > > > > > --D
> > > > > > > 
> > > > > > > > Brian
> > > > > > > > 
> > > > > > > > > --D
> > > > > > > > > 
> > > > > > > > > > Brian
> > > > > > > > > > 
> > > > > > > > > > > --D
> > > > > > > > > > > 
> > > > > > > > > > > > Brian
> > > > > > > > > > > > 
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +done:
> > > > > > > > > > > > > > > +	/* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */
> > > > > > > > > > > > > > > +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> > > > > > > > > > > > > > > +	return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo,
> > > > > > > > > > > > > > > +			XFS_AG_RESV_NONE);
> > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > ...
> > > > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > > > > > index 0ed68379e551..82f99633a597 100644
> > > > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp(
> > > > > > > > > > > > > > >  		diff = b1->bno - b2->bno;
> > > > > > > > > > > > > > >  	return diff;
> > > > > > > > > > > > > > >  }
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +/* Are there any busy extents in this AG? */
> > > > > > > > > > > > > > > +bool
> > > > > > > > > > > > > > > +xfs_extent_busy_list_empty(
> > > > > > > > > > > > > > > +	struct xfs_perag	*pag)
> > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > +	spin_lock(&pag->pagb_lock);
> > > > > > > > > > > > > > > +	if (pag->pagb_tree.rb_node) {
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > RB_EMPTY_ROOT()?
> > > > > > > > > > > > > 
> > > > > > > > > > > > > Good suggestion, thank you!
> > > > > > > > > > > > > 
> > > > > > > > > > > > > --D
> > > > > > > > > > > > > 
> > > > > > > > > > > > > > Brian
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > +		spin_unlock(&pag->pagb_lock);
> > > > > > > > > > > > > > > +		return false;
> > > > > > > > > > > > > > > +	}
> > > > > > > > > > > > > > > +	spin_unlock(&pag->pagb_lock);
> > > > > > > > > > > > > > > +	return true;
> > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > > > > > index 990ab3891971..2f8c73c712c6 100644
> > > > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
> > > > > > > > > > > > > > >  	list_sort(NULL, list, xfs_extent_busy_ag_cmp);
> > > > > > > > > > > > > > >  }
> > > > > > > > > > > > > > >  
> > > > > > > > > > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > >  #endif /* __XFS_EXTENT_BUSY_H__ */
> > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > --
> > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > > > --
> > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > > --
> > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > --
> > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > --
> > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > --
> > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > --
> > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > --
> > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > --
> > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > --
> > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > the body of a message to majordomo@vger.kernel.org
> > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > --
> > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > the body of a message to majordomo@vger.kernel.org
> > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Aug. 10, 2018, 3:39 p.m. UTC | #16
On Fri, Aug 10, 2018 at 06:33:52AM -0400, Brian Foster wrote:
> On Thu, Aug 09, 2018 at 08:59:59AM -0700, Darrick J. Wong wrote:
> > On Thu, Aug 09, 2018 at 08:00:28AM -0400, Brian Foster wrote:
> > > On Wed, Aug 08, 2018 at 03:42:32PM -0700, Darrick J. Wong wrote:
> > > > On Wed, Aug 08, 2018 at 08:29:54AM -0400, Brian Foster wrote:
> > > > > On Tue, Aug 07, 2018 at 04:34:58PM -0700, Darrick J. Wong wrote:
> > > > > > On Fri, Aug 03, 2018 at 06:49:40AM -0400, Brian Foster wrote:
> > > > > > > On Thu, Aug 02, 2018 at 12:22:05PM -0700, Darrick J. Wong wrote:
> > > > > > > > On Thu, Aug 02, 2018 at 09:48:24AM -0400, Brian Foster wrote:
> > > > > > > > > On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote:
> > > > > > > > > > On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote:
> > > > > > > > > > > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote:
> > > > > > > > > > > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote:
> > > > > > > > > > > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote:
> > > > > > > > > > > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote:
> > > > > > > > > > > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote:
> ...
> > > > > What _seems_ beneficial about that approach is we get (potentially
> > > > > external) persistent backing and memory reclaim ability with the
> > > > > traditional memory allocation model.
> > > > >
> > > > > ISTM that if we used a regular file, we'd need to deal with the
> > > > > traditional file interface somehow or another (file read/pagecache
> > > > > lookup -> record ??).
> > > > 
> > > > Yes, that's all neatly wrapped up in kernel_read() and kernel_write() so
> > > > all we need is a (struct file *).
> > > > 
> > > > > We could repurpose some existing mechanism like the directory code or
> > > > > quota inode mechanism to use xfs buffers for that purpose, but I think
> > > > > that would require us to always use an internal inode. Allowing
> > > > > userspace to pass an fd/file passes that consideration on to the user,
> > > > > which might be more flexible. We could always warn about additional
> > > > > limitations if that fd happens to be based on the target fs.
> > > > 
> > > > <nod> A second advantage of the struct file/kernel_{read,write} approach
> > > > is that if we ever decide to let userspace pass in a fd, it's trivial
> > > > to feed that struct file to the kernel io routines instead of a memfd
> > > > one.
> > > > 
> > > 
> > > Yeah, I like this flexibility. In fact, I'm wondering why we wouldn't do
> > > something like this anyways. Could/should xfs_scrub be responsible for
> > > allocating a memfd and passing along the fd? Another advantage of doing
> > > that is whatever logic we may need to clean up old repair files or
> > > whatever is pushed to userspace.
> > 
> > There are two ways we could do this -- one is to have the kernel manage
> > the memfd creation internally (like my patches do now); the other is for
> > xfs_scrub to pass in creat(O_TMPFILE).
> > 
> > When repair fputs the file (or fdputs the fd if we switch to using
> > that), the kernel will perform the usual deletion of the zero-linkcount
> > zero-refcount file.  We get all the "cleanup" for free by closing the
> > file.
> > 
> 
> Ok. FWIW, the latter approach where xfs_scrub creates a file and passes
> the fd along to the kernel seems preferable to me, but perhaps others
> have different opinions. We could accept a pathname from the user to
> create the file or otherwise attempt to allocate a memfd by default and
> pass that along.
> 
> > One other potential complication is that a couple of the repair
> > functions need two memfds.  The extended attribute repair creates a
> > fixed-record array for attr keys and an xblob to hold names and values;
> > each structure gets its own memfd.  The refcount repair creates two
> > fixed-record arrays, one for refcount records and another to act as a
> > stack of rmaps to compute reference counts.
> > 
> 
> Hmm, I guess there's nothing stopping scrub from passing in two fds.
> Maybe it would make more sense for the userspace option to be a path
> basename or directory where scrub is allowed to create whatever scratch
> files it needs.
> 
> That aside, is there any reason the repair mechanism couldn't emulate
> multiple files with a single fd via a magic offset delimiter or
> something? E.g., "file 1" starts at offset 0, "file 2" starts at offset
> 1TB, etc. (1TB is probably overkill, but you get the idea..).

Hmm, ok, so to summarize, I see five options:

1) Pass in a dirfd, repair can internally openat(dirfd, O_TMPFILE...)
however many files it needs.

2) Pass in however many file fds we need.

3) Pass in a single file fd and segment the space.

4) Let the repair code create as many memfd files as it wants.

5) Let the repair code create one memfd file and segment the space.

I'm pretty sure we don't want to support (2) because that just seems
like a requirements communication nightmare and can burn up a lot of
space in struct xfs_scrub_metadata.

(3) and (5) are basically the same except for where the file comes from.
For (3) we'd have to make sure the fd filesystem supports large sparse
files (and presumably isn't the xfs we're trying to repair), which
shouldn't be too difficult to probe.  For (5) we know that tmpfs already
supports large sparse files.  Another difficulty might be that on 32-bit
the page cache only supports offsets as high as (ULONG_MAX * PAGE_SIZE),
though I suppose at this point we only need two files and 8TB should be
enough for anyone.

(I also think it's reasonable to consider not supporting online repair
on a 32-bit system with a large filesystem...)

In general, the "pass in a thing from userspace" variants come with the
complication that we have to check the functionality of whatever gets
passed in.  On the plus side it likely unlocks access to a lot more
storage than we could get with mem+swap.  On the minus side someone
passes in a fd to a drive-managed SMR on USB 2.0, and...

(1) seems like it would maximize the kernel's flexibility to create as
many (regular, non-sparse) files as it needs, but now we're calling
do_sys_open and managing files ourselves, which might be avoided.

(4) of course is what we do right now. :)

Soooo... the simplest userspace interface (I think) is to allow
userspace to pass in a single file fd.  Scrub can reject it if it
doesn't measure up (fs is the same, sparse not supported, high offsets
not supported, etc.).  If userspace doesn't pass in an fd then we create
a memfd and use that instead.  We end up with a hybrid between (3) and (5).

--D

> Brian
> 
> > (In theory the xbitmap could also be converted to use the fixed record
> > array, but in practice they haven't (yet) become large enough to warrant
> > it, and there's currently no way to insert or delete records from the
> > middle of the array.)
> > 
> > > > > > > I'm not familiar with memfd. The manpage suggests it's ram backed, is it
> > > > > > > swappable or something?
> > > > > > 
> > > > > > It's supposed to be.  The quick test I ran (allocate a memfd, write 1GB
> > > > > > of junk to it on a VM with 400M of RAM) seemed to push about 980MB into
> > > > > > the swap file.
> > > > > > 
> > > > > 
> > > > > Ok.
> > > > > 
> > > > > > > If so, that sounds a reasonable option provided the swap space
> > > > > > > requirement can be made clear to users
> > > > > > 
> > > > > > We can document it.  I don't think it's any worse than xfs_repair being
> > > > > > able to use up all the memory + swap... and since we're probably only
> > > > > > going to be repairing one thing at a time, most likely scrub won't need
> > > > > > as much memory.
> > > > > > 
> > > > > 
> > > > > Right, but as noted below, my concerns with the xfs_repair comparison
> > > > > are that 1.) the kernel generally has more of a limit on anonymous
> > > > > memory allocations than userspace (i.e., not swappable AFAIU?) and 2.)
> > > > > it's not clear how effectively running the system out of memory via the
> > > > > kernel will behave from a failure perspective.
> > > > > 
> > > > > IOW, xfs_repair can run the system out of memory but for the most part
> > > > > that ends up being a simple problem for the system: OOM kill the bloated
> > > > > xfs_repair process. For an online repair in a similar situation, I have
> > > > > no idea what's going to happen.
> > > > 
> > > > Back in the days of the huge linked lists the oom killer would target
> > > > other processes because it doesn't know that the online repair thread is
> > > > sitting on a ton of pinned kernel memory...
> > > > 
> > > 
> > > Makes sense, kind of what I'd expect...
> > > 
> > > > > The hope is that the online repair hits -ENOMEM and unwinds, but ISTM
> > > > > we'd still be at risk of other subsystems running into memory
> > > > > allocation problems, filling up swap, the OOM killer going after
> > > > > unrelated processes, etc.  What if, for example, the OOM killer starts
> > > > > picking off processes in service to a running online repair that
> > > > > immediately consumes freed up memory until the system is borked?
> > > > 
> > > > Yeah.  One thing we /could/ do is register an oom notifier that would
> > > > urge any running repair threads to bail out if they can.  It seems to me
> > > > that the oom killer blocks on the oom_notify_list chain, so our handler
> > > > could wait until at least one thread exits before returning.
> > > > 
> > > 
> > > Ok, something like that could be useful. I agree that we probably don't
> > > need to go that far until the mechanism is nailed down and testing shows
> > > that OOM is a problem.
> > 
> > It already is a problem on my contrived "2TB hardlink/reflink farm fs" +
> > "400M of RAM and no swap" scenario.  Granted, pretty much every other
> > xfs utility also blows out on that so I'm not sure how hard I really
> > need to try...
> > 
> > > > > I don't know how likely that is or if it really ends up much different
> > > > > from the analogous xfs_repair situation. My only point right now is
> > > > > that failure scenario is something we should explore for any solution
> > > > > we ultimately consider because it may be an unexpected use case of the
> > > > > underlying mechanism.
> > > > 
> > > > Ideally, online repair would always be the victim since we know we have
> > > > a reasonable fallback.  At least for memfd, however, I think the only
> > > > clues we have to decide the question "is this memfd getting in the way
> > > > of other threads?" is either seeing ENOMEM, short writes, or getting
> > > > kicked by an oom notification.  Maybe that'll be enough?
> > > > 
> > > 
> > > Hm, yeah. It may be challenging to track memfd usage as such. If
> > > userspace has access to the fd on an OOM notification or whatever, it
> > > might be able to do more accurate analysis based on an fstat() or
> > > something.
> > > 
> > > Related question... is the online repair sequence currently
> > > interruptible, if xfs_scrub receives a fatal signal while pulling in
> > > entries during an allocbt scan for example?
> > 
> > It's interruptible (fatal signals only) during the scan phase, but once
> > it starts logging metadata updates it will run all the way to
> > completion.
> > 
> > > > > (To the contrary, just using a cached file seems a natural fit from
> > > > > that perspective.)
> > > > 
> > > > Same here.
> > > > 
> > > > > > > and the failure characteristics aren't more severe than for userspace.
> > > > > > > An online repair that puts the broader system at risk of OOM as
> > > > > > > opposed to predictably failing gracefully may not be the most useful
> > > > > > > tool.
> > > > > > 
> > > > > > Agreed.  One huge downside of memfd seems to be the lack of a mechanism
> > > > > > for the vm to push back on us if we successfully write all we need to
> > > > > > the memfd but then other processes need some memory.  Obviously, if the
> > > > > > memfd write itself comes up short or fails then we dump the memfd and
> > > > > > error back to userspace.  We might simply have to free array memory
> > > > > > while we iterate the records to minimize the time spent at peak memory
> > > > > > usage.
> > > > > > 
> > > > > 
> > > > > Hm, yeah. Some kind of fixed/relative size in-core memory pool approach
> > > > > may simplify things because we could allocate it up front and know right
> > > > > away whether we just don't have enough memory available to repair.
> > > > 
> > > > Hmm.  Apparently we actually /can/ call fallocate on memfd to grab all
> > > > the pages at once, provided we have some guesstimate beforehand of how
> > > > much space we think we'll need.
> > > > 
> > > > So long as my earlier statement about the memory requirements being no
> > > > more than the size of the btree leaves is actually true (I haven't
> > > > rigorously tried to prove it), we need about (xrep_calc_ag_resblks() *
> > > > blocksize) worth of space in the memfd file.  Maybe we ask for 1.5x
> > > > that and if we don't get it, we kill the memfd and exit.
> > > > 
> > > 
> > > Indeed. It would be nice if we could do all of the file management bits
> > > in userspace.
> > 
> > Agreed, though no file management would be even better. :)
> > 
> > --D
> > 
> > > Brian
> > > 
> > > > --D
> > > > 
> > > > > 
> > > > > Brian
> > > > > 
> > > > > > --D
> > > > > > 
> > > > > > > 
> > > > > > > Brian
> > > > > > > 
> > > > > > > > --D
> > > > > > > > 
> > > > > > > > > Brian
> > > > > > > > > 
> > > > > > > > > > --D
> > > > > > > > > > 
> > > > > > > > > > > Brian
> > > > > > > > > > > 
> > > > > > > > > > > > --D
> > > > > > > > > > > > 
> > > > > > > > > > > > > Brian
> > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +done:
> > > > > > > > > > > > > > > > +	/* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */
> > > > > > > > > > > > > > > > +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> > > > > > > > > > > > > > > > +	return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo,
> > > > > > > > > > > > > > > > +			XFS_AG_RESV_NONE);
> > > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > ...
> > > > > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > > > > > > index 0ed68379e551..82f99633a597 100644
> > > > > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp(
> > > > > > > > > > > > > > > >  		diff = b1->bno - b2->bno;
> > > > > > > > > > > > > > > >  	return diff;
> > > > > > > > > > > > > > > >  }
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +/* Are there any busy extents in this AG? */
> > > > > > > > > > > > > > > > +bool
> > > > > > > > > > > > > > > > +xfs_extent_busy_list_empty(
> > > > > > > > > > > > > > > > +	struct xfs_perag	*pag)
> > > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > > +	spin_lock(&pag->pagb_lock);
> > > > > > > > > > > > > > > > +	if (pag->pagb_tree.rb_node) {
> > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > RB_EMPTY_ROOT()?
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > Good suggestion, thank you!
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > --D
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > Brian
> > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > +		spin_unlock(&pag->pagb_lock);
> > > > > > > > > > > > > > > > +		return false;
> > > > > > > > > > > > > > > > +	}
> > > > > > > > > > > > > > > > +	spin_unlock(&pag->pagb_lock);
> > > > > > > > > > > > > > > > +	return true;
> > > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > > > > > > index 990ab3891971..2f8c73c712c6 100644
> > > > > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
> > > > > > > > > > > > > > > >  	list_sort(NULL, list, xfs_extent_busy_ag_cmp);
> > > > > > > > > > > > > > > >  }
> > > > > > > > > > > > > > > >  
> > > > > > > > > > > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > >  #endif /* __XFS_EXTENT_BUSY_H__ */
> > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > --
> > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > > > > --
> > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > > > --
> > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > > --
> > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > --
> > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > --
> > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > --
> > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > --
> > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > --
> > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > --
> > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > --
> > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > the body of a message to majordomo@vger.kernel.org
> > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > --
> > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > the body of a message to majordomo@vger.kernel.org
> > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
Brian Foster Aug. 10, 2018, 7:07 p.m. UTC | #17
On Fri, Aug 10, 2018 at 08:39:44AM -0700, Darrick J. Wong wrote:
> On Fri, Aug 10, 2018 at 06:33:52AM -0400, Brian Foster wrote:
> > On Thu, Aug 09, 2018 at 08:59:59AM -0700, Darrick J. Wong wrote:
> > > On Thu, Aug 09, 2018 at 08:00:28AM -0400, Brian Foster wrote:
> > > > On Wed, Aug 08, 2018 at 03:42:32PM -0700, Darrick J. Wong wrote:
> > > > > On Wed, Aug 08, 2018 at 08:29:54AM -0400, Brian Foster wrote:
> > > > > > On Tue, Aug 07, 2018 at 04:34:58PM -0700, Darrick J. Wong wrote:
> > > > > > > On Fri, Aug 03, 2018 at 06:49:40AM -0400, Brian Foster wrote:
> > > > > > > > On Thu, Aug 02, 2018 at 12:22:05PM -0700, Darrick J. Wong wrote:
> > > > > > > > > On Thu, Aug 02, 2018 at 09:48:24AM -0400, Brian Foster wrote:
> > > > > > > > > > On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote:
> > > > > > > > > > > On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote:
> > > > > > > > > > > > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote:
> > > > > > > > > > > > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote:
> > > > > > > > > > > > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote:
> > > > > > > > > > > > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote:
> > > > > > > > > > > > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote:
> > ...
> > > > > > What _seems_ beneficial about that approach is we get (potentially
> > > > > > external) persistent backing and memory reclaim ability with the
> > > > > > traditional memory allocation model.
> > > > > >
> > > > > > ISTM that if we used a regular file, we'd need to deal with the
> > > > > > traditional file interface somehow or another (file read/pagecache
> > > > > > lookup -> record ??).
> > > > > 
> > > > > Yes, that's all neatly wrapped up in kernel_read() and kernel_write() so
> > > > > all we need is a (struct file *).
> > > > > 
> > > > > > We could repurpose some existing mechanism like the directory code or
> > > > > > quota inode mechanism to use xfs buffers for that purpose, but I think
> > > > > > that would require us to always use an internal inode. Allowing
> > > > > > userspace to pass an fd/file passes that consideration on to the user,
> > > > > > which might be more flexible. We could always warn about additional
> > > > > > limitations if that fd happens to be based on the target fs.
> > > > > 
> > > > > <nod> A second advantage of the struct file/kernel_{read,write} approach
> > > > > is that if we ever decide to let userspace pass in a fd, it's trivial
> > > > > to feed that struct file to the kernel io routines instead of a memfd
> > > > > one.
> > > > > 
> > > > 
> > > > Yeah, I like this flexibility. In fact, I'm wondering why we wouldn't do
> > > > something like this anyways. Could/should xfs_scrub be responsible for
> > > > allocating a memfd and passing along the fd? Another advantage of doing
> > > > that is whatever logic we may need to clean up old repair files or
> > > > whatever is pushed to userspace.
> > > 
> > > There are two ways we could do this -- one is to have the kernel manage
> > > the memfd creation internally (like my patches do now); the other is for
> > > xfs_scrub to pass in creat(O_TMPFILE).
> > > 
> > > When repair fputs the file (or fdputs the fd if we switch to using
> > > that), the kernel will perform the usual deletion of the zero-linkcount
> > > zero-refcount file.  We get all the "cleanup" for free by closing the
> > > file.
> > > 
> > 
> > Ok. FWIW, the latter approach where xfs_scrub creates a file and passes
> > the fd along to the kernel seems preferable to me, but perhaps others
> > have different opinions. We could accept a pathname from the user to
> > create the file or otherwise attempt to allocate a memfd by default and
> > pass that along.
> > 
> > > One other potential complication is that a couple of the repair
> > > functions need two memfds.  The extended attribute repair creates a
> > > fixed-record array for attr keys and an xblob to hold names and values;
> > > each structure gets its own memfd.  The refcount repair creates two
> > > fixed-record arrays, one for refcount records and another to act as a
> > > stack of rmaps to compute reference counts.
> > > 
> > 
> > Hmm, I guess there's nothing stopping scrub from passing in two fds.
> > Maybe it would make more sense for the userspace option to be a path
> > basename or directory where scrub is allowed to create whatever scratch
> > files it needs.
> > 
> > That aside, is there any reason the repair mechanism couldn't emulate
> > multiple files with a single fd via a magic offset delimiter or
> > something? E.g., "file 1" starts at offset 0, "file 2" starts at offset
> > 1TB, etc. (1TB is probably overkill, but you get the idea..).
> 
> Hmm, ok, so to summarize, I see five options:
> 
> 1) Pass in a dirfd, repair can internally openat(dirfd, O_TMPFILE...)
> however many files it needs.
> 
> 2) Pass in however many file fds we need.
> 
> 3) Pass in a single file fd and segment the space.
> 
> 4) Let the repair code create as many memfd files as it wants.
> 
> 5) Let the repair code create one memfd file and segment the space.
> 
> I'm pretty sure we don't want to support (2) because that just seems
> like a requirements communication nightmare and can burn up a lot of
> space in struct xfs_scrub_metadata.
> 
> (3) and (5) are basically the same except for where the file comes from.
> For (3) we'd have to make sure the fd filesystem supports large sparse
> files (and presumably isn't the xfs we're trying to repair), which
> shouldn't be too difficult to probe.  For (5) we know that tmpfs already
> supports large sparse files.  Another difficulty might be that on 32-bit
> the page cache only supports offsets as high as (ULONG_MAX * PAGE_SIZE),
> though I suppose at this point we only need two files and 8TB should be
> enough for anyone.
> 
> (I also think it's reasonable to consider not supporting online repair
> on a 32-bit system with a large filesystem...)
> 
> In general, the "pass in a thing from userspace" variants come with the
> complication that we have to check the functionality of whatever gets
> passed in.  On the plus side it likely unlocks access to a lot more
> storage than we could get with mem+swap.  On the minus side someone
> passes in a fd to a drive-managed SMR on USB 2.0, and...
> 
> (1) seems like it would maximize the kernel's flexibility to create as
> many (regular, non-sparse) files as it needs, but now we're calling
> do_sys_open and managing files ourselves, which might be avoided.
> 
> (4) of course is what we do right now. :)
> 
> Soooo... the simplest userspace interface (I think) is to allow
> userspace to pass in a single file fd.  Scrub can reject it if it
> doesn't measure up (fs is the same, sparse not supported, high offsets
> not supported, etc.).  If userspace doesn't pass in an fd then we create
> a memfd and use that instead.  We end up with a hybrid between (3) and (5).
> 

That all sounds about right to me except I was thinking userspace would
do the memfd fallback of #5 rather than the kernel, just to keep the
policy out of the kernel as much as possible. Is there any major
advantage to doing it in the kernel? I guess it would slightly
complicate 'xfs_io -c repair' ...

Brian

> --D
> 
> > Brian
> > 
> > > (In theory the xbitmap could also be converted to use the fixed record
> > > array, but in practice they haven't (yet) become large enough to warrant
> > > it, and there's currently no way to insert or delete records from the
> > > middle of the array.)
> > > 
> > > > > > > > I'm not familiar with memfd. The manpage suggests it's ram backed, is it
> > > > > > > > swappable or something?
> > > > > > > 
> > > > > > > It's supposed to be.  The quick test I ran (allocate a memfd, write 1GB
> > > > > > > of junk to it on a VM with 400M of RAM) seemed to push about 980MB into
> > > > > > > the swap file.
> > > > > > > 
> > > > > > 
> > > > > > Ok.
> > > > > > 
> > > > > > > > If so, that sounds a reasonable option provided the swap space
> > > > > > > > requirement can be made clear to users
> > > > > > > 
> > > > > > > We can document it.  I don't think it's any worse than xfs_repair being
> > > > > > > able to use up all the memory + swap... and since we're probably only
> > > > > > > going to be repairing one thing at a time, most likely scrub won't need
> > > > > > > as much memory.
> > > > > > > 
> > > > > > 
> > > > > > Right, but as noted below, my concerns with the xfs_repair comparison
> > > > > > are that 1.) the kernel generally has more of a limit on anonymous
> > > > > > memory allocations than userspace (i.e., not swappable AFAIU?) and 2.)
> > > > > > it's not clear how effectively running the system out of memory via the
> > > > > > kernel will behave from a failure perspective.
> > > > > > 
> > > > > > IOW, xfs_repair can run the system out of memory but for the most part
> > > > > > that ends up being a simple problem for the system: OOM kill the bloated
> > > > > > xfs_repair process. For an online repair in a similar situation, I have
> > > > > > no idea what's going to happen.
> > > > > 
> > > > > Back in the days of the huge linked lists the oom killer would target
> > > > > other processes because it doesn't know that the online repair thread is
> > > > > sitting on a ton of pinned kernel memory...
> > > > > 
> > > > 
> > > > Makes sense, kind of what I'd expect...
> > > > 
> > > > > > The hope is that the online repair hits -ENOMEM and unwinds, but ISTM
> > > > > > we'd still be at risk of other subsystems running into memory
> > > > > > allocation problems, filling up swap, the OOM killer going after
> > > > > > unrelated processes, etc.  What if, for example, the OOM killer starts
> > > > > > picking off processes in service to a running online repair that
> > > > > > immediately consumes freed up memory until the system is borked?
> > > > > 
> > > > > Yeah.  One thing we /could/ do is register an oom notifier that would
> > > > > urge any running repair threads to bail out if they can.  It seems to me
> > > > > that the oom killer blocks on the oom_notify_list chain, so our handler
> > > > > could wait until at least one thread exits before returning.
> > > > > 
> > > > 
> > > > Ok, something like that could be useful. I agree that we probably don't
> > > > need to go that far until the mechanism is nailed down and testing shows
> > > > that OOM is a problem.
> > > 
> > > It already is a problem on my contrived "2TB hardlink/reflink farm fs" +
> > > "400M of RAM and no swap" scenario.  Granted, pretty much every other
> > > xfs utility also blows out on that so I'm not sure how hard I really
> > > need to try...
> > > 
> > > > > > I don't know how likely that is or if it really ends up much different
> > > > > > from the analogous xfs_repair situation. My only point right now is
> > > > > > that failure scenario is something we should explore for any solution
> > > > > > we ultimately consider because it may be an unexpected use case of the
> > > > > > underlying mechanism.
> > > > > 
> > > > > Ideally, online repair would always be the victim since we know we have
> > > > > a reasonable fallback.  At least for memfd, however, I think the only
> > > > > clues we have to decide the question "is this memfd getting in the way
> > > > > of other threads?" is either seeing ENOMEM, short writes, or getting
> > > > > kicked by an oom notification.  Maybe that'll be enough?
> > > > > 
> > > > 
> > > > Hm, yeah. It may be challenging to track memfd usage as such. If
> > > > userspace has access to the fd on an OOM notification or whatever, it
> > > > might be able to do more accurate analysis based on an fstat() or
> > > > something.
> > > > 
> > > > Related question... is the online repair sequence currently
> > > > interruptible, if xfs_scrub receives a fatal signal while pulling in
> > > > entries during an allocbt scan for example?
> > > 
> > > It's interruptible (fatal signals only) during the scan phase, but once
> > > it starts logging metadata updates it will run all the way to
> > > completion.
> > > 
> > > > > > (To the contrary, just using a cached file seems a natural fit from
> > > > > > that perspective.)
> > > > > 
> > > > > Same here.
> > > > > 
> > > > > > > > and the failure characteristics aren't more severe than for userspace.
> > > > > > > > An online repair that puts the broader system at risk of OOM as
> > > > > > > > opposed to predictably failing gracefully may not be the most useful
> > > > > > > > tool.
> > > > > > > 
> > > > > > > Agreed.  One huge downside of memfd seems to be the lack of a mechanism
> > > > > > > for the vm to push back on us if we successfully write all we need to
> > > > > > > the memfd but then other processes need some memory.  Obviously, if the
> > > > > > > memfd write itself comes up short or fails then we dump the memfd and
> > > > > > > error back to userspace.  We might simply have to free array memory
> > > > > > > while we iterate the records to minimize the time spent at peak memory
> > > > > > > usage.
> > > > > > > 
> > > > > > 
> > > > > > Hm, yeah. Some kind of fixed/relative size in-core memory pool approach
> > > > > > may simplify things because we could allocate it up front and know right
> > > > > > away whether we just don't have enough memory available to repair.
> > > > > 
> > > > > Hmm.  Apparently we actually /can/ call fallocate on memfd to grab all
> > > > > the pages at once, provided we have some guesstimate beforehand of how
> > > > > much space we think we'll need.
> > > > > 
> > > > > So long as my earlier statement about the memory requirements being no
> > > > > more than the size of the btree leaves is actually true (I haven't
> > > > > rigorously tried to prove it), we need about (xrep_calc_ag_resblks() *
> > > > > blocksize) worth of space in the memfd file.  Maybe we ask for 1.5x
> > > > > that and if we don't get it, we kill the memfd and exit.
> > > > > 
> > > > 
> > > > Indeed. It would be nice if we could do all of the file management bits
> > > > in userspace.
> > > 
> > > Agreed, though no file management would be even better. :)
> > > 
> > > --D
> > > 
> > > > Brian
> > > > 
> > > > > --D
> > > > > 
> > > > > > 
> > > > > > Brian
> > > > > > 
> > > > > > > --D
> > > > > > > 
> > > > > > > > 
> > > > > > > > Brian
> > > > > > > > 
> > > > > > > > > --D
> > > > > > > > > 
> > > > > > > > > > Brian
> > > > > > > > > > 
> > > > > > > > > > > --D
> > > > > > > > > > > 
> > > > > > > > > > > > Brian
> > > > > > > > > > > > 
> > > > > > > > > > > > > --D
> > > > > > > > > > > > > 
> > > > > > > > > > > > > > Brian
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +done:
> > > > > > > > > > > > > > > > > +	/* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */
> > > > > > > > > > > > > > > > > +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> > > > > > > > > > > > > > > > > +	return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo,
> > > > > > > > > > > > > > > > > +			XFS_AG_RESV_NONE);
> > > > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > ...
> > > > > > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > > > > > > > index 0ed68379e551..82f99633a597 100644
> > > > > > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > > > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp(
> > > > > > > > > > > > > > > > >  		diff = b1->bno - b2->bno;
> > > > > > > > > > > > > > > > >  	return diff;
> > > > > > > > > > > > > > > > >  }
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +/* Are there any busy extents in this AG? */
> > > > > > > > > > > > > > > > > +bool
> > > > > > > > > > > > > > > > > +xfs_extent_busy_list_empty(
> > > > > > > > > > > > > > > > > +	struct xfs_perag	*pag)
> > > > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > > > +	spin_lock(&pag->pagb_lock);
> > > > > > > > > > > > > > > > > +	if (pag->pagb_tree.rb_node) {
> > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > RB_EMPTY_ROOT()?
> > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > Good suggestion, thank you!
> > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > --D
> > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > Brian
> > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > +		spin_unlock(&pag->pagb_lock);
> > > > > > > > > > > > > > > > > +		return false;
> > > > > > > > > > > > > > > > > +	}
> > > > > > > > > > > > > > > > > +	spin_unlock(&pag->pagb_lock);
> > > > > > > > > > > > > > > > > +	return true;
> > > > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > > > > > > > index 990ab3891971..2f8c73c712c6 100644
> > > > > > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > > > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
> > > > > > > > > > > > > > > > >  	list_sort(NULL, list, xfs_extent_busy_ag_cmp);
> > > > > > > > > > > > > > > > >  }
> > > > > > > > > > > > > > > > >  
> > > > > > > > > > > > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > >  #endif /* __XFS_EXTENT_BUSY_H__ */
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > --
> > > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > > > > > --
> > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > > > > --
> > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > > > --
> > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > --
> > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > --
> > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > --
> > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > --
> > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > --
> > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > --
> > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > --
> > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > --
> > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > the body of a message to majordomo@vger.kernel.org
> > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > --
> > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > the body of a message to majordomo@vger.kernel.org
> > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Aug. 10, 2018, 7:36 p.m. UTC | #18
On Fri, Aug 10, 2018 at 03:07:40PM -0400, Brian Foster wrote:
> On Fri, Aug 10, 2018 at 08:39:44AM -0700, Darrick J. Wong wrote:
> > On Fri, Aug 10, 2018 at 06:33:52AM -0400, Brian Foster wrote:
> > > On Thu, Aug 09, 2018 at 08:59:59AM -0700, Darrick J. Wong wrote:
> > > > On Thu, Aug 09, 2018 at 08:00:28AM -0400, Brian Foster wrote:
> > > > > On Wed, Aug 08, 2018 at 03:42:32PM -0700, Darrick J. Wong wrote:
> > > > > > On Wed, Aug 08, 2018 at 08:29:54AM -0400, Brian Foster wrote:
> > > > > > > On Tue, Aug 07, 2018 at 04:34:58PM -0700, Darrick J. Wong wrote:
> > > > > > > > On Fri, Aug 03, 2018 at 06:49:40AM -0400, Brian Foster wrote:
> > > > > > > > > On Thu, Aug 02, 2018 at 12:22:05PM -0700, Darrick J. Wong wrote:
> > > > > > > > > > On Thu, Aug 02, 2018 at 09:48:24AM -0400, Brian Foster wrote:
> > > > > > > > > > > On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote:
> > > > > > > > > > > > On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote:
> > > > > > > > > > > > > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote:
> > > > > > > > > > > > > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote:
> > > > > > > > > > > > > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote:
> > > > > > > > > > > > > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote:
> > > > > > > > > > > > > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote:
> > > ...
> > > > > > > What _seems_ beneficial about that approach is we get (potentially
> > > > > > > external) persistent backing and memory reclaim ability with the
> > > > > > > traditional memory allocation model.
> > > > > > >
> > > > > > > ISTM that if we used a regular file, we'd need to deal with the
> > > > > > > traditional file interface somehow or another (file read/pagecache
> > > > > > > lookup -> record ??).
> > > > > > 
> > > > > > Yes, that's all neatly wrapped up in kernel_read() and kernel_write() so
> > > > > > all we need is a (struct file *).
> > > > > > 
> > > > > > > We could repurpose some existing mechanism like the directory code or
> > > > > > > quota inode mechanism to use xfs buffers for that purpose, but I think
> > > > > > > that would require us to always use an internal inode. Allowing
> > > > > > > userspace to pass an fd/file passes that consideration on to the user,
> > > > > > > which might be more flexible. We could always warn about additional
> > > > > > > limitations if that fd happens to be based on the target fs.
> > > > > > 
> > > > > > <nod> A second advantage of the struct file/kernel_{read,write} approach
> > > > > > > is that if we ever decide to let userspace pass in a fd, it's trivial
> > > > > > to feed that struct file to the kernel io routines instead of a memfd
> > > > > > one.
> > > > > > 
> > > > > 
> > > > > Yeah, I like this flexibility. In fact, I'm wondering why we wouldn't do
> > > > > something like this anyways. Could/should xfs_scrub be responsible for
> > > > > allocating a memfd and passing along the fd? Another advantage of doing
> > > > > that is whatever logic we may need to clean up old repair files or
> > > > > whatever is pushed to userspace.
> > > > 
> > > > There are two ways we could do this -- one is to have the kernel manage
> > > > the memfd creation internally (like my patches do now); the other is for
> > > > xfs_scrub to pass in creat(O_TMPFILE).
> > > > 
> > > > When repair fputs the file (or fdputs the fd if we switch to using
> > > > that), the kernel will perform the usual deletion of the zero-linkcount
> > > > zero-refcount file.  We get all the "cleanup" for free by closing the
> > > > file.
> > > > 
> > > 
> > > Ok. FWIW, the latter approach where xfs_scrub creates a file and passes
> > > the fd along to the kernel seems preferable to me, but perhaps others
> > > have different opinions. We could accept a pathname from the user to
> > > create the file or otherwise attempt to allocate an memfd by default and
> > > pass that along.
> > > 
> > > > One other potential complication is that a couple of the repair
> > > > functions need two memfds.  The extended attribute repair creates a
> > > > fixed-record array for attr keys and an xblob to hold names and values;
> > > > each structure gets its own memfd.  The refcount repair creates two
> > > > fixed-record arrays, one for refcount records and another to act as a
> > > > stack of rmaps to compute reference counts.
> > > > 
> > > 
> > > Hmm, I guess there's nothing stopping scrub from passing in two fds.
> > > Maybe it would make more sense for the userspace option to be a path
> > > basename or directory where scrub is allowed to create whatever scratch
> > > files it needs.
> > > 
> > > That aside, is there any reason the repair mechanism couldn't emulate
> > > > multiple files with a single fd via a magic offset delimiter or
> > > something? E.g., "file 1" starts at offset 0, "file 2" starts at offset
> > > 1TB, etc. (1TB is probably overkill, but you get the idea..).
> > 
> > Hmm, ok, so to summarize, I see five options:
> > 
> > 1) Pass in a dirfd, repair can internally openat(dirfd, O_TMPFILE...)
> > however many files it needs.
> > 
> > > 2) Pass in however many file fds we need and segment the space.
> > 
> > 3) Pass in a single file fd.
> > 
> > 4) Let the repair code create as many memfd files as it wants.
> > 
> > 5) Let the repair code create one memfd file and segment the space.
> > 
> > I'm pretty sure we don't want to support (2) because that just seems
> > like a requirements communication nightmare and can burn up a lot of
> > space in struct xfs_scrub_metadata.
> > 
> > (3) and (5) are basically the same except for where the file comes from.
> > For (3) we'd have to make sure the fd filesystem supports large sparse
> > files (and presumably isn't the xfs we're trying to repair), which
> > shouldn't be too difficult to probe.  For (5) we know that tmpfs already
> > supports large sparse files.  Another difficulty might be that on 32-bit
> > the page cache only supports offsets as high as (ULONG_MAX * PAGE_SIZE),
> > though I suppose at this point we only need two files and 8TB should be
> > enough for anyone.
> > 
> > (I also think it's reasonable to consider not supporting online repair
> > on a 32-bit system with a large filesystem...)
> > 
> > In general, the "pass in a thing from userspace" variants come with the
> > complication that we have to check the functionality of whatever gets
> > passed in.  On the plus side it likely unlocks access to a lot more
> > storage than we could get with mem+swap.  On the minus side someone
> > passes in a fd to a drive-managed SMR on USB 2.0, and...
> > 
> > (1) seems like it would maximize the kernel's flexibility to create as
> > many (regular, non-sparse) files as it needs, but now we're calling
> > do_sys_open and managing files ourselves, which might be avoided.
> > 
> > (4) of course is what we do right now. :)
> > 
> > Soooo... the simplest userspace interface (I think) is to allow
> > userspace to pass in a single file fd.  Scrub can reject it if it
> > doesn't measure up (fs is the same, sparse not supported, high offsets
> > not supported, etc.).  If userspace doesn't pass in an fd then we create
> > a memfd and use that instead.  We end up with a hybrid between (3) and (5).
> > 
> 
> That all sounds about right to me except I was thinking userspace would
> do the memfd fallback of #5 rather than the kernel, just to keep the
> policy out of the kernel as much as possible. Is there any major
> advantage to doing it in the kernel? I guess it would slightly
> complicate 'xfs_io -c repair' ...

Hm.  We'll have to use one of the reserved areas of struct
xfs_scrub_metadata to pass in the file descriptor.  If we create a new
XFS_SCRUB_IFLAG_FD flag to indicate that we're passing in a file
descriptor then either we lose compatibility with old kernels (because
they reject unknown flags) or xfs_scrub will have to try a repair
without a fd (to see if the kernel even cares) and retry if the repair
fails with some prearranged error code that means "give me a swapfile,
please".  Alternately we simply require that the fd cannot be fd 0 since
using stdin for swap space is a stupid idea anyways.

Technically we're not supposed to have flag days, but otoh this is an
xfs-only ioctl for a feature that's still experimental, so perhaps it's
not crucial to maintain compatibility with old kernels where the feature
is incomplete and experimental?

Hmm.  We could define the fd field with the requirement that fd > 0, and
if the repair function requires an fd and one hasn't been provided, it
can fail out with ENOMEM.  If it doesn't need extra memory it can just
ignore the contents of the fd field.  xfs_scrub can then arrange to pass
in mem fds or file fds or whatever.

--D

> Brian
> 
> > --D
> > 
> > > Brian
> > > 
> > > > (In theory the xbitmap could also be converted to use the fixed record
> > > > array, but in practice they haven't (yet) become large enough to warrant
> > > > it, and there's currently no way to insert or delete records from the
> > > > middle of the array.)
> > > > 
> > > > > > > > > I'm not familiar with memfd. The manpage suggests it's ram backed, is it
> > > > > > > > > swappable or something?
> > > > > > > > 
> > > > > > > > It's supposed to be.  The quick test I ran (allocate a memfd, write 1GB
> > > > > > > > of junk to it on a VM with 400M of RAM) seemed to push about 980MB into
> > > > > > > > the swap file.
> > > > > > > > 
> > > > > > > 
> > > > > > > Ok.
> > > > > > > 
> > > > > > > > > If so, that sounds a reasonable option provided the swap space
> > > > > > > > > requirement can be made clear to users
> > > > > > > > 
> > > > > > > > We can document it.  I don't think it's any worse than xfs_repair being
> > > > > > > > able to use up all the memory + swap... and since we're probably only
> > > > > > > > going to be repairing one thing at a time, most likely scrub won't need
> > > > > > > > as much memory.
> > > > > > > > 
> > > > > > > 
> > > > > > > Right, but as noted below, my concerns with the xfs_repair comparison
> > > > > > > are that 1.) the kernel generally has more of a limit on anonymous
> > > > > > > memory allocations than userspace (i.e., not swappable AFAIU?) and 2.)
> > > > > > > it's not clear how effectively running the system out of memory via the
> > > > > > > kernel will behave from a failure perspective.
> > > > > > > 
> > > > > > > IOW, xfs_repair can run the system out of memory but for the most part
> > > > > > > that ends up being a simple problem for the system: OOM kill the bloated
> > > > > > > xfs_repair process. For an online repair in a similar situation, I have
> > > > > > > no idea what's going to happen.
> > > > > > 
> > > > > > Back in the days of the huge linked lists the oom killer would target
> > > > > > > other processes because it doesn't know that the online repair thread is
> > > > > > sitting on a ton of pinned kernel memory...
> > > > > > 
> > > > > 
> > > > > Makes sense, kind of what I'd expect...
> > > > > 
> > > > > > > The hope is that the online repair hits -ENOMEM and unwinds, but ISTM
> > > > > > > we'd still be at risk of other subsystems running into memory
> > > > > > > allocation problems, filling up swap, the OOM killer going after
> > > > > > > unrelated processes, etc.  What if, for example, the OOM killer starts
> > > > > > > picking off processes in service to a running online repair that
> > > > > > > immediately consumes freed up memory until the system is borked?
> > > > > > 
> > > > > > Yeah.  One thing we /could/ do is register an oom notifier that would
> > > > > > urge any running repair threads to bail out if they can.  It seems to me
> > > > > > that the oom killer blocks on the oom_notify_list chain, so our handler
> > > > > > could wait until at least one thread exits before returning.
> > > > > > 
> > > > > 
> > > > > Ok, something like that could be useful. I agree that we probably don't
> > > > > need to go that far until the mechanism is nailed down and testing shows
> > > > > that OOM is a problem.
> > > > 
> > > > It already is a problem on my contrived "2TB hardlink/reflink farm fs" +
> > > > "400M of RAM and no swap" scenario.  Granted, pretty much every other
> > > > xfs utility also blows out on that so I'm not sure how hard I really
> > > > need to try...
> > > > 
> > > > > > > I don't know how likely that is or if it really ends up much different
> > > > > > > from the analogous xfs_repair situation. My only point right now is
> > > > > > > that failure scenario is something we should explore for any solution
> > > > > > > we ultimately consider because it may be an unexpected use case of the
> > > > > > > underlying mechanism.
> > > > > > 
> > > > > > Ideally, online repair would always be the victim since we know we have
> > > > > > a reasonable fallback.  At least for memfd, however, I think the only
> > > > > > clues we have to decide the question "is this memfd getting in the way
> > > > > > of other threads?" is either seeing ENOMEM, short writes, or getting
> > > > > > kicked by an oom notification.  Maybe that'll be enough?
> > > > > > 
> > > > > 
> > > > > Hm, yeah. It may be challenging to track memfd usage as such. If
> > > > > userspace has access to the fd on an OOM notification or whatever, it
> > > > > might be able to do more accurate analysis based on an fstat() or
> > > > > something.
> > > > > 
> > > > > Related question... is the online repair sequence currently
> > > > > interruptible, if xfs_scrub receives a fatal signal while pulling in
> > > > > entries during an allocbt scan for example?
> > > > 
> > > > It's interruptible (fatal signals only) during the scan phase, but once
> > > > it starts logging metadata updates it will run all the way to
> > > > completion.
> > > > 
> > > > > > > (To the contrary, just using a cached file seems a natural fit from
> > > > > > > that perspective.)
> > > > > > 
> > > > > > Same here.
> > > > > > 
> > > > > > > > > and the failure characteristics aren't more severe than for userspace.
> > > > > > > > > An online repair that puts the broader system at risk of OOM as
> > > > > > > > > opposed to predictably failing gracefully may not be the most useful
> > > > > > > > > tool.
> > > > > > > > 
> > > > > > > > Agreed.  One huge downside of memfd seems to be the lack of a mechanism
> > > > > > > > for the vm to push back on us if we successfully write all we need to
> > > > > > > > the memfd but then other processes need some memory.  Obviously, if the
> > > > > > > > memfd write itself comes up short or fails then we dump the memfd and
> > > > > > > > error back to userspace.  We might simply have to free array memory
> > > > > > > > while we iterate the records to minimize the time spent at peak memory
> > > > > > > > usage.
> > > > > > > > 
> > > > > > > 
> > > > > > > Hm, yeah. Some kind of fixed/relative size in-core memory pool approach
> > > > > > > may simplify things because we could allocate it up front and know right
> > > > > > > away whether we just don't have enough memory available to repair.
> > > > > > 
> > > > > > Hmm.  Apparently we actually /can/ call fallocate on memfd to grab all
> > > > > > the pages at once, provided we have some guesstimate beforehand of how
> > > > > > much space we think we'll need.
> > > > > > 
> > > > > > So long as my earlier statement about the memory requirements being no
> > > > > > more than the size of the btree leaves is actually true (I haven't
> > > > > > rigorously tried to prove it), we need about (xrep_calc_ag_resblks() *
> > > > > > blocksize) worth of space in the memfd file.  Maybe we ask for 1.5x
> > > > > > that and if we don't get it, we kill the memfd and exit.
> > > > > > 
> > > > > 
> > > > > Indeed. It would be nice if we could do all of the file management bits
> > > > > in userspace.
> > > > 
> > > > Agreed, though no file management would be even better. :)
> > > > 
> > > > --D
> > > > 
> > > > > Brian
> > > > > 
> > > > > > --D
> > > > > > 
> > > > > > > 
> > > > > > > Brian
> > > > > > > 
> > > > > > > > --D
> > > > > > > > 
> > > > > > > > > 
> > > > > > > > > Brian
> > > > > > > > > 
> > > > > > > > > > --D
> > > > > > > > > > 
> > > > > > > > > > > Brian
> > > > > > > > > > > 
> > > > > > > > > > > > --D
> > > > > > > > > > > > 
> > > > > > > > > > > > > Brian
> > > > > > > > > > > > > 
> > > > > > > > > > > > > > --D
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > Brian
> > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > +done:
> > > > > > > > > > > > > > > > > > +	/* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */
> > > > > > > > > > > > > > > > > > +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> > > > > > > > > > > > > > > > > > +	return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo,
> > > > > > > > > > > > > > > > > > +			XFS_AG_RESV_NONE);
> > > > > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > ...
> > > > > > > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > > > > > > > > index 0ed68379e551..82f99633a597 100644
> > > > > > > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > > > > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp(
> > > > > > > > > > > > > > > > > >  		diff = b1->bno - b2->bno;
> > > > > > > > > > > > > > > > > >  	return diff;
> > > > > > > > > > > > > > > > > >  }
> > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > +/* Are there any busy extents in this AG? */
> > > > > > > > > > > > > > > > > > +bool
> > > > > > > > > > > > > > > > > > +xfs_extent_busy_list_empty(
> > > > > > > > > > > > > > > > > > +	struct xfs_perag	*pag)
> > > > > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > > > > +	spin_lock(&pag->pagb_lock);
> > > > > > > > > > > > > > > > > > +	if (pag->pagb_tree.rb_node) {
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > RB_EMPTY_ROOT()?
> > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > Good suggestion, thank you!
> > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > --D
> > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > Brian
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > > +		spin_unlock(&pag->pagb_lock);
> > > > > > > > > > > > > > > > > > +		return false;
> > > > > > > > > > > > > > > > > > +	}
> > > > > > > > > > > > > > > > > > +	spin_unlock(&pag->pagb_lock);
> > > > > > > > > > > > > > > > > > +	return true;
> > > > > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > > > > > > > > index 990ab3891971..2f8c73c712c6 100644
> > > > > > > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > > > > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
> > > > > > > > > > > > > > > > > >  	list_sort(NULL, list, xfs_extent_busy_ag_cmp);
> > > > > > > > > > > > > > > > > >  }
> > > > > > > > > > > > > > > > > >  
> > > > > > > > > > > > > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
> > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > >  #endif /* __XFS_EXTENT_BUSY_H__ */
> > > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > > --
> > > > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > > > > > > --
> > > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > > > > > --
> > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > > > > --
> > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > > --
> > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > --
> > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > --
> > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > --
> > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > --
> > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > --
> > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > --
> > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > --
> > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > --
> > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > the body of a message to majordomo@vger.kernel.org
> > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > --
> > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > the body of a message to majordomo@vger.kernel.org
> > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
Brian Foster Aug. 11, 2018, 12:50 p.m. UTC | #19
On Fri, Aug 10, 2018 at 12:36:51PM -0700, Darrick J. Wong wrote:
> On Fri, Aug 10, 2018 at 03:07:40PM -0400, Brian Foster wrote:
> > On Fri, Aug 10, 2018 at 08:39:44AM -0700, Darrick J. Wong wrote:
> > > On Fri, Aug 10, 2018 at 06:33:52AM -0400, Brian Foster wrote:
> > > > On Thu, Aug 09, 2018 at 08:59:59AM -0700, Darrick J. Wong wrote:
> > > > > On Thu, Aug 09, 2018 at 08:00:28AM -0400, Brian Foster wrote:
> > > > > > On Wed, Aug 08, 2018 at 03:42:32PM -0700, Darrick J. Wong wrote:
> > > > > > > On Wed, Aug 08, 2018 at 08:29:54AM -0400, Brian Foster wrote:
> > > > > > > > On Tue, Aug 07, 2018 at 04:34:58PM -0700, Darrick J. Wong wrote:
> > > > > > > > > On Fri, Aug 03, 2018 at 06:49:40AM -0400, Brian Foster wrote:
> > > > > > > > > > On Thu, Aug 02, 2018 at 12:22:05PM -0700, Darrick J. Wong wrote:
> > > > > > > > > > > On Thu, Aug 02, 2018 at 09:48:24AM -0400, Brian Foster wrote:
> > > > > > > > > > > > On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote:
> > > > > > > > > > > > > On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote:
> > > > > > > > > > > > > > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote:
> > > > > > > > > > > > > > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote:
> > > > > > > > > > > > > > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote:
> > > > > > > > > > > > > > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote:
> > > > > > > > > > > > > > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote:
...
> > > 
> > > Hmm, ok, so to summarize, I see five options:
> > > 
> > > 1) Pass in a dirfd, repair can internally openat(dirfd, O_TMPFILE...)
> > > however many files it needs.
> > > 
> > > > 2) Pass in however many file fds we need and segment the space.
> > > 
> > > 3) Pass in a single file fd.
> > > 
> > > 4) Let the repair code create as many memfd files as it wants.
> > > 
> > > 5) Let the repair code create one memfd file and segment the space.
> > > 
> > > I'm pretty sure we don't want to support (2) because that just seems
> > > like a requirements communication nightmare and can burn up a lot of
> > > space in struct xfs_scrub_metadata.
> > > 
> > > (3) and (5) are basically the same except for where the file comes from.
> > > For (3) we'd have to make sure the fd filesystem supports large sparse
> > > files (and presumably isn't the xfs we're trying to repair), which
> > > shouldn't be too difficult to probe.  For (5) we know that tmpfs already
> > > supports large sparse files.  Another difficulty might be that on 32-bit
> > > the page cache only supports offsets as high as (ULONG_MAX * PAGE_SIZE),
> > > though I suppose at this point we only need two files and 8TB should be
> > > enough for anyone.
> > > 
> > > (I also think it's reasonable to consider not supporting online repair
> > > on a 32-bit system with a large filesystem...)
> > > 
> > > In general, the "pass in a thing from userspace" variants come with the
> > > complication that we have to check the functionality of whatever gets
> > > passed in.  On the plus side it likely unlocks access to a lot more
> > > storage than we could get with mem+swap.  On the minus side someone
> > > passes in a fd to a drive-managed SMR on USB 2.0, and...
> > > 
> > > (1) seems like it would maximize the kernel's flexibility to create as
> > > many (regular, non-sparse) files as it needs, but now we're calling
> > > do_sys_open and managing files ourselves, which might be avoided.
> > > 
> > > (4) of course is what we do right now. :)
> > > 
> > > Soooo... the simplest userspace interface (I think) is to allow
> > > userspace to pass in a single file fd.  Scrub can reject it if it
> > > doesn't measure up (fs is the same, sparse not supported, high offsets
> > > not supported, etc.).  If userspace doesn't pass in an fd then we create
> > > a memfd and use that instead.  We end up with a hybrid between (3) and (5).
> > > 
> > 
> > That all sounds about right to me except I was thinking userspace would
> > do the memfd fallback of #5 rather than the kernel, just to keep the
> > policy out of the kernel as much as possible. Is there any major
> > advantage to doing it in the kernel? I guess it would slightly
> > complicate 'xfs_io -c repair' ...
> 
> Hm.  We'll have to use one of the reserved areas of struct
> xfs_scrub_metadata to pass in the file descriptor.  If we create a new
> XFS_SCRUB_IFLAG_FD flag to indicate that we're passing in a file
> descriptor then either we lose compatibility with old kernels (because
> they reject unknown flags) or xfs_scrub will have to try a repair
> without a fd (to see if the kernel even cares) and retry if the repair
> fails with some prearranged error code that means "give me a swapfile,
> please".  Alternately we simply require that the fd cannot be fd 0 since
> using stdin for swap space is a stupid idea anyways.
> 

I'm assuming that the kernel would have some basic checks on the fd to
ensure it's usable (seekable, large offsets, etc.), as you mentioned
previously.

With regard to xfs_scrub_metadata, it sounds like we need to deal with
that regardless if we want to support the ability to specify an external
file. Is the issue backwards compatibility with the interface as it
exists today..?

> Technically we're not supposed to have flag days, but otoh this is a
> xfs-only ioctl for a feature that's still experimental, so perhaps it's
> not crucial to maintain compatibility with old kernels where the feature
> is incomplete and experimental?
> 

In my mind, I kind of take the experimental status as all bits/interface
may explode and are otherwise subject to change or disappear. Perhaps
others feel differently, it does seem we've kind of hinted towards the
contrary recently with respect to the per-inode dax bits and then now in
this discussion, but IMO that's kind of an inherent risk of doing
incremental work on complex features upstream.

I dunno, perhaps that's just a misunderstanding on my part. If so, I do
wonder if we should be a bit more cautious (in the future) about
exposing interfaces to experimental features (DEBUG mode only, for
example) for a period of time until the underlying mechanism is fleshed
out enough to establish confidence in the interface. It's one thing if
an experimental feature is shiny new and potentially unstable at the
time it is merged, but enough bits are there for reviewers to understand
the design and interface requirements. It's another thing if the
implementation is not yet complete, because then it's obviously harder
to surmise whether the interface is ultimately sufficient.

This of course is all higher level discussion from how to handle scrub..

> Hmm.  We could define the fd field with the requirement that fd > 0, and
> if the repair function requires an fd and one hasn't been provided, it
> can fail out with ENOMEM.  If it doesn't need extra memory it can just
> ignore the contents of the fd field.  xfs_scrub can then arrange to pass
> in mem fds or file fds or whatever.
> 

Is there a versioning mechanism to the interface? I thought we used that
approach (or planned to..) in other similar internal commands, so a
particular kernel could bump the version and appropriately decide how to
handle older versions.

Brian

> --D
> 
> > Brian
> > 
> > > --D
> > > 
> > > > Brian
> > > > 
> > > > > (In theory the xbitmap could also be converted to use the fixed record
> > > > > array, but in practice they haven't (yet) become large enough to warrant
> > > > > it, and there's currently no way to insert or delete records from the
> > > > > middle of the array.)
> > > > > 
> > > > > > > > > > I'm not familiar with memfd. The manpage suggests it's ram backed, is it
> > > > > > > > > > swappable or something?
> > > > > > > > > 
> > > > > > > > > It's supposed to be.  The quick test I ran (allocate a memfd, write 1GB
> > > > > > > > > of junk to it on a VM with 400M of RAM) seemed to push about 980MB into
> > > > > > > > > the swap file.
> > > > > > > > > 
> > > > > > > > 
> > > > > > > > Ok.
> > > > > > > > 
> > > > > > > > > > If so, that sounds a reasonable option provided the swap space
> > > > > > > > > > requirement can be made clear to users
> > > > > > > > > 
> > > > > > > > > We can document it.  I don't think it's any worse than xfs_repair being
> > > > > > > > > able to use up all the memory + swap... and since we're probably only
> > > > > > > > > going to be repairing one thing at a time, most likely scrub won't need
> > > > > > > > > as much memory.
> > > > > > > > > 
> > > > > > > > 
> > > > > > > > Right, but as noted below, my concerns with the xfs_repair comparison
> > > > > > > > are that 1.) the kernel generally has more of a limit on anonymous
> > > > > > > > memory allocations than userspace (i.e., not swappable AFAIU?) and 2.)
> > > > > > > > it's not clear how effectively running the system out of memory via the
> > > > > > > > kernel will behave from a failure perspective.
> > > > > > > > 
> > > > > > > > IOW, xfs_repair can run the system out of memory but for the most part
> > > > > > > > that ends up being a simple problem for the system: OOM kill the bloated
> > > > > > > > xfs_repair process. For an online repair in a similar situation, I have
> > > > > > > > no idea what's going to happen.
> > > > > > > 
> > > > > > > Back in the days of the huge linked lists the oom killer would target
> > > > > > > > other processes because it doesn't know that the online repair thread is
> > > > > > > sitting on a ton of pinned kernel memory...
> > > > > > > 
> > > > > > 
> > > > > > Makes sense, kind of what I'd expect...
> > > > > > 
> > > > > > > > The hope is that the online repair hits -ENOMEM and unwinds, but ISTM
> > > > > > > > we'd still be at risk of other subsystems running into memory
> > > > > > > > allocation problems, filling up swap, the OOM killer going after
> > > > > > > > unrelated processes, etc.  What if, for example, the OOM killer starts
> > > > > > > > picking off processes in service to a running online repair that
> > > > > > > > immediately consumes freed up memory until the system is borked?
> > > > > > > 
> > > > > > > Yeah.  One thing we /could/ do is register an oom notifier that would
> > > > > > > urge any running repair threads to bail out if they can.  It seems to me
> > > > > > > that the oom killer blocks on the oom_notify_list chain, so our handler
> > > > > > > could wait until at least one thread exits before returning.
> > > > > > > 
> > > > > > 
> > > > > > Ok, something like that could be useful. I agree that we probably don't
> > > > > > need to go that far until the mechanism is nailed down and testing shows
> > > > > > that OOM is a problem.
> > > > > 
> > > > > It already is a problem on my contrived "2TB hardlink/reflink farm fs" +
> > > > > "400M of RAM and no swap" scenario.  Granted, pretty much every other
> > > > > xfs utility also blows out on that so I'm not sure how hard I really
> > > > > need to try...
> > > > > 
> > > > > > > > I don't know how likely that is or if it really ends up much different
> > > > > > > > from the analogous xfs_repair situation. My only point right now is
> > > > > > > > that failure scenario is something we should explore for any solution
> > > > > > > > we ultimately consider because it may be an unexpected use case of the
> > > > > > > > underlying mechanism.
> > > > > > > 
> > > > > > > Ideally, online repair would always be the victim since we know we have
> > > > > > > a reasonable fallback.  At least for memfd, however, I think the only
> > > > > > > clues we have to decide the question "is this memfd getting in the way
> > > > > > > of other threads?" is either seeing ENOMEM, short writes, or getting
> > > > > > > kicked by an oom notification.  Maybe that'll be enough?
> > > > > > > 
> > > > > > 
> > > > > > Hm, yeah. It may be challenging to track memfd usage as such. If
> > > > > > userspace has access to the fd on an OOM notification or whatever, it
> > > > > > might be able to do more accurate analysis based on an fstat() or
> > > > > > something.
> > > > > > 
> > > > > > Related question... is the online repair sequence currently
> > > > > > interruptible, if xfs_scrub receives a fatal signal while pulling in
> > > > > > entries during an allocbt scan for example?
> > > > > 
> > > > > It's interruptible (fatal signals only) during the scan phase, but once
> > > > > it starts logging metadata updates it will run all the way to
> > > > > completion.
> > > > > 
> > > > > > > > (To the contrary, just using a cached file seems a natural fit from
> > > > > > > > that perspective.)
> > > > > > > 
> > > > > > > Same here.
> > > > > > > 
> > > > > > > > > > and the failure characteristics aren't more severe than for userspace.
> > > > > > > > > > An online repair that puts the broader system at risk of OOM as
> > > > > > > > > > opposed to predictably failing gracefully may not be the most useful
> > > > > > > > > > tool.
> > > > > > > > > 
> > > > > > > > > Agreed.  One huge downside of memfd seems to be the lack of a mechanism
> > > > > > > > > for the vm to push back on us if we successfully write all we need to
> > > > > > > > > the memfd but then other processes need some memory.  Obviously, if the
> > > > > > > > > memfd write itself comes up short or fails then we dump the memfd and
> > > > > > > > > error back to userspace.  We might simply have to free array memory
> > > > > > > > > while we iterate the records to minimize the time spent at peak memory
> > > > > > > > > usage.
> > > > > > > > > 
> > > > > > > > 
> > > > > > > > Hm, yeah. Some kind of fixed/relative size in-core memory pool approach
> > > > > > > > may simplify things because we could allocate it up front and know right
> > > > > > > > away whether we just don't have enough memory available to repair.
> > > > > > > 
> > > > > > > Hmm.  Apparently we actually /can/ call fallocate on memfd to grab all
> > > > > > > the pages at once, provided we have some guesstimate beforehand of how
> > > > > > > much space we think we'll need.
> > > > > > > 
> > > > > > > So long as my earlier statement about the memory requirements being no
> > > > > > > more than the size of the btree leaves is actually true (I haven't
> > > > > > > rigorously tried to prove it), we need about (xrep_calc_ag_resblks() *
> > > > > > > blocksize) worth of space in the memfd file.  Maybe we ask for 1.5x
> > > > > > > that and if we don't get it, we kill the memfd and exit.
> > > > > > > 
> > > > > > 
> > > > > > Indeed. It would be nice if we could do all of the file management bits
> > > > > > in userspace.
> > > > > 
> > > > > Agreed, though no file management would be even better. :)
> > > > > 
> > > > > --D
> > > > > 
> > > > > > Brian
> > > > > > 
> > > > > > > --D
> > > > > > > 
> > > > > > > > 
> > > > > > > > Brian
> > > > > > > > 
> > > > > > > > > --D
> > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > Brian
> > > > > > > > > > 
> > > > > > > > > > > --D
> > > > > > > > > > > 
> > > > > > > > > > > > Brian
> > > > > > > > > > > > 
> > > > > > > > > > > > > --D
> > > > > > > > > > > > > 
> > > > > > > > > > > > > > Brian
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > --D
> > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > Brian
> > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > > +done:
> > > > > > > > > > > > > > > > > > > +	/* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */
> > > > > > > > > > > > > > > > > > > +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> > > > > > > > > > > > > > > > > > > +	return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo,
> > > > > > > > > > > > > > > > > > > +			XFS_AG_RESV_NONE);
> > > > > > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > ...
> > > > > > > > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > > > > > > > > > index 0ed68379e551..82f99633a597 100644
> > > > > > > > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > > > > > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp(
> > > > > > > > > > > > > > > > > > >  		diff = b1->bno - b2->bno;
> > > > > > > > > > > > > > > > > > >  	return diff;
> > > > > > > > > > > > > > > > > > >  }
> > > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > > +/* Are there any busy extents in this AG? */
> > > > > > > > > > > > > > > > > > > +bool
> > > > > > > > > > > > > > > > > > > +xfs_extent_busy_list_empty(
> > > > > > > > > > > > > > > > > > > +	struct xfs_perag	*pag)
> > > > > > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > > > > > +	spin_lock(&pag->pagb_lock);
> > > > > > > > > > > > > > > > > > > +	if (pag->pagb_tree.rb_node) {
> > > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > > RB_EMPTY_ROOT()?
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > Good suggestion, thank you!
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > --D
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > > Brian
> > > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > > > +		spin_unlock(&pag->pagb_lock);
> > > > > > > > > > > > > > > > > > > +		return false;
> > > > > > > > > > > > > > > > > > > +	}
> > > > > > > > > > > > > > > > > > > +	spin_unlock(&pag->pagb_lock);
> > > > > > > > > > > > > > > > > > > +	return true;
> > > > > > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > > > > > > > > > index 990ab3891971..2f8c73c712c6 100644
> > > > > > > > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > > > > > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
> > > > > > > > > > > > > > > > > > >  	list_sort(NULL, list, xfs_extent_busy_ag_cmp);
> > > > > > > > > > > > > > > > > > >  }
> > > > > > > > > > > > > > > > > > >  
> > > > > > > > > > > > > > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
> > > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > >  #endif /* __XFS_EXTENT_BUSY_H__ */
> > > > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > > > --
> > > > > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > > > > > > > --
> > > > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > > > > > > --
> > > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > > > > > --
> > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > > > --
> > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > > --
> > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > --
> > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > --
> > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > --
> > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > --
> > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > --
> > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > --
> > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > --
> > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > --
> > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > the body of a message to majordomo@vger.kernel.org
> > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Aug. 11, 2018, 3:48 p.m. UTC | #20
On Sat, Aug 11, 2018 at 08:50:49AM -0400, Brian Foster wrote:
> On Fri, Aug 10, 2018 at 12:36:51PM -0700, Darrick J. Wong wrote:
> > On Fri, Aug 10, 2018 at 03:07:40PM -0400, Brian Foster wrote:
> > > On Fri, Aug 10, 2018 at 08:39:44AM -0700, Darrick J. Wong wrote:
> > > > On Fri, Aug 10, 2018 at 06:33:52AM -0400, Brian Foster wrote:
> > > > > On Thu, Aug 09, 2018 at 08:59:59AM -0700, Darrick J. Wong wrote:
> > > > > > On Thu, Aug 09, 2018 at 08:00:28AM -0400, Brian Foster wrote:
> > > > > > > On Wed, Aug 08, 2018 at 03:42:32PM -0700, Darrick J. Wong wrote:
> > > > > > > > On Wed, Aug 08, 2018 at 08:29:54AM -0400, Brian Foster wrote:
> > > > > > > > > On Tue, Aug 07, 2018 at 04:34:58PM -0700, Darrick J. Wong wrote:
> > > > > > > > > > On Fri, Aug 03, 2018 at 06:49:40AM -0400, Brian Foster wrote:
> > > > > > > > > > > On Thu, Aug 02, 2018 at 12:22:05PM -0700, Darrick J. Wong wrote:
> > > > > > > > > > > > On Thu, Aug 02, 2018 at 09:48:24AM -0400, Brian Foster wrote:
> > > > > > > > > > > > > On Wed, Aug 01, 2018 at 11:28:45PM -0700, Darrick J. Wong wrote:
> > > > > > > > > > > > > > On Wed, Aug 01, 2018 at 02:39:20PM -0400, Brian Foster wrote:
> > > > > > > > > > > > > > > On Wed, Aug 01, 2018 at 09:23:16AM -0700, Darrick J. Wong wrote:
> > > > > > > > > > > > > > > > On Wed, Aug 01, 2018 at 07:54:09AM -0400, Brian Foster wrote:
> > > > > > > > > > > > > > > > > On Tue, Jul 31, 2018 at 03:01:25PM -0700, Darrick J. Wong wrote:
> > > > > > > > > > > > > > > > > > On Tue, Jul 31, 2018 at 01:47:23PM -0400, Brian Foster wrote:
> > > > > > > > > > > > > > > > > > > On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote:
> ...
> > > > 
> > > > Hmm, ok, so to summarize, I see five options:
> > > > 
> > > > 1) Pass in a dirfd, repair can internally openat(dirfd, O_TMPFILE...)
> > > > however many files it needs.
> > > > 
> > > > 2) Pass in however many file fds we need and segment the space.
> > > > 
> > > > 3) Pass in a single file fd.
> > > > 
> > > > 4) Let the repair code create as many memfd files as it wants.
> > > > 
> > > > 5) Let the repair code create one memfd file and segment the space.
> > > > 
> > > > I'm pretty sure we don't want to support (2) because that just seems
> > > > like a requirements communication nightmare and can burn up a lot of
> > > > space in struct xfs_scrub_metadata.
> > > > 
> > > > (3) and (5) are basically the same except for where the file comes from.
> > > > For (3) we'd have to make sure the fd filesystem supports large sparse
> > > > files (and presumably isn't the xfs we're trying to repair), which
> > > > shouldn't be too difficult to probe.  For (5) we know that tmpfs already
> > > > supports large sparse files.  Another difficulty might be that on 32-bit
> > > > the page cache only supports offsets as high as (ULONG_MAX * PAGE_SIZE),
> > > > though I suppose at this point we only need two files and 8TB should be
> > > > enough for anyone.
> > > > 
> > > > (I also think it's reasonable to consider not supporting online repair
> > > > on a 32-bit system with a large filesystem...)
> > > > 
> > > > In general, the "pass in a thing from userspace" variants come with the
> > > > complication that we have to check the functionality of whatever gets
> > > > passed in.  On the plus side it likely unlocks access to a lot more
> > > > storage than we could get with mem+swap.  On the minus side someone
> > > > passes in a fd to a drive-managed SMR on USB 2.0, and...
> > > > 
> > > > (1) seems like it would maximize the kernel's flexibility to create as
> > > > many (regular, non-sparse) files as it needs, but now we're calling
> > > > do_sys_open and managing files ourselves, which might be avoided.
> > > > 
> > > > (4) of course is what we do right now. :)
> > > > 
> > > > Soooo... the simplest userspace interface (I think) is to allow
> > > > userspace to pass in a single file fd.  Scrub can reject it if it
> > > > doesn't measure up (fs is the same, sparse not supported, high offsets
> > > > not supported, etc.).  If userspace doesn't pass in an fd then we create
> > > > a memfd and use that instead.  We end up with a hybrid between (3) and (5).
> > > > 
> > > 
> > > That all sounds about right to me except I was thinking userspace would
> > > do the memfd fallback of #5 rather than the kernel, just to keep the
> > > policy out of the kernel as much as possible. Is there any major
> > > advantage to doing it in the kernel? I guess it would slightly
> > > complicate 'xfs_io -c repair' ...
> > 
> > Hm.  We'll have to use one of the reserved areas of struct
> > xfs_scrub_metadata to pass in the file descriptor.  If we create a new
> > XFS_SCRUB_IFLAG_FD flag to indicate that we're passing in a file
> > descriptor then either we lose compatibility with old kernels (because
> > they reject unknown flags) or xfs_scrub will have to try a repair
> > without a fd (to see if the kernel even cares) and retry if the repair
> > fails with some prearranged error code that means "give me a swapfile,
> > please".  Alternately we simply require that the fd cannot be fd 0 since
> > using stdin for swap space is a stupid idea anyways.
> > 
> 
> I'm assuming that the kernel would have some basic checks on the fd to
> ensure it's usable (seekable, large offsets, etc.), as you mentioned
> previously.

Of course. :)

> With regard to xfs_scrub_metadata, it sounds like we need to deal with
> that regardless if we want to support the ability to specify an external
> file. Is the issue backwards compatibility with the interface as it
> exists today..?

Yes, my question is how hard do we try to maintain backwards
compatibility with an ioctl that controls an EXPERIMENTAL feature that
is disabled by default in Kconfig? :)

> > Technically we're not supposed to have flag days, but otoh this is a
> > xfs-only ioctl for a feature that's still experimental, so perhaps it's
> > not crucial to maintain compatibility with old kernels where the feature
> > is incomplete and experimental?
> > 
> 
> In my mind, I kind of take the experimental status as all bits/interface
> may explode and are otherwise subject to change or disappear. Perhaps
> others feel differently, it does seem we've kind of hinted towards the
> contrary recently with respect to the per-inode dax bits and then now in
> this discussion, but IMO that's kind of an inherent risk of doing
> incremental work on complex features upstream.
> 
> I dunno, perhaps that's just a misunderstanding on my part. If so, I do
> wonder if we should be a bit more cautious (in the future) about
> exposing interfaces to experimental features (DEBUG mode only, for
> example) for a period of time until the underlying mechanism is fleshed
> out enough to establish confidence in the interface.

That was my reason for hiding it all behind a 'default N' Kconfig
option -- to limit the number of users to those who build their own
kernels.

> It's one thing if an experimental feature is shiny new and potentially
> unstable at the time it is merged, but enough bits are there for
> reviewers to understand the design and interface requirements. It's
> another thing if the implementation is not yet complete, because then
> it's obviously harder to surmise whether the interface is ultimately
> sufficient.

<nod> I decided that if we left experimental warnings in dmesg and
the xfs_scrub output and forced users to rebuild their kernel to turn on
scrub/repair then it was reasonable that we could change the ioctl
interface without worrying too much about backwards compatibility.

I think it's fine to add a 's32 sm_fd' field that can't be zero and can
be picked up by scrub or repair if they want access to more space.

> This of course is all higher level discussion from how to handle scrub..
> 
> > Hmm.  We could define the fd field with the requirement that fd > 0, and
> > if the repair function requires an fd and one hasn't been provided, it
> > can fail out with ENOMEM.  If it doesn't need extra memory it can just
> > ignore the contents of the fd field.  xfs_scrub can then arrange to pass
> > in mem fds or file fds or whatever.
> > 
> 
> Is there a versioning mechanism to the interface? I thought we used that
> approach (or planned to..) in other similar internal commands, so a
> particular kernel could bump the version and appropriately decide how to
> handle older versions.

There's plenty of space in the structure that's all required to be zero,
so we could easily add a u8 sm_version some day.  The IFLAG bit I
mentioned would be sufficient for the fd field.

--D

> 
> Brian
> 
> > --D
> > 
> > > Brian
> > > 
> > > > --D
> > > > 
> > > > > Brian
> > > > > 
> > > > > > (In theory the xbitmap could also be converted to use the fixed record
> > > > > > array, but in practice they haven't (yet) become large enough to warrant
> > > > > > it, and there's currently no way to insert or delete records from the
> > > > > > middle of the array.)
> > > > > > 
> > > > > > > > > > > I'm not familiar with memfd. The manpage suggests it's ram backed, is it
> > > > > > > > > > > swappable or something?
> > > > > > > > > > 
> > > > > > > > > > It's supposed to be.  The quick test I ran (allocate a memfd, write 1GB
> > > > > > > > > > of junk to it on a VM with 400M of RAM) seemed to push about 980MB into
> > > > > > > > > > the swap file.
> > > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > Ok.
> > > > > > > > > 
> > > > > > > > > > > If so, that sounds a reasonable option provided the swap space
> > > > > > > > > > > requirement can be made clear to users
> > > > > > > > > > 
> > > > > > > > > > We can document it.  I don't think it's any worse than xfs_repair being
> > > > > > > > > > able to use up all the memory + swap... and since we're probably only
> > > > > > > > > > going to be repairing one thing at a time, most likely scrub won't need
> > > > > > > > > > as much memory.
> > > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > Right, but as noted below, my concerns with the xfs_repair comparison
> > > > > > > > > are that 1.) the kernel generally has more of a limit on anonymous
> > > > > > > > > memory allocations than userspace (i.e., not swappable AFAIU?) and 2.)
> > > > > > > > > it's not clear how effectively running the system out of memory via the
> > > > > > > > > kernel will behave from a failure perspective.
> > > > > > > > > 
> > > > > > > > > IOW, xfs_repair can run the system out of memory but for the most part
> > > > > > > > > that ends up being a simple problem for the system: OOM kill the bloated
> > > > > > > > > xfs_repair process. For an online repair in a similar situation, I have
> > > > > > > > > no idea what's going to happen.
> > > > > > > > 
> > > > > > > > Back in the days of the huge linked lists the oom killer would target
> > > > > > > > other processes because it doesn't know that the online repair thread is
> > > > > > > > sitting on a ton of pinned kernel memory...
> > > > > > > > 
> > > > > > > 
> > > > > > > Makes sense, kind of what I'd expect...
> > > > > > > 
> > > > > > > > > The hope is that the online repair hits -ENOMEM and unwinds, but ISTM
> > > > > > > > > we'd still be at risk of other subsystems running into memory
> > > > > > > > > allocation problems, filling up swap, the OOM killer going after
> > > > > > > > > unrelated processes, etc.  What if, for example, the OOM killer starts
> > > > > > > > > picking off processes in service to a running online repair that
> > > > > > > > > immediately consumes freed up memory until the system is borked?
> > > > > > > > 
> > > > > > > > Yeah.  One thing we /could/ do is register an oom notifier that would
> > > > > > > > urge any running repair threads to bail out if they can.  It seems to me
> > > > > > > > that the oom killer blocks on the oom_notify_list chain, so our handler
> > > > > > > > could wait until at least one thread exits before returning.
> > > > > > > > 
> > > > > > > 
> > > > > > > Ok, something like that could be useful. I agree that we probably don't
> > > > > > > need to go that far until the mechanism is nailed down and testing shows
> > > > > > > that OOM is a problem.
> > > > > > 
> > > > > > It already is a problem on my contrived "2TB hardlink/reflink farm fs" +
> > > > > > "400M of RAM and no swap" scenario.  Granted, pretty much every other
> > > > > > xfs utility also blows out on that so I'm not sure how hard I really
> > > > > > need to try...
> > > > > > 
> > > > > > > > > I don't know how likely that is or if it really ends up much different
> > > > > > > > > from the analogous xfs_repair situation. My only point right now is
> > > > > > > > > that failure scenario is something we should explore for any solution
> > > > > > > > > we ultimately consider because it may be an unexpected use case of the
> > > > > > > > > underlying mechanism.
> > > > > > > > 
> > > > > > > > Ideally, online repair would always be the victim since we know we have
> > > > > > > > a reasonable fallback.  At least for memfd, however, I think the only
> > > > > > > > clues we have to decide the question "is this memfd getting in the way
> > > > > > > > of other threads?" is either seeing ENOMEM, short writes, or getting
> > > > > > > > kicked by an oom notification.  Maybe that'll be enough?
> > > > > > > > 
> > > > > > > 
> > > > > > > Hm, yeah. It may be challenging to track memfd usage as such. If
> > > > > > > userspace has access to the fd on an OOM notification or whatever, it
> > > > > > > might be able to do more accurate analysis based on an fstat() or
> > > > > > > something.
> > > > > > > 
> > > > > > > Related question... is the online repair sequence currently
> > > > > > > interruptible, if xfs_scrub receives a fatal signal while pulling in
> > > > > > > entries during an allocbt scan for example?
> > > > > > 
> > > > > > It's interruptible (fatal signals only) during the scan phase, but once
> > > > > > it starts logging metadata updates it will run all the way to
> > > > > > completion.
> > > > > > 
> > > > > > > > > (To the contrary, just using a cached file seems a natural fit from
> > > > > > > > > that perspective.)
> > > > > > > > 
> > > > > > > > Same here.
> > > > > > > > 
> > > > > > > > > > > and the failure characteristics aren't more severe than for userspace.
> > > > > > > > > > > An online repair that puts the broader system at risk of OOM as
> > > > > > > > > > > opposed to predictably failing gracefully may not be the most useful
> > > > > > > > > > > tool.
> > > > > > > > > > 
> > > > > > > > > > Agreed.  One huge downside of memfd seems to be the lack of a mechanism
> > > > > > > > > > for the vm to push back on us if we successfully write all we need to
> > > > > > > > > > the memfd but then other processes need some memory.  Obviously, if the
> > > > > > > > > > memfd write itself comes up short or fails then we dump the memfd and
> > > > > > > > > > error back to userspace.  We might simply have to free array memory
> > > > > > > > > > while we iterate the records to minimize the time spent at peak memory
> > > > > > > > > > usage.
> > > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > Hm, yeah. Some kind of fixed/relative size in-core memory pool approach
> > > > > > > > > may simplify things because we could allocate it up front and know right
> > > > > > > > > away whether we just don't have enough memory available to repair.
> > > > > > > > 
> > > > > > > > Hmm.  Apparently we actually /can/ call fallocate on memfd to grab all
> > > > > > > > the pages at once, provided we have some guesstimate beforehand of how
> > > > > > > > much space we think we'll need.
> > > > > > > > 
> > > > > > > > So long as my earlier statement about the memory requirements being no
> > > > > > > > more than the size of the btree leaves is actually true (I haven't
> > > > > > > > rigorously tried to prove it), we need about (xrep_calc_ag_resblks() *
> > > > > > > > blocksize) worth of space in the memfd file.  Maybe we ask for 1.5x
> > > > > > > > that and if we don't get it, we kill the memfd and exit.
> > > > > > > > 
> > > > > > > 
> > > > > > > Indeed. It would be nice if we could do all of the file management bits
> > > > > > > in userspace.
> > > > > > 
> > > > > > Agreed, though no file management would be even better. :)
> > > > > > 
> > > > > > --D
> > > > > > 
> > > > > > > Brian
> > > > > > > 
> > > > > > > > --D
> > > > > > > > 
> > > > > > > > > 
> > > > > > > > > Brian
> > > > > > > > > 
> > > > > > > > > > --D
> > > > > > > > > > 
> > > > > > > > > > > 
> > > > > > > > > > > Brian
> > > > > > > > > > > 
> > > > > > > > > > > > --D
> > > > > > > > > > > > 
> > > > > > > > > > > > > Brian
> > > > > > > > > > > > > 
> > > > > > > > > > > > > > --D
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > Brian
> > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > --D
> > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > Brian
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > > > +done:
> > > > > > > > > > > > > > > > > > > > +	/* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */
> > > > > > > > > > > > > > > > > > > > +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> > > > > > > > > > > > > > > > > > > > +	return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo,
> > > > > > > > > > > > > > > > > > > > +			XFS_AG_RESV_NONE);
> > > > > > > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > > ...
> > > > > > > > > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > > > > > > > > > > index 0ed68379e551..82f99633a597 100644
> > > > > > > > > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.c
> > > > > > > > > > > > > > > > > > > > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp(
> > > > > > > > > > > > > > > > > > > >  		diff = b1->bno - b2->bno;
> > > > > > > > > > > > > > > > > > > >  	return diff;
> > > > > > > > > > > > > > > > > > > >  }
> > > > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > > > +/* Are there any busy extents in this AG? */
> > > > > > > > > > > > > > > > > > > > +bool
> > > > > > > > > > > > > > > > > > > > +xfs_extent_busy_list_empty(
> > > > > > > > > > > > > > > > > > > > +	struct xfs_perag	*pag)
> > > > > > > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > > > > > > +	spin_lock(&pag->pagb_lock);
> > > > > > > > > > > > > > > > > > > > +	if (pag->pagb_tree.rb_node) {
> > > > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > > > RB_EMPTY_ROOT()?
> > > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > > Good suggestion, thank you!
> > > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > > --D
> > > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > > > Brian
> > > > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > > > > +		spin_unlock(&pag->pagb_lock);
> > > > > > > > > > > > > > > > > > > > +		return false;
> > > > > > > > > > > > > > > > > > > > +	}
> > > > > > > > > > > > > > > > > > > > +	spin_unlock(&pag->pagb_lock);
> > > > > > > > > > > > > > > > > > > > +	return true;
> > > > > > > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > > > > > > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > > > > > > > > > > index 990ab3891971..2f8c73c712c6 100644
> > > > > > > > > > > > > > > > > > > > --- a/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > > > > > > > > > > +++ b/fs/xfs/xfs_extent_busy.h
> > > > > > > > > > > > > > > > > > > > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
> > > > > > > > > > > > > > > > > > > >  	list_sort(NULL, list, xfs_extent_busy_ag_cmp);
> > > > > > > > > > > > > > > > > > > >  }
> > > > > > > > > > > > > > > > > > > >  
> > > > > > > > > > > > > > > > > > > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
> > > > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > > >  #endif /* __XFS_EXTENT_BUSY_H__ */
> > > > > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > > > > --
> > > > > > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > > > > > > > > --
> > > > > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > > > > > > > --
> > > > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > > > > > > --
> > > > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > > > > --
> > > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > > > --
> > > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > > --
> > > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > > --
> > > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > > --
> > > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > > --
> > > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > > --
> > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > --
> > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > --
> > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > --
> > > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner Aug. 13, 2018, 2:46 a.m. UTC | #21
On Sat, Aug 11, 2018 at 08:50:49AM -0400, Brian Foster wrote:
> On Fri, Aug 10, 2018 at 12:36:51PM -0700, Darrick J. Wong wrote:
> > Technically we're not supposed to have flag days, but otoh this is a
> > xfs-only ioctl for a feature that's still experimental, so perhaps it's
> > not crucial to maintain compatibility with old kernels where the feature
> > is incomplete and experimental?
> > 
> 
> In my mind, I kind of take the experimental status as all bits/interface
> may explode and are otherwise subject to change or disappear. Perhaps
> others feel differently, it does seem we've kind of hinted towards the
> contrary recently with respect to the per-inode dax bits and then now in
> this discussion, but IMO that's kind of an inherent risk of doing
> incremental work on complex features upstream.

I've always considered that the experimental tag covers the
user/ioctl interfaces as much as it does the functionality and
on-disk format. i.e. like the on-disk format, the ioctl interfaces
are subject to change until we clear the exp. tag, at which point
they are essentially fixed forever. We /try/ not to have to change
them after the initial merge, but sometimes we screw up and need to
fix them before we commit to long term support.

Cheers,

Dave.
diff mbox series

Patch

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 57ec46951ede..44ddd112acd2 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -164,6 +164,7 @@  xfs-$(CONFIG_XFS_QUOTA)		+= scrub/quota.o
 ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y)
 xfs-y				+= $(addprefix scrub/, \
 				   agheader_repair.o \
+				   alloc_repair.o \
 				   bitmap.o \
 				   repair.o \
 				   )
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index 036b5c7021eb..c9b34ba312ab 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -15,7 +15,6 @@ 
 #include "xfs_log_format.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_alloc.h"
 #include "xfs_rmap.h"
 #include "xfs_alloc.h"
 #include "scrub/xfs_scrub.h"
diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
new file mode 100644
index 000000000000..b228c2906de2
--- /dev/null
+++ b/fs/xfs/scrub/alloc_repair.c
@@ -0,0 +1,581 @@ 
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2018 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_refcount.h"
+#include "xfs_extent_busy.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+
+/*
+ * Free Space Btree Repair
+ * =======================
+ *
+ * The reverse mappings are supposed to record all space usage for the entire
+ * AG.  Therefore, we can recalculate the free extents in an AG by looking for
+ * gaps in the physical extents recorded in the rmapbt.  On a reflink
+ * filesystem this is a little more tricky in that we have to be aware that
+ * the rmap records are allowed to overlap.
+ *
+ * We derive which blocks belonged to the old bnobt/cntbt by recording all the
+ * OWN_AG extents and subtracting out the blocks owned by all other OWN_AG
+ * metadata: the rmapbt blocks visited while iterating the reverse mappings
+ * and the AGFL blocks.
+ *
+ * Once we have both of those pieces, we can reconstruct the bnobt and cntbt
+ * by blowing out the free block state and freeing all the extents that we
+ * found.  This adds the requirement that we can't have any busy extents in
+ * the AG because the busy code cannot handle duplicate records.
+ *
+ * Note that we can only rebuild both free space btrees at the same time
+ * because the regular extent freeing infrastructure loads both btrees at the
+ * same time.
+ *
+ * We use the prefix 'xrep_abt' here because we regenerate both free space
+ * allocation btrees at the same time.
+ */
+
+struct xrep_abt_extent {
+	struct list_head	list;	/* link in the free extent list */
+	xfs_agblock_t		bno;	/* first AG block of the extent */
+	xfs_extlen_t		len;	/* length of the extent, in blocks */
+};
+
+struct xrep_abt {
+	/*
+	 * Blocks used by the rmapbt (recorded from the cursor during the
+	 * walk) or by the AGFL.  These are subtracted from btlist once both
+	 * walks are done.
+	 */
+	struct xfs_bitmap	nobtlist;
+
+	/*
+	 * All OWN_AG blocks seen in the rmap records.  Points at the
+	 * caller's bitmap so the result outlives this structure.
+	 */
+	struct xfs_bitmap	*btlist;
+
+	/* Free space extents; points at the caller's list head. */
+	struct list_head	*extlist;
+
+	struct xfs_scrub	*sc;	/* scrub context; supplies the AG number */
+
+	/* Number of records added to extlist. */
+	uint64_t		nr_records;
+
+	/*
+	 * Next block we anticipate seeing in the rmap records.  If the next
+	 * rmap record starts after next_bno, we have found unused space.
+	 */
+	xfs_agblock_t		next_bno;
+
+	/* Total number of blocks in the extents added to extlist. */
+	xfs_agblock_t		nr_blocks;
+};
+
+/*
+ * Rmap query callback.  Note the OWN_AG extents and the rmapbt blocks that
+ * the cursor is visiting, and turn any gap in front of this record into a
+ * free space extent.
+ */
+STATIC int
+xrep_abt_walk_rmap(
+	struct xfs_btree_cur	*cur,
+	struct xfs_rmap_irec	*rec,
+	void			*priv)
+{
+	struct xrep_abt		*ra = priv;
+	struct xrep_abt_extent	*gap;
+	xfs_agblock_t		rec_end;
+	int			error;
+
+	/* Remember every block that is owned by the AG itself... */
+	if (rec->rm_owner == XFS_RMAP_OWN_AG) {
+		error = xfs_bitmap_set(ra->btlist,
+				XFS_AGB_TO_FSB(cur->bc_mp,
+					cur->bc_private.a.agno,
+					rec->rm_startblock),
+				rec->rm_blockcount);
+		if (error)
+			return error;
+	}
+
+	/* ...the rmapbt blocks the cursor is currently sitting on... */
+	error = xfs_bitmap_set_btcur_path(&ra->nobtlist, cur);
+	if (error)
+		return error;
+
+	/* ...and the unmapped space in front of this record, if any. */
+	if (ra->next_bno < rec->rm_startblock) {
+		trace_xrep_abt_walk_rmap(cur->bc_mp, cur->bc_private.a.agno,
+				ra->next_bno, rec->rm_startblock - ra->next_bno,
+				XFS_RMAP_OWN_NULL, 0, 0);
+
+		gap = kmem_alloc(sizeof(struct xrep_abt_extent), KM_MAYFAIL);
+		if (!gap)
+			return -ENOMEM;
+		INIT_LIST_HEAD(&gap->list);
+		gap->bno = ra->next_bno;
+		gap->len = rec->rm_startblock - ra->next_bno;
+		list_add_tail(&gap->list, ra->extlist);
+		ra->nr_records++;
+		ra->nr_blocks += gap->len;
+	}
+
+	/* Records may overlap, so only ever move next_bno forward. */
+	rec_end = rec->rm_startblock + rec->rm_blockcount;
+	if (rec_end > ra->next_bno)
+		ra->next_bno = rec_end;
+	return 0;
+}
+
+/* AGFL walk callback: add this AGFL block to the not-to-release bitmap. */
+static int
+xrep_abt_walk_agfl(
+	struct xfs_mount	*mp,
+	xfs_agblock_t		bno,
+	void			*priv)
+{
+	struct xrep_abt		*ra = priv;
+
+	return xfs_bitmap_set(&ra->nobtlist,
+			XFS_AGB_TO_FSB(mp, ra->sc->sa.agno, bno), 1);
+}
+
+/* list_sort comparator: order free space extents by ascending start block. */
+static int
+xrep_abt_extent_cmp(
+	void			*priv,
+	struct list_head	*a,
+	struct list_head	*b)
+{
+	xfs_agblock_t		a_bno;
+	xfs_agblock_t		b_bno;
+
+	a_bno = container_of(a, struct xrep_abt_extent, list)->bno;
+	b_bno = container_of(b, struct xrep_abt_extent, list)->bno;
+
+	if (a_bno == b_bno)
+		return 0;
+	return a_bno > b_bno ? 1 : -1;
+}
+
+/*
+ * Free an extent, which creates a record in the bnobt/cntbt, then roll the
+ * transaction and subtract the extent length from the fdblocks counter.
+ * Stops at the first step that fails and returns its error.
+ */
+STATIC int
+xrep_abt_free_extent(
+	struct xfs_scrub	*sc,
+	xfs_fsblock_t		fsbno,
+	xfs_extlen_t		len,
+	struct xfs_owner_info	*oinfo)
+{
+	int			error;
+
+	error = xfs_free_extent(sc->tp, fsbno, len, oinfo, 0);
+	if (!error)
+		error = xrep_roll_ag_trans(sc);
+	if (!error)
+		error = xfs_mod_fdblocks(sc->mp, -(int64_t)len, false);
+	return error;
+}
+
+/*
+ * Return the longest extent on the free list (the earliest one if several
+ * tie), or NULL if the list is empty.
+ */
+static struct xrep_abt_extent *
+xrep_abt_get_longest(
+	struct list_head	*free_extents)
+{
+	struct xrep_abt_extent	*pos;
+	struct xrep_abt_extent	*best = NULL;
+
+	list_for_each_entry(pos, free_extents, list) {
+		/* Only a strictly longer extent displaces the current best. */
+		if (best && pos->len <= best->len)
+			continue;
+		best = pos;
+	}
+	return best;
+}
+
+/*
+ * Allocate a block from the (cached) first extent in the AG.  In theory
+ * this should never fail, since we already checked that there was enough
+ * space to handle the new btrees.  Returns the allocated block, or
+ * NULLFSBLOCK if the free extent list is empty.
+ */
+STATIC xfs_fsblock_t
+xrep_abt_alloc_block(
+	struct xfs_scrub	*sc,
+	struct list_head	*free_extents)
+{
+	struct xrep_abt_extent	*ext;
+	xfs_agblock_t		agbno;
+
+	/* list_first_entry() is only meaningful on a non-empty list. */
+	if (list_empty(free_extents))
+		return NULLFSBLOCK;
+
+	/* Pull the first free space extent off the list, and... */
+	ext = list_first_entry(free_extents, struct xrep_abt_extent, list);
+
+	/*
+	 * ...take its first block.  Save the block number before shrinking
+	 * the extent, because the record is freed once it becomes empty and
+	 * must not be touched after that.
+	 */
+	agbno = ext->bno;
+	ext->bno++;
+	ext->len--;
+	if (ext->len == 0) {
+		list_del(&ext->list);
+		kmem_free(ext);
+	}
+
+	return XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, agbno);
+}
+
+/* Tear down the extent list, releasing every record still on it. */
+STATIC void
+xrep_abt_cancel_freelist(
+	struct list_head	*extlist)
+{
+	struct xrep_abt_extent	*victim;
+
+	while (!list_empty(extlist)) {
+		victim = list_first_entry(extlist, struct xrep_abt_extent,
+				list);
+		list_del(&victim->list);
+		kmem_free(victim);
+	}
+}
+
+/*
+ * Iterate all reverse mappings to find (1) the free extents, (2) the OWN_AG
+ * extents, (3) the rmapbt blocks, and (4) the AGFL blocks.  The free space is
+ * (1) + (2) - (3) - (4).  Figure out if we have enough free space to
+ * reconstruct the free space btrees.
+ *
+ * On success, @free_extents holds one record per gap found in the rmap
+ * records (plus the space between the last record and the end of the AG),
+ * and @old_allocbt_blocks holds the OWN_AG blocks left after taking out the
+ * rmapbt and AGFL blocks.  Returns 0, -ENOMEM if a record cannot be
+ * allocated, -ENOSPC if there is not enough free space to rebuild both
+ * btrees, or the error returned by one of the walks or bitmap operations.
+ * Caller must clean up the input lists if something goes wrong.
+ */
+STATIC int
+xrep_abt_find_freespace(
+	struct xfs_scrub	*sc,
+	struct list_head	*free_extents,
+	struct xfs_bitmap	*old_allocbt_blocks)
+{
+	struct xrep_abt		ra;
+	struct xrep_abt_extent	*rae;
+	struct xfs_btree_cur	*cur;
+	struct xfs_mount	*mp = sc->mp;
+	xfs_agblock_t		agend;
+	xfs_agblock_t		nr_blocks;	/* blocks needed for new btrees */
+	int			error;
+
+	ra.extlist = free_extents;
+	ra.btlist = old_allocbt_blocks;
+	xfs_bitmap_init(&ra.nobtlist);
+	ra.next_bno = 0;
+	ra.nr_records = 0;
+	ra.nr_blocks = 0;
+	ra.sc = sc;
+
+	/*
+	 * Iterate all the reverse mappings to find gaps in the physical
+	 * mappings, all the OWN_AG blocks, and all the rmapbt extents.
+	 * If the walk fails, the cursor is torn down on the exit path.
+	 */
+	cur = xfs_rmapbt_init_cursor(mp, sc->tp, sc->sa.agf_bp, sc->sa.agno);
+	error = xfs_rmap_query_all(cur, xrep_abt_walk_rmap, &ra);
+	if (error)
+		goto err;
+	xfs_btree_del_cursor(cur, error);
+	cur = NULL;	/* keeps the exit path from deleting it a second time */
+
+	/* Insert a record for space between the last rmap and EOAG. */
+	agend = be32_to_cpu(XFS_BUF_TO_AGF(sc->sa.agf_bp)->agf_length);
+	if (ra.next_bno < agend) {
+		rae = kmem_alloc(sizeof(struct xrep_abt_extent), KM_MAYFAIL);
+		if (!rae) {
+			error = -ENOMEM;
+			goto err;
+		}
+		INIT_LIST_HEAD(&rae->list);
+		rae->bno = ra.next_bno;
+		rae->len = agend - ra.next_bno;
+		list_add_tail(&rae->list, free_extents);
+		ra.nr_records++;
+		ra.nr_blocks += rae->len;
+	}
+
+	/* Collect all the AGFL blocks. */
+	error = xfs_agfl_walk(mp, XFS_BUF_TO_AGF(sc->sa.agf_bp),
+			sc->sa.agfl_bp, xrep_abt_walk_agfl, &ra);
+	if (error)
+		goto err;
+
+	/*
+	 * Do we have enough space to rebuild both freespace btrees?  Each
+	 * tree is sized for nr_records records, and the blocks have to come
+	 * out of the free space that we just found.
+	 */
+	nr_blocks = 2 * xfs_allocbt_calc_size(mp, ra.nr_records);
+	if (!xrep_ag_has_space(sc->sa.pag, nr_blocks, XFS_AG_RESV_NONE) ||
+	    ra.nr_blocks < nr_blocks) {
+		error = -ENOSPC;
+		goto err;
+	}
+
+	/*
+	 * Compute the old bnobt/cntbt blocks.  Success falls through to the
+	 * same exit path as the error cases, which releases nobtlist.
+	 */
+	error = xfs_bitmap_disunion(old_allocbt_blocks, &ra.nobtlist);
+err:
+	xfs_bitmap_destroy(&ra.nobtlist);
+	if (cur)
+		xfs_btree_del_cursor(cur, error);
+	return error;
+}
+
+/*
+ * Reset the global free block counter and the per-AG counters to make it look
+ * like this AG has no free space.  The AGF fields that were changed are ORed
+ * into *log_flags so that the caller can log them.  Returns 0, or the error
+ * from adjusting the global free block counter, in which case nothing else
+ * has been modified.
+ */
+STATIC int
+xrep_abt_reset_counters(
+	struct xfs_scrub	*sc,
+	int			*log_flags)
+{
+	struct xfs_perag	*pag = sc->sa.pag;
+	struct xfs_agf		*agf;
+	xfs_agblock_t		new_btblks;
+	xfs_agblock_t		to_free;
+	int			error;
+
+	/*
+	 * Since we're abandoning the old bnobt/cntbt, we have to decrease
+	 * fdblocks by the # of blocks in those trees.  btreeblks counts the
+	 * non-root blocks of the free space and rmap btrees.  Do this before
+	 * resetting the AGF counters.
+	 */
+	agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+
+	/*
+	 * agf_rmap_blocks counts the rmapbt root block but btreeblks does
+	 * not, so drop one block to get the rmapbt's share of btreeblks.
+	 */
+	new_btblks = be32_to_cpu(agf->agf_rmap_blocks) - 1;
+
+	/* btreeblks doesn't count the bnobt/cntbt root blocks; add them. */
+	to_free = pag->pagf_btreeblks + 2;
+
+	/* The rmapbt blocks are staying, so don't count them as freed. */
+	to_free -= new_btblks;
+
+	error = xfs_mod_fdblocks(sc->mp, -(int64_t)to_free, false);
+	if (error)
+		return error;
+
+	/*
+	 * Reset the per-AG info, both incore and ondisk.  Mark the incore
+	 * state stale in case we fail out of here.
+	 */
+	ASSERT(pag->pagf_init);
+	pag->pagf_init = 0;
+	pag->pagf_btreeblks = new_btblks;
+	pag->pagf_freeblks = 0;
+	pag->pagf_longest = 0;
+
+	agf->agf_btreeblks = cpu_to_be32(new_btblks);
+	agf->agf_freeblks = 0;
+	agf->agf_longest = 0;
+	*log_flags |= XFS_AGF_BTREEBLKS | XFS_AGF_LONGEST | XFS_AGF_FREEBLKS;
+
+	return 0;
+}
+
+/*
+ * Set up a new root block for one free space btree, point the AGF at it, and
+ * add an rmap record marking the block as owned by the AG.
+ */
+STATIC int
+xrep_abt_reset_btree(
+	struct xfs_scrub	*sc,
+	xfs_btnum_t		btnum,
+	struct list_head	*free_extents)
+{
+	struct xfs_owner_info	oinfo;
+	struct xfs_buf		*bp;
+	struct xfs_perag	*pag = sc->sa.pag;
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+	xfs_agblock_t		agbno;
+	xfs_fsblock_t		fsbno;
+	int			error;
+
+	/* Carve the new root block out of the free extent list. */
+	fsbno = xrep_abt_alloc_block(sc, free_extents);
+	if (fsbno == NULLFSBLOCK)
+		return -ENOSPC;
+
+	/* Initialize it as the new tree root. */
+	error = xrep_init_btblock(sc, fsbno, &bp, btnum, &xfs_allocbt_buf_ops);
+	if (error)
+		return error;
+
+	/* Point the AGF at the new single-level tree. */
+	agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
+	agf->agf_roots[btnum] = cpu_to_be32(agbno);
+	agf->agf_levels[btnum] = cpu_to_be32(1);
+
+	/* Add an rmap record for the new root block. */
+	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
+	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1,
+			&oinfo);
+	if (error)
+		return error;
+
+	/* Finally, bring the incore level count in line with the AGF. */
+	pag->pagf_levels[btnum] = 1;
+	return 0;
+}
+
+/*
+ * Give both the bnobt and the cntbt a new root, then flag the AGF root and
+ * level fields in *log_flags.
+ */
+STATIC int
+xrep_abt_reset_btrees(
+	struct xfs_scrub	*sc,
+	struct list_head	*free_extents,
+	int			*log_flags)
+{
+	int			error;
+
+	error = xrep_abt_reset_btree(sc, XFS_BTNUM_BNOi, free_extents);
+	if (!error)
+		error = xrep_abt_reset_btree(sc, XFS_BTNUM_CNTi, free_extents);
+	if (error)
+		return error;
+
+	*log_flags |= XFS_AGF_ROOTS | XFS_AGF_LEVELS;
+	return 0;
+}
+
+/*
+ * Log the AGF changes, invalidate the old btree blocks, and roll the
+ * transaction so that the new freespace btree roots are permanent before we
+ * start freeing unused space back into the AG.
+ */
+STATIC int
+xrep_abt_commit_new(
+	struct xfs_scrub	*sc,
+	struct xfs_bitmap	*old_allocbt_blocks,
+	int			log_flags)
+{
+	int			error;
+
+	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags);
+
+	/* Invalidate the old freespace btree blocks, then commit. */
+	error = xrep_invalidate_blocks(sc, old_allocbt_blocks);
+	if (!error)
+		error = xrep_roll_ag_trans(sc);
+	if (error)
+		return error;
+
+	/* Both steps succeeded, so mark the incore state valid again. */
+	sc->sa.pag->pagf_init = 1;
+	return 0;
+}
+
+/*
+ * Build new free space btrees and dispose of the old one.  Every extent on
+ * @free_extents is freed back into the AG, longest first, and then the old
+ * bnobt/cntbt blocks in @old_allocbt_blocks are reaped.  If freeing an extent
+ * fails, the error is returned and any records still on the list are left
+ * for the caller to release.
+ */
+STATIC int
+xrep_abt_rebuild_trees(
+	struct xfs_scrub	*sc,
+	struct list_head	*free_extents,
+	struct xfs_bitmap	*old_allocbt_blocks)
+{
+	struct xfs_owner_info	oinfo;
+	struct xrep_abt_extent	*rae;
+	struct xrep_abt_extent	*n;
+	struct xrep_abt_extent	*longest;
+	int			error;
+
+	xfs_rmap_skip_owner_update(&oinfo);
+
+	/*
+	 * Insert the longest free extent in case it's necessary to
+	 * refresh the AGFL with multiple blocks.  If there is no longest
+	 * extent, we had exactly the free space we needed; we're done.
+	 * The record is dropped from the list before the error check, so
+	 * it is released whether or not the free succeeded.
+	 */
+	longest = xrep_abt_get_longest(free_extents);
+	if (!longest)
+		goto done;
+	error = xrep_abt_free_extent(sc,
+			XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno),
+			longest->len, &oinfo);
+	list_del(&longest->list);
+	kmem_free(longest);
+	if (error)
+		return error;
+
+	/*
+	 * Insert records into the new btrees.  A record whose free fails
+	 * stays on the list.
+	 */
+	list_for_each_entry_safe(rae, n, free_extents, list) {
+		error = xrep_abt_free_extent(sc,
+				XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno),
+				rae->len, &oinfo);
+		if (error)
+			return error;
+		list_del(&rae->list);
+		kmem_free(rae);
+	}
+
+done:
+	/* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */
+	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
+	return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo,
+			XFS_AG_RESV_NONE);
+}
+
+/*
+ * Repair the freespace btrees for some AG.  Both the bnobt and the cntbt are
+ * rebuilt together from the gaps in the rmapbt.
+ *
+ * Returns 0 on success or a negative errno: -EOPNOTSUPP if the filesystem has
+ * no rmapbt, -EDEADLOCK if the AG still has busy extents, -ENOSPC if no free
+ * space was found, or the error from whichever rebuild step failed.
+ */
+int
+xrep_allocbt(
+	struct xfs_scrub	*sc)
+{
+	struct list_head	free_extents;
+	struct xfs_bitmap	old_allocbt_blocks;
+	struct xfs_mount	*mp = sc->mp;
+	int			log_flags = 0;	/* AGF fields to log */
+	int			error;
+
+	/* We require the rmapbt to rebuild anything. */
+	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+		return -EOPNOTSUPP;
+
+	xchk_perag_get(sc->mp, &sc->sa);
+
+	/*
+	 * Make sure the busy extent list is clear because we can't put
+	 * extents on there twice.
+	 */
+	if (!xfs_extent_busy_list_empty(sc->sa.pag))
+		return -EDEADLOCK;
+
+	/* Collect the free space data and find the old btree blocks. */
+	INIT_LIST_HEAD(&free_extents);
+	xfs_bitmap_init(&old_allocbt_blocks);
+	error = xrep_abt_find_freespace(sc, &free_extents, &old_allocbt_blocks);
+	if (error)
+		goto out;
+
+	/* Make sure we got some free space. */
+	if (list_empty(&free_extents)) {
+		error = -ENOSPC;
+		goto out;
+	}
+
+	/*
+	 * Sort the free extents by block number to avoid bnobt splits when we
+	 * rebuild the free space btrees.
+	 */
+	list_sort(NULL, &free_extents, xrep_abt_extent_cmp);
+
+	/*
+	 * Blow out the old free space btrees.  This is the point at which
+	 * we are no longer able to bail out gracefully.
+	 */
+	error = xrep_abt_reset_counters(sc, &log_flags);
+	if (error)
+		goto out;
+	error = xrep_abt_reset_btrees(sc, &free_extents, &log_flags);
+	if (error)
+		goto out;
+	error = xrep_abt_commit_new(sc, &old_allocbt_blocks, log_flags);
+	if (error)
+		goto out;
+
+	/*
+	 * Now rebuild the freespace information.  Success and failure both
+	 * fall through to the cleanup below, which releases any extent
+	 * records still on the list and the old btree block bitmap.
+	 */
+	error = xrep_abt_rebuild_trees(sc, &free_extents, &old_allocbt_blocks);
+out:
+	xrep_abt_cancel_freelist(&free_extents);
+	xfs_bitmap_destroy(&old_allocbt_blocks);
+	return error;
+}
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 346b02abccf7..0fb949afaca9 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -623,8 +623,14 @@  xchk_setup_ag_btree(
 	 * expensive operation should be performed infrequently and only
 	 * as a last resort.  Any caller that sets force_log should
 	 * document why they need to do so.
+	 *
+	 * Force everything in memory out to disk if we're repairing.
+	 * This ensures we won't get tripped up by btree blocks sitting
+	 * in memory waiting to have LSNs stamped in.  The AGF/AGI repair
+	 * routines use any available rmap data to try to find a btree
+	 * root that also passes the read verifiers.
 	 */
-	if (force_log) {
+	if (force_log || (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) {
 		error = xchk_checkpoint_log(mp);
 		if (error)
 			return error;
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 9de321eee4ab..bc1a5f1cbcdc 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -61,6 +61,7 @@  int xrep_superblock(struct xfs_scrub *sc);
 int xrep_agf(struct xfs_scrub *sc);
 int xrep_agfl(struct xfs_scrub *sc);
 int xrep_agi(struct xfs_scrub *sc);
+int xrep_allocbt(struct xfs_scrub *sc);
 
 #else
 
@@ -87,6 +88,7 @@  xrep_calc_ag_resblks(
 #define xrep_agf			xrep_notsupported
 #define xrep_agfl			xrep_notsupported
 #define xrep_agi			xrep_notsupported
+#define xrep_allocbt			xrep_notsupported
 
 #endif /* CONFIG_XFS_ONLINE_REPAIR */
 
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 4bfae1e61d30..2133a3199372 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -232,13 +232,13 @@  static const struct xchk_meta_ops meta_scrub_ops[] = {
 		.type	= ST_PERAG,
 		.setup	= xchk_setup_ag_allocbt,
 		.scrub	= xchk_bnobt,
-		.repair	= xrep_notsupported,
+		.repair	= xrep_allocbt,
 	},
 	[XFS_SCRUB_TYPE_CNTBT] = {	/* cntbt */
 		.type	= ST_PERAG,
 		.setup	= xchk_setup_ag_allocbt,
 		.scrub	= xchk_cntbt,
-		.repair	= xrep_notsupported,
+		.repair	= xrep_allocbt,
 	},
 	[XFS_SCRUB_TYPE_INOBT] = {	/* inobt */
 		.type	= ST_PERAG,
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 4e20f0e48232..26bd5dc68efe 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -551,7 +551,7 @@  DEFINE_EVENT(xrep_rmap_class, name, \
 		 xfs_agblock_t agbno, xfs_extlen_t len, \
 		 uint64_t owner, uint64_t offset, unsigned int flags), \
 	TP_ARGS(mp, agno, agbno, len, owner, offset, flags))
-DEFINE_REPAIR_RMAP_EVENT(xrep_alloc_extent_fn);
+DEFINE_REPAIR_RMAP_EVENT(xrep_abt_walk_rmap);
 DEFINE_REPAIR_RMAP_EVENT(xrep_ialloc_extent_fn);
 DEFINE_REPAIR_RMAP_EVENT(xrep_rmap_extent_fn);
 DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_extent_fn);
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 0ed68379e551..82f99633a597 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -657,3 +657,17 @@  xfs_extent_busy_ag_cmp(
 		diff = b1->bno - b2->bno;
 	return diff;
 }
+
+/* Return true if this AG's busy extent tree is empty. */
+bool
+xfs_extent_busy_list_empty(
+	struct xfs_perag	*pag)
+{
+	bool			is_empty;
+
+	/* Sample the tree root while holding pagb_lock. */
+	spin_lock(&pag->pagb_lock);
+	is_empty = !pag->pagb_tree.rb_node;
+	spin_unlock(&pag->pagb_lock);
+	return is_empty;
+}
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
index 990ab3891971..2f8c73c712c6 100644
--- a/fs/xfs/xfs_extent_busy.h
+++ b/fs/xfs/xfs_extent_busy.h
@@ -65,4 +65,6 @@  static inline void xfs_extent_busy_sort(struct list_head *list)
 	list_sort(NULL, list, xfs_extent_busy_ag_cmp);
 }
 
+bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
+
 #endif /* __XFS_EXTENT_BUSY_H__ */