
[10/21] xfs: add helper routines for the repair code

Message ID 152269903810.16346.11410639151264264247.stgit@magnolia (mailing list archive)
State New, archived

Commit Message

Darrick J. Wong April 2, 2018, 7:57 p.m. UTC
From: Darrick J. Wong <darrick.wong@oracle.com>

Add some helper functions for repair functions that will help us to
allocate and initialize new metadata blocks for btrees that we're
rebuilding.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/scrub/bmap.c   |    3 
 fs/xfs/scrub/common.c |    8 
 fs/xfs/scrub/common.h |    9 +
 fs/xfs/scrub/inode.c  |    4 
 fs/xfs/scrub/repair.c |  816 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/repair.h |   78 +++++
 fs/xfs/scrub/scrub.c  |    2 
 fs/xfs/scrub/scrub.h  |    1 
 8 files changed, 915 insertions(+), 6 deletions(-)



Comments

Dave Chinner April 6, 2018, 12:52 a.m. UTC | #1
On Mon, Apr 02, 2018 at 12:57:18PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Add some helper functions for repair functions that will help us to
> allocate and initialize new metadata blocks for btrees that we're
> rebuilding.

This is more than "helper functions" - my reply is almost 700 lines
long!

i.e. This is a stack of extent invalidation, freeing and rmap/free
space rebuild code. Needs a lot more description and context than
"helper functions".

.....
> @@ -574,7 +575,10 @@ xfs_scrub_setup_fs(
>  	struct xfs_scrub_context	*sc,
>  	struct xfs_inode		*ip)
>  {
> -	return xfs_scrub_trans_alloc(sc->sm, sc->mp, &sc->tp);
> +	uint				resblks;
> +
> +	resblks = xfs_repair_calc_ag_resblks(sc);
> +	return xfs_scrub_trans_alloc(sc->sm, sc->mp, resblks, &sc->tp);
>  }

What's the block reservation for?

> +/*
> + * Roll a transaction, keeping the AG headers locked and reinitializing
> + * the btree cursors.
> + */
> +int
> +xfs_repair_roll_ag_trans(
> +	struct xfs_scrub_context	*sc)
> +{
> +	struct xfs_trans		*tp;
> +	int				error;
> +
> +	/* Keep the AG header buffers locked so we can keep going. */
> +	xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
> +	xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
> +	xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
> +
> +	/* Roll the transaction. */
> +	tp = sc->tp;
> +	error = xfs_trans_roll(&sc->tp);
> +	if (error)
> +		return error;

Who releases those buffers if we get an error here?

> +
> +	/* Join the buffer to the new transaction or release the hold. */
> +	if (sc->tp != tp) {

When does xfs_trans_roll() return successfully without allocating a
new transaction?

> +		xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
> +		xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
> +		xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
> +	} else {
> +		xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
> +		xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
> +		xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
> +	}
> +
> +	return error;
> +}
.....

> +/* Allocate a block in an AG. */
> +int
> +xfs_repair_alloc_ag_block(
> +	struct xfs_scrub_context	*sc,
> +	struct xfs_owner_info		*oinfo,
> +	xfs_fsblock_t			*fsbno,
> +	enum xfs_ag_resv_type		resv)
> +{
> +	struct xfs_alloc_arg		args = {0};
> +	xfs_agblock_t			bno;
> +	int				error;
> +
> +	if (resv == XFS_AG_RESV_AGFL) {
> +		error = xfs_alloc_get_freelist(sc->tp, sc->sa.agf_bp, &bno, 1);
> +		if (error)
> +			return error;
> +		if (bno == NULLAGBLOCK)
> +			return -ENOSPC;
> +		xfs_extent_busy_reuse(sc->mp, sc->sa.agno, bno,
> +				1, false);
> +		*fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, bno);
> +		return 0;
> +	}
> +
> +	args.tp = sc->tp;
> +	args.mp = sc->mp;
> +	args.oinfo = *oinfo;
> +	args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.agno, 0);

So our target BNO is the start of the AG.

> +	args.minlen = 1;
> +	args.maxlen = 1;
> +	args.prod = 1;
> +	args.type = XFS_ALLOCTYPE_NEAR_BNO;

And we do a "near" search for a single block. i.e. we are looking
for a single block near to the start of the AG. Basically, we are
looking for the extent at the left edge of the by-bno tree, which
may not be a single block.

Would it be better to do a XFS_ALLOCTYPE_THIS_AG allocation here so
we look up the by-size btree for a single block extent? The left
edge of that tree will always be the smallest free extent closest to
the start of the AG, and so using THIS_AG will tend to fill
small freespace holes rather than tear up larger free spaces for
single block allocations.....

> +/* Put a block back on the AGFL. */
> +int
> +xfs_repair_put_freelist(
> +	struct xfs_scrub_context	*sc,
> +	xfs_agblock_t			agbno)
> +{
> +	struct xfs_owner_info		oinfo;
> +	int				error;
> +
> +	/*
> +	 * Since we're "freeing" a lost block onto the AGFL, we have to
> +	 * create an rmap for the block prior to merging it or else other
> +	 * parts will break.
> +	 */
> +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> +	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1,
> +			&oinfo);
> +	if (error)
> +		return error;
> +
> +	/* Put the block on the AGFL. */
> +	error = xfs_alloc_put_freelist(sc->tp, sc->sa.agf_bp, sc->sa.agfl_bp,
> +			agbno, 0);

At what point do we check there's actually room in the AGFL for this
block?

> +	if (error)
> +		return error;
> +	xfs_extent_busy_insert(sc->tp, sc->sa.agno, agbno, 1,
> +			XFS_EXTENT_BUSY_SKIP_DISCARD);
> +
> +	/* Make sure the AGFL doesn't overfill. */
> +	return xfs_repair_fix_freelist(sc, true);

i.e. shouldn't this be done first?

> +}
> +
> +/*
> + * For a given metadata extent and owner, delete the associated rmap.
> + * If the block has no other owners, free it.
> + */
> +STATIC int
> +xfs_repair_free_or_unmap_extent(
> +	struct xfs_scrub_context	*sc,
> +	xfs_fsblock_t			fsbno,
> +	xfs_extlen_t			len,
> +	struct xfs_owner_info		*oinfo,
> +	enum xfs_ag_resv_type		resv)
> +{
> +	struct xfs_mount		*mp = sc->mp;
> +	struct xfs_btree_cur		*rmap_cur;
> +	struct xfs_buf			*agf_bp = NULL;
> +	xfs_agnumber_t			agno;
> +	xfs_agblock_t			agbno;
> +	bool				has_other_rmap;
> +	int				error = 0;
> +
> +	ASSERT(xfs_sb_version_hasrmapbt(&mp->m_sb));
> +	agno = XFS_FSB_TO_AGNO(mp, fsbno);
> +	agbno = XFS_FSB_TO_AGBNO(mp, fsbno);
> +
> +	trace_xfs_repair_free_or_unmap_extent(mp, agno, agbno, len);
> +
> +	for (; len > 0 && !error; len--, agbno++, fsbno++) {
> +		ASSERT(sc->ip != NULL || agno == sc->sa.agno);
> +
> +		/* Can we find any other rmappings? */
> +		if (sc->ip) {
> +			error = xfs_alloc_read_agf(mp, sc->tp, agno, 0,
> +					&agf_bp);
> +			if (error)
> +				break;
> +			if (!agf_bp) {
> +				error = -ENOMEM;
> +				break;
> +			}
> +		}
> +		rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp,
> +				agf_bp ? agf_bp : sc->sa.agf_bp, agno);

Why is the inode case treated differently here - why do we have a
special reference to agf_bp here rather than using sc->sa.agf_bp?


> +		error = xfs_rmap_has_other_keys(rmap_cur, agbno, 1, oinfo,
> +				&has_other_rmap);
> +		if (error)
> +			goto out_cur;
> +		xfs_btree_del_cursor(rmap_cur, XFS_BTREE_NOERROR);
> +		if (agf_bp)
> +			xfs_trans_brelse(sc->tp, agf_bp);

agf_bp released here.

> +
> +		/*
> +		 * If there are other rmappings, this block is cross
> +		 * linked and must not be freed.  Remove the reverse
> +		 * mapping and move on.  Otherwise, we were the only
> +		 * owner of the block, so free the extent, which will
> +		 * also remove the rmap.
> +		 */
> +		if (has_other_rmap)
> +			error = xfs_rmap_free(sc->tp, agf_bp, agno, agbno, 1,
> +					oinfo);

And then used here. Use-after-free?

> +		else if (resv == XFS_AG_RESV_AGFL)
> +			error = xfs_repair_put_freelist(sc, agbno);
> +		else
> +			error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv);
> +		if (error)
> +			break;
> +
> +		if (sc->ip)
> +			error = xfs_trans_roll_inode(&sc->tp, sc->ip);
> +		else
> +			error = xfs_repair_roll_ag_trans(sc);

why don't we break on error here?

> +	}
> +
> +	return error;
> +out_cur:
> +	xfs_btree_del_cursor(rmap_cur, XFS_BTREE_ERROR);
> +	if (agf_bp)
> +		xfs_trans_brelse(sc->tp, agf_bp);
> +	return error;

Can you put a blank line preceding the out label so there is clear
separation between the normal return code and error handling stack?

> +}
> +
> +/* Collect a dead btree extent for later disposal. */
> +int
> +xfs_repair_collect_btree_extent(
> +	struct xfs_scrub_context	*sc,
> +	struct xfs_repair_extent_list	*exlist,
> +	xfs_fsblock_t			fsbno,
> +	xfs_extlen_t			len)
> +{
> +	struct xfs_repair_extent	*rae;

What's "rae" short for? "rex" I'd understand, but I can't work out
what the "a" in rae means :/

> +
> +	trace_xfs_repair_collect_btree_extent(sc->mp,
> +			XFS_FSB_TO_AGNO(sc->mp, fsbno),
> +			XFS_FSB_TO_AGBNO(sc->mp, fsbno), len);
> +
> +	rae = kmem_alloc(sizeof(struct xfs_repair_extent),
> +			KM_MAYFAIL | KM_NOFS);

Why KM_NOFS?

> +	if (!rae)
> +		return -ENOMEM;
> +
> +	INIT_LIST_HEAD(&rae->list);
> +	rae->fsbno = fsbno;
> +	rae->len = len;
> +	list_add_tail(&rae->list, &exlist->list);
> +
> +	return 0;
> +}
> +
> +/* Invalidate buffers for blocks we're dumping. */
> +int
> +xfs_repair_invalidate_blocks(
> +	struct xfs_scrub_context	*sc,
> +	struct xfs_repair_extent_list	*exlist)
> +{
> +	struct xfs_repair_extent	*rae;
> +	struct xfs_repair_extent	*n;
> +	struct xfs_buf			*bp;
> +	xfs_agnumber_t			agno;
> +	xfs_agblock_t			agbno;
> +	xfs_agblock_t			i;
> +
> +	for_each_xfs_repair_extent_safe(rae, n, exlist) {
> +		agno = XFS_FSB_TO_AGNO(sc->mp, rae->fsbno);
> +		agbno = XFS_FSB_TO_AGBNO(sc->mp, rae->fsbno);
> +		for (i = 0; i < rae->len; i++) {
> +			bp = xfs_btree_get_bufs(sc->mp, sc->tp, agno,
> +					agbno + i, 0);
> +			xfs_trans_binval(sc->tp, bp);
> +		}
> +	}

What if they are data blocks we are dumping? xfs_trans_binval is
fine for metadata blocks, but creating a stale metadata buffer and
logging a buffer cancel for user data blocks doesn't make any sense
to me....

> +/* Dispose of dead btree extents.  If oinfo is NULL, just delete the list. */
> +int
> +xfs_repair_reap_btree_extents(
> +	struct xfs_scrub_context	*sc,
> +	struct xfs_repair_extent_list	*exlist,
> +	struct xfs_owner_info		*oinfo,
> +	enum xfs_ag_resv_type		type)
> +{
> +	struct xfs_repair_extent	*rae;
> +	struct xfs_repair_extent	*n;
> +	int				error = 0;
> +
> +	for_each_xfs_repair_extent_safe(rae, n, exlist) {
> +		if (oinfo) {
> +			error = xfs_repair_free_or_unmap_extent(sc, rae->fsbno,
> +					rae->len, oinfo, type);
> +			if (error)
> +				oinfo = NULL;
> +		}
> +		list_del(&rae->list);
> +		kmem_free(rae);
> +	}
> +
> +	return error;
> +}

So does this happen before or after the extent list invalidation?

> +/* Errors happened, just delete the dead btree extent list. */
> +void
> +xfs_repair_cancel_btree_extents(
> +	struct xfs_scrub_context	*sc,
> +	struct xfs_repair_extent_list	*exlist)
> +{
> +	xfs_repair_reap_btree_extents(sc, exlist, NULL, XFS_AG_RESV_NONE);
> +}

So if an error happened during rebuild, we just trash whatever we
were trying to rebuild?

> +/* Compare two btree extents. */
> +static int
> +xfs_repair_btree_extent_cmp(
> +	void				*priv,
> +	struct list_head		*a,
> +	struct list_head		*b)
> +{
> +	struct xfs_repair_extent	*ap;
> +	struct xfs_repair_extent	*bp;
> +
> +	ap = container_of(a, struct xfs_repair_extent, list);
> +	bp = container_of(b, struct xfs_repair_extent, list);
> +
> +	if (ap->fsbno > bp->fsbno)
> +		return 1;
> +	else if (ap->fsbno < bp->fsbno)
> +		return -1;
> +	return 0;
> +}
> +
> +/* Remove all the blocks in sublist from exlist. */
> +#define LEFT_CONTIG	(1 << 0)
> +#define RIGHT_CONTIG	(1 << 1)

Namespace, please.

> +int
> +xfs_repair_subtract_extents(
> +	struct xfs_scrub_context	*sc,
> +	struct xfs_repair_extent_list	*exlist,
> +	struct xfs_repair_extent_list	*sublist)

What is this function supposed to do? I can't really review the code
(looks complex) because I don't know what its function is supposed
to be.

> +struct xfs_repair_find_ag_btree_roots_info {
> +	struct xfs_buf			*agfl_bp;
> +	struct xfs_repair_find_ag_btree	*btree_info;
> +};
> +
> +/* Is this an OWN_AG block in the AGFL? */
> +STATIC bool
> +xfs_repair_is_block_in_agfl(
> +	struct xfs_mount		*mp,
> +	uint64_t			rmap_owner,
> +	xfs_agblock_t			agbno,
> +	struct xfs_buf			*agf_bp,
> +	struct xfs_buf			*agfl_bp)
> +{
> +	struct xfs_agf			*agf;
> +	__be32				*agfl_bno;
> +	unsigned int			flfirst;
> +	unsigned int			fllast;
> +	int				i;
> +
> +	if (rmap_owner != XFS_RMAP_OWN_AG)
> +		return false;
> +
> +	agf = XFS_BUF_TO_AGF(agf_bp);
> +	agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agfl_bp);
> +	flfirst = be32_to_cpu(agf->agf_flfirst);
> +	fllast = be32_to_cpu(agf->agf_fllast);
> +
> +	/* Skip an empty AGFL. */
> +	if (agf->agf_flcount == cpu_to_be32(0))
> +		return false;
> +
> +	/* first to last is a consecutive list. */
> +	if (fllast >= flfirst) {
> +		for (i = flfirst; i <= fllast; i++) {
> +			if (be32_to_cpu(agfl_bno[i]) == agbno)
> +				return true;
> +		}
> +
> +		return false;
> +	}
> +
> +	/* first to the end */
> +	for (i = flfirst; i < xfs_agfl_size(mp); i++) {
> +		if (be32_to_cpu(agfl_bno[i]) == agbno)
> +			return true;
> +	}
> +
> +	/* the start to last. */
> +	for (i = 0; i <= fllast; i++) {
> +		if (be32_to_cpu(agfl_bno[i]) == agbno)
> +			return true;
> +	}
> +
> +	return false;

Didn't you just create a generic agfl walk function for this?

> +}
> +
> +/* Find btree roots from the AGF. */
> +STATIC int
> +xfs_repair_find_ag_btree_roots_helper(
> +	struct xfs_btree_cur		*cur,
> +	struct xfs_rmap_irec		*rec,
> +	void				*priv)

Again, I'm not sure exactly what this function is doing. It looks
like it's trying to tell if a freed block is a btree block,
but I'm not sure of that, nor the context in which we'd be searching
free blocks for a specific metadata contents.

> +/* Find the roots of the given btrees from the rmap info. */
> +int
> +xfs_repair_find_ag_btree_roots(
> +	struct xfs_scrub_context	*sc,
> +	struct xfs_buf			*agf_bp,
> +	struct xfs_repair_find_ag_btree	*btree_info,
> +	struct xfs_buf			*agfl_bp)
> +{
> +	struct xfs_mount		*mp = sc->mp;
> +	struct xfs_repair_find_ag_btree_roots_info	ri;
> +	struct xfs_repair_find_ag_btree	*fab;
> +	struct xfs_btree_cur		*cur;
> +	int				error;
> +
> +	ri.btree_info = btree_info;
> +	ri.agfl_bp = agfl_bp;
> +	for (fab = btree_info; fab->buf_ops; fab++) {
> +		ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG);
> +		fab->root = NULLAGBLOCK;
> +		fab->level = 0;
> +	}
> +
> +	cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
> +	error = xfs_rmap_query_all(cur, xfs_repair_find_ag_btree_roots_helper,
> +			&ri);
> +	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
> +
> +	for (fab = btree_info; !error && fab->buf_ops; fab++)
> +		if (fab->root != NULLAGBLOCK)
> +			fab->level++;
> +
> +	return error;
> +}
> +
> +/* Reset the superblock counters from the AGF/AGI. */
> +int
> +xfs_repair_reset_counters(
> +	struct xfs_mount	*mp)
> +{
> +	struct xfs_trans	*tp;
> +	struct xfs_buf		*agi_bp;
> +	struct xfs_buf		*agf_bp;
> +	struct xfs_agi		*agi;
> +	struct xfs_agf		*agf;
> +	xfs_agnumber_t		agno;
> +	xfs_ino_t		icount = 0;
> +	xfs_ino_t		ifree = 0;
> +	xfs_filblks_t		fdblocks = 0;
> +	int64_t			delta_icount;
> +	int64_t			delta_ifree;
> +	int64_t			delta_fdblocks;
> +	int			error;
> +
> +	trace_xfs_repair_reset_counters(mp);
> +
> +	error = xfs_trans_alloc_empty(mp, &tp);
> +	if (error)
> +		return error;
> +
> +	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
> +		/* Count all the inodes... */
> +		error = xfs_ialloc_read_agi(mp, tp, agno, &agi_bp);
> +		if (error)
> +			goto out;
> +		agi = XFS_BUF_TO_AGI(agi_bp);
> +		icount += be32_to_cpu(agi->agi_count);
> +		ifree += be32_to_cpu(agi->agi_freecount);
> +
> +		/* Add up the free/freelist/bnobt/cntbt blocks... */
> +		error = xfs_alloc_read_agf(mp, tp, agno, 0, &agf_bp);
> +		if (error)
> +			goto out;
> +		if (!agf_bp) {
> +			error = -ENOMEM;
> +			goto out;
> +		}
> +		agf = XFS_BUF_TO_AGF(agf_bp);
> +		fdblocks += be32_to_cpu(agf->agf_freeblks);
> +		fdblocks += be32_to_cpu(agf->agf_flcount);
> +		fdblocks += be32_to_cpu(agf->agf_btreeblks);
> +	}
> +
> +	/*
> +	 * Reinitialize the counters.  The on-disk and in-core counters
> +	 * differ by the number of inodes/blocks reserved by the admin,
> +	 * the per-AG reservation, and any transactions in progress, so
> +	 * we have to account for that.
> +	 */
> +	spin_lock(&mp->m_sb_lock);
> +	delta_icount = (int64_t)mp->m_sb.sb_icount - icount;
> +	delta_ifree = (int64_t)mp->m_sb.sb_ifree - ifree;
> +	delta_fdblocks = (int64_t)mp->m_sb.sb_fdblocks - fdblocks;
> +	mp->m_sb.sb_icount = icount;
> +	mp->m_sb.sb_ifree = ifree;
> +	mp->m_sb.sb_fdblocks = fdblocks;
> +	spin_unlock(&mp->m_sb_lock);

This is largely a copy and paste of the code run by mount after log
recovery on an unclean mount. Can you factor this into a single
function?

> +
> +	if (delta_icount) {
> +		error = xfs_mod_icount(mp, delta_icount);
> +		if (error)
> +			goto out;
> +	}
> +	if (delta_ifree) {
> +		error = xfs_mod_ifree(mp, delta_ifree);
> +		if (error)
> +			goto out;
> +	}
> +	if (delta_fdblocks) {
> +		error = xfs_mod_fdblocks(mp, delta_fdblocks, false);
> +		if (error)
> +			goto out;
> +	}
> +
> +out:
> +	xfs_trans_cancel(tp);
> +	return error;

Ummmm - why do we do all this superblock modification work in a
transaction context, and then just cancel it?

> +}
> +
> +/* Figure out how many blocks to reserve for an AG repair. */
> +xfs_extlen_t
> +xfs_repair_calc_ag_resblks(
> +	struct xfs_scrub_context	*sc)
> +{
> +	struct xfs_mount		*mp = sc->mp;
> +	struct xfs_scrub_metadata	*sm = sc->sm;
> +	struct xfs_agi			*agi;
> +	struct xfs_agf			*agf;
> +	struct xfs_buf			*bp;
> +	xfs_agino_t			icount;
> +	xfs_extlen_t			aglen;
> +	xfs_extlen_t			usedlen;
> +	xfs_extlen_t			freelen;
> +	xfs_extlen_t			bnobt_sz;
> +	xfs_extlen_t			inobt_sz;
> +	xfs_extlen_t			rmapbt_sz;
> +	xfs_extlen_t			refcbt_sz;
> +	int				error;
> +
> +	if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
> +		return 0;
> +
> +	/*
> +	 * Try to get the actual counters from disk; if not, make
> +	 * some worst case assumptions.
> +	 */
> +	error = xfs_read_agi(mp, NULL, sm->sm_agno, &bp);
> +	if (!error) {
> +		agi = XFS_BUF_TO_AGI(bp);
> +		icount = be32_to_cpu(agi->agi_count);
> +		xfs_trans_brelse(NULL, bp);

xfs_buf_relse(bp)?

And, well, why not just use pag->pagi_count rather than having to
read it from the on-disk buffer?

> +	} else {
> +		icount = mp->m_sb.sb_agblocks / mp->m_sb.sb_inopblock;
> +	}
> +
> +	error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp);
> +	if (!error && bp) {
> +		agf = XFS_BUF_TO_AGF(bp);
> +		aglen = be32_to_cpu(agf->agf_length);
> +		freelen = be32_to_cpu(agf->agf_freeblks);
> +		usedlen = aglen - freelen;
> +		xfs_trans_brelse(NULL, bp);
> +	} else {
> +		aglen = mp->m_sb.sb_agblocks;
> +		freelen = aglen;
> +		usedlen = aglen;
> +	}

ditto for there - it's all in the xfs_perag, right?

> +
> +	trace_xfs_repair_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
> +			freelen, usedlen);
> +
> +	/*
> +	 * Figure out how many blocks we'd need worst case to rebuild
> +	 * each type of btree.  Note that we can only rebuild the
> +	 * bnobt/cntbt or inobt/finobt as pairs.
> +	 */
> +	bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
> +	if (xfs_sb_version_hassparseinodes(&mp->m_sb))
> +		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
> +				XFS_INODES_PER_HOLEMASK_BIT);
> +	else
> +		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
> +				XFS_INODES_PER_CHUNK);
> +	if (xfs_sb_version_hasfinobt(&mp->m_sb))
> +		inobt_sz *= 2;
> +	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
> +		rmapbt_sz = xfs_rmapbt_calc_size(mp, aglen);
> +		refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
> +	} else {
> +		rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
> +		refcbt_sz = 0;
> +	}
> +	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
> +		rmapbt_sz = 0;
> +
> +	trace_xfs_repair_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
> +			inobt_sz, rmapbt_sz, refcbt_sz);
> +
> +	return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));

Why are we only returning the resblks for a single tree rebuild?
Shouldn't it return the total blocks required to rebuild all the
trees?

Cheers,

Dave.
Darrick J. Wong April 7, 2018, 5:55 p.m. UTC | #2
On Fri, Apr 06, 2018 at 10:52:51AM +1000, Dave Chinner wrote:
> On Mon, Apr 02, 2018 at 12:57:18PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Add some helper functions for repair functions that will help us to
> > allocate and initialize new metadata blocks for btrees that we're
> > rebuilding.
> 
> This is more than "helper functions" - my reply is almost 700 lines
> long!
> 
> i.e. This is a stack of extent invalidation, freeing and rmap/free
> space rebuild code. Needs a lot more description and context than
> "helper functions".

I agree now; this patch has become overly long and incohesive.  It could
be broken into the following pieces:

- modifying transaction allocations to take resblks
	or "ensuring sufficient free space to rebuild"

- rolling transactions

- allocation and reinit functions

- fixing/putting on the freelist

- dealing with leftover blocks after we rebuild a tree

- finding btree roots

- resetting superblock counters

So I'll break this up into seven smaller pieces.

> .....
> > @@ -574,7 +575,10 @@ xfs_scrub_setup_fs(
> >  	struct xfs_scrub_context	*sc,
> >  	struct xfs_inode		*ip)
> >  {
> > -	return xfs_scrub_trans_alloc(sc->sm, sc->mp, &sc->tp);
> > +	uint				resblks;
> > +
> > +	resblks = xfs_repair_calc_ag_resblks(sc);
> > +	return xfs_scrub_trans_alloc(sc->sm, sc->mp, resblks, &sc->tp);
> >  }
> 
> What's the block reservation for?

We rebuild a metadata structure by constructing a completely new btree
with blocks we got from the free space and switching the root when we're
done.  This isn't treated any differently from any other btree shape
change, so we need to reserve blocks in the transaction.
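
For reference, here's a minimal sketch of what the updated helper could
look like (using tr_itruncate as the reservation type is my assumption
here, not necessarily what the final patch will settle on):

/* Allocate a transaction, with a block reservation if we're repairing. */
int
xfs_scrub_trans_alloc(
	struct xfs_scrub_metadata	*sm,
	struct xfs_mount		*mp,
	uint				resblks,
	struct xfs_trans		**tpp)
{
	if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
		return xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
				resblks, 0, 0, tpp);

	/* Plain scrub doesn't modify anything, so no reservation needed. */
	return xfs_trans_alloc_empty(mp, tpp);
}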

> 
> > +/*
> > + * Roll a transaction, keeping the AG headers locked and reinitializing
> > + * the btree cursors.
> > + */
> > +int
> > +xfs_repair_roll_ag_trans(
> > +	struct xfs_scrub_context	*sc)
> > +{
> > +	struct xfs_trans		*tp;
> > +	int				error;
> > +
> > +	/* Keep the AG header buffers locked so we can keep going. */
> > +	xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
> > +	xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
> > +	xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
> > +
> > +	/* Roll the transaction. */
> > +	tp = sc->tp;
> > +	error = xfs_trans_roll(&sc->tp);
> > +	if (error)
> > +		return error;
> 
> Who releases those buffers if we get an error here?

Oops. :(

> > +
> > +	/* Join the buffer to the new transaction or release the hold. */
> > +	if (sc->tp != tp) {
> 
> When does xfs_trans_roll() return successfully without allocating a
> new transaction?

I think this is a leftover from the way we used to do rolls?  Though I've
noticed that with trans_roll one can pingpong between the same two slots
in a slab, so this might not even be an accurate test anyway.

> > +		xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
> > +		xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
> > +		xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
> > +	} else {
> > +		xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
> > +		xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
> > +		xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);

Ok, so this whole thing should be restructured as:

error = xfs_trans_roll(&sc->tp);
if (error) {
	xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
	xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
	xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
} else {
	xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
	xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
	xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
}
return error;

> > +	}
> > +
> > +	return error;
> > +}
> .....
> 
> > +/* Allocate a block in an AG. */
> > +int
> > +xfs_repair_alloc_ag_block(
> > +	struct xfs_scrub_context	*sc,
> > +	struct xfs_owner_info		*oinfo,
> > +	xfs_fsblock_t			*fsbno,
> > +	enum xfs_ag_resv_type		resv)
> > +{
> > +	struct xfs_alloc_arg		args = {0};
> > +	xfs_agblock_t			bno;
> > +	int				error;
> > +
> > +	if (resv == XFS_AG_RESV_AGFL) {
> > +		error = xfs_alloc_get_freelist(sc->tp, sc->sa.agf_bp, &bno, 1);
> > +		if (error)
> > +			return error;
> > +		if (bno == NULLAGBLOCK)
> > +			return -ENOSPC;
> > +		xfs_extent_busy_reuse(sc->mp, sc->sa.agno, bno,
> > +				1, false);
> > +		*fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, bno);
> > +		return 0;
> > +	}
> > +
> > +	args.tp = sc->tp;
> > +	args.mp = sc->mp;
> > +	args.oinfo = *oinfo;
> > +	args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.agno, 0);
> 
> So our target BNO is the start of the AG.
> 
> > +	args.minlen = 1;
> > +	args.maxlen = 1;
> > +	args.prod = 1;
> > +	args.type = XFS_ALLOCTYPE_NEAR_BNO;
> 
> And we do a "near" search for a single block. i.e. we are looking
> for a single block near to the start of the AG. Basically, we are
> looking for the extent at the left edge of the by-bno tree, which
> may not be a single block.
> 
> Would it be better to do a XFS_ALLOCTYPE_THIS_AG allocation here so
> we look up the by-size btree for a single block extent? The left
> edge of that tree will always be the smallest free extent closest to
> the start of the AG, and so using THIS_AG will tend to fill
> small freespace holes rather than tear up larger free spaces for
> single block allocations.....

Makes sense.  Originally I was thinking that we try to put the rebuilt
btree as close to the start of the AG as possible, but there's little
point in doing that so long as regular btree block allocation doesn't
bother.
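
i.e. keep the same single-block args and just switch the allocation
type, something like this sketch (reusing the args/error declarations
from the function as posted):

	args.tp = sc->tp;
	args.mp = sc->mp;
	args.oinfo = *oinfo;
	args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.agno, 0);
	args.minlen = 1;
	args.maxlen = 1;
	args.prod = 1;
	/* by-size lookup: smallest free extent closest to the AG start */
	args.type = XFS_ALLOCTYPE_THIS_AG;
	error = xfs_alloc_vextent(&args);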

TBH I've been wondering on the side if it makes more sense for us to
detect things like dm-{thinp,cache} which have large(ish) underlying
blocks and try to consolidate metadata into a smaller number of those
underlying blocks at the start (or the end) of the AG?  It could be
useful to pack the metadata into a smaller number of underlying blocks
so that if one piece of metadata gets hot enough the rest will end up in
the fast cache as well.

But, that's purely speculative at this point. :)

> > +/* Put a block back on the AGFL. */
> > +int
> > +xfs_repair_put_freelist(
> > +	struct xfs_scrub_context	*sc,
> > +	xfs_agblock_t			agbno)
> > +{
> > +	struct xfs_owner_info		oinfo;
> > +	int				error;
> > +
> > +	/*
> > +	 * Since we're "freeing" a lost block onto the AGFL, we have to
> > +	 * create an rmap for the block prior to merging it or else other
> > +	 * parts will break.
> > +	 */
> > +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> > +	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1,
> > +			&oinfo);
> > +	if (error)
> > +		return error;
> > +
> > +	/* Put the block on the AGFL. */
> > +	error = xfs_alloc_put_freelist(sc->tp, sc->sa.agf_bp, sc->sa.agfl_bp,
> > +			agbno, 0);
> 
> At what point do we check there's actually room in the AGFL for this
> block?
> 
> > +	if (error)
> > +		return error;
> > +	xfs_extent_busy_insert(sc->tp, sc->sa.agno, agbno, 1,
> > +			XFS_EXTENT_BUSY_SKIP_DISCARD);
> > +
> > +	/* Make sure the AGFL doesn't overfill. */
> > +	return xfs_repair_fix_freelist(sc, true);
> 
> i.e. shouldn't this be done first?



> 
> > +}
> > +
> > +/*
> > + * For a given metadata extent and owner, delete the associated rmap.
> > + * If the block has no other owners, free it.
> > + */
> > +STATIC int
> > +xfs_repair_free_or_unmap_extent(
> > +	struct xfs_scrub_context	*sc,
> > +	xfs_fsblock_t			fsbno,
> > +	xfs_extlen_t			len,
> > +	struct xfs_owner_info		*oinfo,
> > +	enum xfs_ag_resv_type		resv)
> > +{
> > +	struct xfs_mount		*mp = sc->mp;
> > +	struct xfs_btree_cur		*rmap_cur;
> > +	struct xfs_buf			*agf_bp = NULL;
> > +	xfs_agnumber_t			agno;
> > +	xfs_agblock_t			agbno;
> > +	bool				has_other_rmap;
> > +	int				error = 0;
> > +
> > +	ASSERT(xfs_sb_version_hasrmapbt(&mp->m_sb));
> > +	agno = XFS_FSB_TO_AGNO(mp, fsbno);
> > +	agbno = XFS_FSB_TO_AGBNO(mp, fsbno);
> > +
> > +	trace_xfs_repair_free_or_unmap_extent(mp, agno, agbno, len);
> > +
> > +	for (; len > 0 && !error; len--, agbno++, fsbno++) {
> > +		ASSERT(sc->ip != NULL || agno == sc->sa.agno);
> > +
> > +		/* Can we find any other rmappings? */
> > +		if (sc->ip) {
> > +			error = xfs_alloc_read_agf(mp, sc->tp, agno, 0,
> > +					&agf_bp);
> > +			if (error)
> > +				break;
> > +			if (!agf_bp) {
> > +				error = -ENOMEM;
> > +				break;
> > +			}
> > +		}
> > +		rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp,
> > +				agf_bp ? agf_bp : sc->sa.agf_bp, agno);
> 
> Why is the inode case treated differently here - why do we have a
> special reference to agf_bp here rather than using sc->sa.agf_bp?

This function is called to free the old btree blocks from AG btree
repair and inode btree (bmbt) repair.  The inode btree scrub functions
don't necessarily set up sc->sa.agf_bp, so we treat it differently here.

> 
> 
> > +		error = xfs_rmap_has_other_keys(rmap_cur, agbno, 1, oinfo,
> > +				&has_other_rmap);
> > +		if (error)
> > +			goto out_cur;
> > +		xfs_btree_del_cursor(rmap_cur, XFS_BTREE_NOERROR);
> > +		if (agf_bp)
> > +			xfs_trans_brelse(sc->tp, agf_bp);
> 
> agf_bp released here.
> 
> > +
> > +		/*
> > +		 * If there are other rmappings, this block is cross
> > +		 * linked and must not be freed.  Remove the reverse
> > +		 * mapping and move on.  Otherwise, we were the only
> > +		 * owner of the block, so free the extent, which will
> > +		 * also remove the rmap.
> > +		 */
> > +		if (has_other_rmap)
> > +			error = xfs_rmap_free(sc->tp, agf_bp, agno, agbno, 1,
> > +					oinfo);
> 
> And then used here. Use-after-free?

Yep, fixed.

> > +		else if (resv == XFS_AG_RESV_AGFL)
> > +			error = xfs_repair_put_freelist(sc, agbno);
> > +		else
> > +			error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv);
> > +		if (error)
> > +			break;
> > +
> > +		if (sc->ip)
> > +			error = xfs_trans_roll_inode(&sc->tp, sc->ip);
> > +		else
> > +			error = xfs_repair_roll_ag_trans(sc);
> 
> why don't we break on error here?

We do, but it's up in the for loop.

> > +	}
> > +
> > +	return error;
> > +out_cur:
> > +	xfs_btree_del_cursor(rmap_cur, XFS_BTREE_ERROR);
> > +	if (agf_bp)
> > +		xfs_trans_brelse(sc->tp, agf_bp);
> > +	return error;
> 
> Can you put a blank line preceding the out label so there is clear
> separation between the normal return code and error handling stack?

Ok.

> > +}
> > +
> > +/* Collect a dead btree extent for later disposal. */
> > +int
> > +xfs_repair_collect_btree_extent(
> > +	struct xfs_scrub_context	*sc,
> > +	struct xfs_repair_extent_list	*exlist,
> > +	xfs_fsblock_t			fsbno,
> > +	xfs_extlen_t			len)
> > +{
> > +	struct xfs_repair_extent	*rae;
> 
> What's "rae" short for? "rex" I'd understand, but I can't work out
> what the "a" in rae means :/

Ah, right, these used to be struct xfs_repair_ag_extent before I decided
that the "ag" part was obvious and removed it.  I'll change it to rex.

> > +
> > +	trace_xfs_repair_collect_btree_extent(sc->mp,
> > +			XFS_FSB_TO_AGNO(sc->mp, fsbno),
> > +			XFS_FSB_TO_AGBNO(sc->mp, fsbno), len);
> > +
> > +	rae = kmem_alloc(sizeof(struct xfs_repair_extent),
> > +			KM_MAYFAIL | KM_NOFS);
> 
> Why KM_NOFS?

The caller holds a transaction and (potentially) has locked an entire
AG, so we want to avoid recursing into the filesystem to free up memory.

> 
> > +	if (!rae)
> > +		return -ENOMEM;
> > +
> > +	INIT_LIST_HEAD(&rae->list);
> > +	rae->fsbno = fsbno;
> > +	rae->len = len;
> > +	list_add_tail(&rae->list, &exlist->list);
> > +
> > +	return 0;
> > +}
> > +
> > +/* Invalidate buffers for blocks we're dumping. */
> > +int
> > +xfs_repair_invalidate_blocks(
> > +	struct xfs_scrub_context	*sc,
> > +	struct xfs_repair_extent_list	*exlist)
> > +{
> > +	struct xfs_repair_extent	*rae;
> > +	struct xfs_repair_extent	*n;
> > +	struct xfs_buf			*bp;
> > +	xfs_agnumber_t			agno;
> > +	xfs_agblock_t			agbno;
> > +	xfs_agblock_t			i;
> > +
> > +	for_each_xfs_repair_extent_safe(rae, n, exlist) {
> > +		agno = XFS_FSB_TO_AGNO(sc->mp, rae->fsbno);
> > +		agbno = XFS_FSB_TO_AGBNO(sc->mp, rae->fsbno);
> > +		for (i = 0; i < rae->len; i++) {
> > +			bp = xfs_btree_get_bufs(sc->mp, sc->tp, agno,
> > +					agbno + i, 0);
> > +			xfs_trans_binval(sc->tp, bp);
> > +		}
> > +	}
> 
> What if they are data blocks we are dumping? xfs_trans_binval is
> fine for metadata blocks, but creating a stale metadata buffer and
> logging a buffer cancel for user data blocks doesn't make any sense
> to me....

exlist is only supposed to contain references to metadata blocks that we
collected from the rmapbt.  The current iteration of the repair code
shouldn't ever dump file data blocks.

OTOH I now find myself at a loss for why this isn't done in the freeing
part of free_or_unmap_extent.  I think the reason was to undirty the old
btree blocks just prior to committing the new tree.

> 
> > +/* Dispose of dead btree extents.  If oinfo is NULL, just delete the list. */
> > +int
> > +xfs_repair_reap_btree_extents(
> > +	struct xfs_scrub_context	*sc,
> > +	struct xfs_repair_extent_list	*exlist,
> > +	struct xfs_owner_info		*oinfo,
> > +	enum xfs_ag_resv_type		type)
> > +{
> > +	struct xfs_repair_extent	*rae;
> > +	struct xfs_repair_extent	*n;
> > +	int				error = 0;
> > +
> > +	for_each_xfs_repair_extent_safe(rae, n, exlist) {
> > +		if (oinfo) {
> > +			error = xfs_repair_free_or_unmap_extent(sc, rae->fsbno,
> > +					rae->len, oinfo, type);
> > +			if (error)
> > +				oinfo = NULL;
> > +		}
> > +		list_del(&rae->list);
> > +		kmem_free(rae);
> > +	}
> > +
> > +	return error;
> > +}
> 
> So does this happen before or after the extent list invalidation?

After.

> > +/* Errors happened, just delete the dead btree extent list. */
> > +void
> > +xfs_repair_cancel_btree_extents(
> > +	struct xfs_scrub_context	*sc,
> > +	struct xfs_repair_extent_list	*exlist)
> > +{
> > +	xfs_repair_reap_btree_extents(sc, exlist, NULL, XFS_AG_RESV_NONE);
> > +}
> 
> So if an error happened during rebuild, we just trash whatever we
> were trying to rebuild?

The oinfo parameter is NULL, so this only deletes the items in exlist.
Could probably just open-code the for_each/list_del/kmem_free bit here.
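
i.e. something like this (sketch, with the rae -> rex rename applied):

/* Errors happened, just delete the dead btree extent list. */
void
xfs_repair_cancel_btree_extents(
	struct xfs_scrub_context	*sc,
	struct xfs_repair_extent_list	*exlist)
{
	struct xfs_repair_extent	*rex;
	struct xfs_repair_extent	*n;

	/* Delete the list items without disposing of the blocks. */
	for_each_xfs_repair_extent_safe(rex, n, exlist) {
		list_del(&rex->list);
		kmem_free(rex);
	}
}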

> 
> > +/* Compare two btree extents. */
> > +static int
> > +xfs_repair_btree_extent_cmp(
> > +	void				*priv,
> > +	struct list_head		*a,
> > +	struct list_head		*b)
> > +{
> > +	struct xfs_repair_extent	*ap;
> > +	struct xfs_repair_extent	*bp;
> > +
> > +	ap = container_of(a, struct xfs_repair_extent, list);
> > +	bp = container_of(b, struct xfs_repair_extent, list);
> > +
> > +	if (ap->fsbno > bp->fsbno)
> > +		return 1;
> > +	else if (ap->fsbno < bp->fsbno)
> > +		return -1;
> > +	return 0;
> > +}
> > +
> > +/* Remove all the blocks in sublist from exlist. */
> > +#define LEFT_CONTIG	(1 << 0)
> > +#define RIGHT_CONTIG	(1 << 1)
> 
> Namespace, please.
> 
> > +int
> > +xfs_repair_subtract_extents(
> > +	struct xfs_scrub_context	*sc,
> > +	struct xfs_repair_extent_list	*exlist,
> > +	struct xfs_repair_extent_list	*sublist)
> 
> What is this function supposed to do? I can't really review the code
> > (looks complex) because I don't know what its function is supposed
> to be.

The intent is that we iterate the rmapbt for all of its records for a
given owner to generate exlist.  Then we iterate all blocks of metadata
structures that we are not rebuilding but have the same rmapbt owner to
generate sublist.  This routine subtracts all the blocks mentioned in
sublist from all the extents linked in exlist, which leaves exlist as
the list of all blocks owned by the old version of btree that we're
rebuilding.  The list is fed into _reap_btree_extents to free the old
btree blocks (or merely remove the rmap if the block is crosslinked).
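
To make that concrete, here's a sketch of the core per-block operation
(the helper name is made up for illustration).  Both lists are kept
sorted by fsbno, and removing one block either trims or splits the
extent that covers it:

/* Remove one block from exlist, splitting an extent if necessary. */
STATIC int
xfs_repair_subtract_one_block(
	struct xfs_repair_extent_list	*exlist,
	xfs_fsblock_t			fsbno)
{
	struct xfs_repair_extent	*rex;
	struct xfs_repair_extent	*n;
	struct xfs_repair_extent	*new;

	for_each_xfs_repair_extent_safe(rex, n, exlist) {
		if (fsbno < rex->fsbno || fsbno >= rex->fsbno + rex->len)
			continue;
		if (rex->fsbno == fsbno) {
			/* Trim the front; free a now-empty extent. */
			rex->fsbno++;
			rex->len--;
			if (rex->len == 0) {
				list_del(&rex->list);
				kmem_free(rex);
			}
		} else if (rex->fsbno + rex->len - 1 == fsbno) {
			/* Trim the back. */
			rex->len--;
		} else {
			/* Split the extent around the removed block. */
			new = kmem_alloc(sizeof(*new), KM_MAYFAIL | KM_NOFS);
			if (!new)
				return -ENOMEM;
			INIT_LIST_HEAD(&new->list);
			new->fsbno = fsbno + 1;
			new->len = rex->fsbno + rex->len - new->fsbno;
			rex->len = fsbno - rex->fsbno;
			list_add(&new->list, &rex->list);
		}
		return 0;
	}

	return 0;
}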

> > +struct xfs_repair_find_ag_btree_roots_info {
> > +	struct xfs_buf			*agfl_bp;
> > +	struct xfs_repair_find_ag_btree	*btree_info;
> > +};
> > +
> > +/* Is this an OWN_AG block in the AGFL? */
> > +STATIC bool
> > +xfs_repair_is_block_in_agfl(
> > +	struct xfs_mount		*mp,
> > +	uint64_t			rmap_owner,
> > +	xfs_agblock_t			agbno,
> > +	struct xfs_buf			*agf_bp,
> > +	struct xfs_buf			*agfl_bp)
> > +{
> > +	struct xfs_agf			*agf;
> > +	__be32				*agfl_bno;
> > +	unsigned int			flfirst;
> > +	unsigned int			fllast;
> > +	int				i;
> > +
> > +	if (rmap_owner != XFS_RMAP_OWN_AG)
> > +		return false;
> > +
> > +	agf = XFS_BUF_TO_AGF(agf_bp);
> > +	agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agfl_bp);
> > +	flfirst = be32_to_cpu(agf->agf_flfirst);
> > +	fllast = be32_to_cpu(agf->agf_fllast);
> > +
> > +	/* Skip an empty AGFL. */
> > +	if (agf->agf_flcount == cpu_to_be32(0))
> > +		return false;
> > +
> > +	/* first to last is a consecutive list. */
> > +	if (fllast >= flfirst) {
> > +		for (i = flfirst; i <= fllast; i++) {
> > +			if (be32_to_cpu(agfl_bno[i]) == agbno)
> > +				return true;
> > +		}
> > +
> > +		return false;
> > +	}
> > +
> > +	/* first to the end */
> > +	for (i = flfirst; i < xfs_agfl_size(mp); i++) {
> > +		if (be32_to_cpu(agfl_bno[i]) == agbno)
> > +			return true;
> > +	}
> > +
> > +	/* the start to last. */
> > +	for (i = 0; i <= fllast; i++) {
> > +		if (be32_to_cpu(agfl_bno[i]) == agbno)
> > +			return true;
> > +	}
> > +
> > +	return false;
> 
> Didn't you just create a generic agfl walk function for this?

Er... yes.

> > +}
> > +
> > +/* Find btree roots from the AGF. */
> > +STATIC int
> > +xfs_repair_find_ag_btree_roots_helper(
> > +	struct xfs_btree_cur		*cur,
> > +	struct xfs_rmap_irec		*rec,
> > +	void				*priv)
> 
> Again, I'm not sure exactly what this function is doing. It looks
> like it's trying to tell if a freed block is a btree block,
> but I'm not sure of that, nor the context in which we'd be searching
> free blocks for a specific metadata contents.

Ick.  Bad comment for both of these functions. :)

/*
 * Find the roots of the given btrees.
 *
 * Given a list of {rmap owner, buf_ops, magic}, search every rmapbt
 * record for references to blocks with a matching rmap owner and magic
 * number.  For each btree type, retain a reference to the block with
 * the highest level; this is presumed to be the root of the btree.
 */

> > +/* Find the roots of the given btrees from the rmap info. */
> > +int
> > +xfs_repair_find_ag_btree_roots(
> > +	struct xfs_scrub_context	*sc,
> > +	struct xfs_buf			*agf_bp,
> > +	struct xfs_repair_find_ag_btree	*btree_info,
> > +	struct xfs_buf			*agfl_bp)
> > +{
> > +	struct xfs_mount		*mp = sc->mp;
> > +	struct xfs_repair_find_ag_btree_roots_info	ri;
> > +	struct xfs_repair_find_ag_btree	*fab;
> > +	struct xfs_btree_cur		*cur;
> > +	int				error;
> > +
> > +	ri.btree_info = btree_info;
> > +	ri.agfl_bp = agfl_bp;
> > +	for (fab = btree_info; fab->buf_ops; fab++) {
> > +		ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG);
> > +		fab->root = NULLAGBLOCK;
> > +		fab->level = 0;
> > +	}
> > +
> > +	cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
> > +	error = xfs_rmap_query_all(cur, xfs_repair_find_ag_btree_roots_helper,
> > +			&ri);
> > +	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
> > +
> > +	for (fab = btree_info; !error && fab->buf_ops; fab++)
> > +		if (fab->root != NULLAGBLOCK)
> > +			fab->level++;
> > +
> > +	return error;
> > +}
> > +
> > +/* Reset the superblock counters from the AGF/AGI. */
> > +int
> > +xfs_repair_reset_counters(
> > +	struct xfs_mount	*mp)
> > +{
> > +	struct xfs_trans	*tp;
> > +	struct xfs_buf		*agi_bp;
> > +	struct xfs_buf		*agf_bp;
> > +	struct xfs_agi		*agi;
> > +	struct xfs_agf		*agf;
> > +	xfs_agnumber_t		agno;
> > +	xfs_ino_t		icount = 0;
> > +	xfs_ino_t		ifree = 0;
> > +	xfs_filblks_t		fdblocks = 0;
> > +	int64_t			delta_icount;
> > +	int64_t			delta_ifree;
> > +	int64_t			delta_fdblocks;
> > +	int			error;
> > +
> > +	trace_xfs_repair_reset_counters(mp);
> > +
> > +	error = xfs_trans_alloc_empty(mp, &tp);
> > +	if (error)
> > +		return error;
> > +
> > +	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
> > +		/* Count all the inodes... */
> > +		error = xfs_ialloc_read_agi(mp, tp, agno, &agi_bp);
> > +		if (error)
> > +			goto out;
> > +		agi = XFS_BUF_TO_AGI(agi_bp);
> > +		icount += be32_to_cpu(agi->agi_count);
> > +		ifree += be32_to_cpu(agi->agi_freecount);
> > +
> > +		/* Add up the free/freelist/bnobt/cntbt blocks... */
> > +		error = xfs_alloc_read_agf(mp, tp, agno, 0, &agf_bp);
> > +		if (error)
> > +			goto out;
> > +		if (!agf_bp) {
> > +			error = -ENOMEM;
> > +			goto out;
> > +		}
> > +		agf = XFS_BUF_TO_AGF(agf_bp);
> > +		fdblocks += be32_to_cpu(agf->agf_freeblks);
> > +		fdblocks += be32_to_cpu(agf->agf_flcount);
> > +		fdblocks += be32_to_cpu(agf->agf_btreeblks);
> > +	}
> > +
> > +	/*
> > +	 * Reinitialize the counters.  The on-disk and in-core counters
> > +	 * differ by the number of inodes/blocks reserved by the admin,
> > +	 * the per-AG reservation, and any transactions in progress, so
> > +	 * we have to account for that.
> > +	 */
> > +	spin_lock(&mp->m_sb_lock);
> > +	delta_icount = (int64_t)mp->m_sb.sb_icount - icount;
> > +	delta_ifree = (int64_t)mp->m_sb.sb_ifree - ifree;
> > +	delta_fdblocks = (int64_t)mp->m_sb.sb_fdblocks - fdblocks;
> > +	mp->m_sb.sb_icount = icount;
> > +	mp->m_sb.sb_ifree = ifree;
> > +	mp->m_sb.sb_fdblocks = fdblocks;
> > +	spin_unlock(&mp->m_sb_lock);
> 
> This is largely a copy and paste of the code run by mount after log
> recovery on an unclean mount. Can you factor this into a single
> function?

Ok.

> > +
> > +	if (delta_icount) {
> > +		error = xfs_mod_icount(mp, delta_icount);
> > +		if (error)
> > +			goto out;
> > +	}
> > +	if (delta_ifree) {
> > +		error = xfs_mod_ifree(mp, delta_ifree);
> > +		if (error)
> > +			goto out;
> > +	}
> > +	if (delta_fdblocks) {
> > +		error = xfs_mod_fdblocks(mp, delta_fdblocks, false);
> > +		if (error)
> > +			goto out;
> > +	}
> > +
> > +out:
> > +	xfs_trans_cancel(tp);
> > +	return error;
> 
> Ummmm - why do we do all this superblock modification work in a
> transaction context, and then just cancel it?

It's an empty transaction, so nothing gets dirty and nothing needs
committing.  TBH I don't even think this is necessary, since we only use
it for reading the agi/agf, and for that we can certainly _buf_relse
instead of letting the _trans_cancel do it.
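
e.g. the AGI half of the summing loop would become something like this
(sketch, reusing the declarations from the function as posted), with no
transaction at all:

	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		error = xfs_ialloc_read_agi(mp, NULL, agno, &agi_bp);
		if (error)
			return error;
		agi = XFS_BUF_TO_AGI(agi_bp);
		icount += be32_to_cpu(agi->agi_count);
		ifree += be32_to_cpu(agi->agi_freecount);
		xfs_buf_relse(agi_bp);
	}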

> 
> > +}
> > +
> > +/* Figure out how many blocks to reserve for an AG repair. */
> > +xfs_extlen_t
> > +xfs_repair_calc_ag_resblks(
> > +	struct xfs_scrub_context	*sc)
> > +{
> > +	struct xfs_mount		*mp = sc->mp;
> > +	struct xfs_scrub_metadata	*sm = sc->sm;
> > +	struct xfs_agi			*agi;
> > +	struct xfs_agf			*agf;
> > +	struct xfs_buf			*bp;
> > +	xfs_agino_t			icount;
> > +	xfs_extlen_t			aglen;
> > +	xfs_extlen_t			usedlen;
> > +	xfs_extlen_t			freelen;
> > +	xfs_extlen_t			bnobt_sz;
> > +	xfs_extlen_t			inobt_sz;
> > +	xfs_extlen_t			rmapbt_sz;
> > +	xfs_extlen_t			refcbt_sz;
> > +	int				error;
> > +
> > +	if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
> > +		return 0;
> > +
> > +	/*
> > +	 * Try to get the actual counters from disk; if not, make
> > +	 * some worst case assumptions.
> > +	 */
> > +	error = xfs_read_agi(mp, NULL, sm->sm_agno, &bp);
> > +	if (!error) {
> > +		agi = XFS_BUF_TO_AGI(bp);
> > +		icount = be32_to_cpu(agi->agi_count);
> > +		xfs_trans_brelse(NULL, bp);
> 
> xfs_buf_relse(bp)?
> 
> And, well, why not just use pag->pagi_count rather than having to
> read it from the on-disk buffer?

Hm, good point.
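
i.e. something like this sketch, falling back to the worst-case guess
if the AGI hasn't been read in yet:

	struct xfs_perag	*pag;

	pag = xfs_perag_get(mp, sm->sm_agno);
	if (pag->pagi_init)
		icount = pag->pagi_count;
	else
		icount = mp->m_sb.sb_agblocks / mp->m_sb.sb_inopblock;
	xfs_perag_put(pag);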

> > +	} else {
> > +		icount = mp->m_sb.sb_agblocks / mp->m_sb.sb_inopblock;
> > +	}
> > +
> > +	error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp);
> > +	if (!error && bp) {
> > +		agf = XFS_BUF_TO_AGF(bp);
> > +		aglen = be32_to_cpu(agf->agf_length);
> > +		freelen = be32_to_cpu(agf->agf_freeblks);
> > +		usedlen = aglen - freelen;
> > +		xfs_trans_brelse(NULL, bp);
> > +	} else {
> > +		aglen = mp->m_sb.sb_agblocks;
> > +		freelen = aglen;
> > +		usedlen = aglen;
> > +	}
> 
> ditto for there - it's all in the xfs_perag, right?
> 
> > +
> > +	trace_xfs_repair_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
> > +			freelen, usedlen);
> > +
> > +	/*
> > +	 * Figure out how many blocks we'd need worst case to rebuild
> > +	 * each type of btree.  Note that we can only rebuild the
> > +	 * bnobt/cntbt or inobt/finobt as pairs.
> > +	 */
> > +	bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
> > +	if (xfs_sb_version_hassparseinodes(&mp->m_sb))
> > +		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
> > +				XFS_INODES_PER_HOLEMASK_BIT);
> > +	else
> > +		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
> > +				XFS_INODES_PER_CHUNK);
> > +	if (xfs_sb_version_hasfinobt(&mp->m_sb))
> > +		inobt_sz *= 2;
> > +	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
> > +		rmapbt_sz = xfs_rmapbt_calc_size(mp, aglen);
> > +		refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
> > +	} else {
> > +		rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
> > +		refcbt_sz = 0;
> > +	}
> > +	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
> > +		rmapbt_sz = 0;
> > +
> > +	trace_xfs_repair_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
> > +			inobt_sz, rmapbt_sz, refcbt_sz);
> > +
> > +	return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
> 
> Why are we only returning the resblks for a single tree rebuild?
> > Shouldn't it return the total blocks required to rebuild all the
> trees?

A repair call only ever rebuilds one btree in one AG, so I don't see why
we'd need to reserve space to rebuild all the btrees in the AG (or the
same btree in all AGs)... though it's possible that I don't understand
the question?

--D

> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Dave Chinner April 9, 2018, 12:23 a.m. UTC | #3
On Sat, Apr 07, 2018 at 10:55:42AM -0700, Darrick J. Wong wrote:
> On Fri, Apr 06, 2018 at 10:52:51AM +1000, Dave Chinner wrote:
> > On Mon, Apr 02, 2018 at 12:57:18PM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > Add some helper functions for repair functions that will help us to
> > > allocate and initialize new metadata blocks for btrees that we're
> > > rebuilding.
> > 
> > This is more than "helper functions" - my reply is almost 700 lines
> > long!
> > 
> > i.e. This is a stack of extent invalidation, freeing and rmap/free
> > space rebuild code. Needs a lot more description and context than
> > "helper functions".
> 
> I agree now; this patch has become overly long and incohesive.  It could
> be broken into the following pieces:
> 
> - modifying transaction allocations to take resblks
> 	or "ensuring sufficient free space to rebuild"
> 
> - rolling transactions
> 
> - allocation and reinit functions
> 
> - fixing/putting on the freelist
> 
> - dealing with leftover blocks after we rebuild a tree
> 
> - finding btree roots
> 
> - resetting superblock counters
> 
> So I'll break this up into seven smaller pieces.

Thanks!

> 
> > .....
> > > @@ -574,7 +575,10 @@ xfs_scrub_setup_fs(
> > >  	struct xfs_scrub_context	*sc,
> > >  	struct xfs_inode		*ip)
> > >  {
> > > -	return xfs_scrub_trans_alloc(sc->sm, sc->mp, &sc->tp);
> > > +	uint				resblks;
> > > +
> > > +	resblks = xfs_repair_calc_ag_resblks(sc);
> > > +	return xfs_scrub_trans_alloc(sc->sm, sc->mp, resblks, &sc->tp);
> > >  }
> > 
> > What's the block reservation for?
> 
> We rebuild a metadata structure by constructing a completely new btree
> with blocks we got from the free space and switching the root when we're
> done.  This isn't treated any differently from any other btree shape
> change, so we need to reserve blocks in the transaction.

Can you put this in a comment?

> > > +	args.tp = sc->tp;
> > > +	args.mp = sc->mp;
> > > +	args.oinfo = *oinfo;
> > > +	args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.agno, 0);
> > 
> > So our target BNO is the start of the AG.
> > 
> > > +	args.minlen = 1;
> > > +	args.maxlen = 1;
> > > +	args.prod = 1;
> > > +	args.type = XFS_ALLOCTYPE_NEAR_BNO;
> > 
> > And we do a "near" search for a single block. i.e. we are looking
> > for a single block near to the start of the AG. Basically, we are
> > looking for the extent at the left edge of the by-bno tree, which
> > may not be a single block.
> > 
> > Would it be better to do a XFS_ALLOCTYPE_THIS_AG allocation here so
> > we look up the by-size btree for a single block extent? The left
> > edge of that tree will always be the smallest free extent closest to
> > the start of the AG, and so using THIS_AG will tend to fill
> > small freespace holes rather than tear up larger free spaces for
> > single block allocations.....
> 
> Makes sense.  Originally I was thinking that we try to put the rebuilt
> btree as close to the start of the AG as possible, but there's little
> point in doing that so long as regular btree block allocation doesn't
> bother.
> 
> TBH I've been wondering on the side if it makes more sense for us to
> detect things like dm-{thinp,cache} which have large(ish) underlying
> blocks and try to consolidate metadata into a smaller number of those
> underlying blocks at the start (or the end) of the AG?  It could be
> useful to pack the metadata into a smaller number of underlying blocks
> so that if one piece of metadata gets hot enough the rest will end up in
> the fast cache as well.

Separate issue, probably can be done with an in-memory threshold
for metadata allocation (e.g. search for BNO < 5% from start of AG).
I've thought about doing something like that (reserved area for
metadata) for some time, but I've never really seen significant
advantages to doing it...

> > > +/* Put a block back on the AGFL. */
> > > +int
> > > +xfs_repair_put_freelist(
> > > +	struct xfs_scrub_context	*sc,
> > > +	xfs_agblock_t			agbno)
> > > +{
> > > +	struct xfs_owner_info		oinfo;
> > > +	int				error;
> > > +
> > > +	/*
> > > +	 * Since we're "freeing" a lost block onto the AGFL, we have to
> > > +	 * create an rmap for the block prior to merging it or else other
> > > +	 * parts will break.
> > > +	 */
> > > +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> > > +	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1,
> > > +			&oinfo);
> > > +	if (error)
> > > +		return error;
> > > +
> > > +	/* Put the block on the AGFL. */
> > > +	error = xfs_alloc_put_freelist(sc->tp, sc->sa.agf_bp, sc->sa.agfl_bp,
> > > +			agbno, 0);
> > 
> > At what point do we check there's actually room in the AGFL for this
> > block?
> > 
> > > +	if (error)
> > > +		return error;
> > > +	xfs_extent_busy_insert(sc->tp, sc->sa.agno, agbno, 1,
> > > +			XFS_EXTENT_BUSY_SKIP_DISCARD);
> > > +
> > > +	/* Make sure the AGFL doesn't overfill. */
> > > +	return xfs_repair_fix_freelist(sc, true);
> > 
> > i.e. shouldn't this be done first?
> 
> 
>

???

> > > +		else if (resv == XFS_AG_RESV_AGFL)
> > > +			error = xfs_repair_put_freelist(sc, agbno);
> > > +		else
> > > +			error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv);
> > > +		if (error)
> > > +			break;
> > > +
> > > +		if (sc->ip)
> > > +			error = xfs_trans_roll_inode(&sc->tp, sc->ip);
> > > +		else
> > > +			error = xfs_repair_roll_ag_trans(sc);
> > 
> > why don't we break on error here?
> 
> We do, but it's up in the for loop.

I missed that completely. I'd much prefer we don't hide "break on
error" conditions inside the for loop machinery because it is so
easy to miss...

> > > +
> > > +	trace_xfs_repair_collect_btree_extent(sc->mp,
> > > +			XFS_FSB_TO_AGNO(sc->mp, fsbno),
> > > +			XFS_FSB_TO_AGBNO(sc->mp, fsbno), len);
> > > +
> > > +	rae = kmem_alloc(sizeof(struct xfs_repair_extent),
> > > +			KM_MAYFAIL | KM_NOFS);
> > 
> > Why KM_NOFS?
> 
> The caller holds a transaction and (potentially) has locked an entire
> AG, so we want to avoid recursing into the filesystem to free up memory.

If we're in transaction context, we are already under a KM_NOFS
allocation context. So it shouldn't be needed here - if we are
called outside transaction context, then we need a comment
explaining it (as opposed to using KM_NOFS to shut up lockdep).

> > > +	for_each_xfs_repair_extent_safe(rae, n, exlist) {
> > > +		agno = XFS_FSB_TO_AGNO(sc->mp, rae->fsbno);
> > > +		agbno = XFS_FSB_TO_AGBNO(sc->mp, rae->fsbno);
> > > +		for (i = 0; i < rae->len; i++) {
> > > +			bp = xfs_btree_get_bufs(sc->mp, sc->tp, agno,
> > > +					agbno + i, 0);
> > > +			xfs_trans_binval(sc->tp, bp);
> > > +		}
> > > +	}
> > 
> > What if they are data blocks we are dumping? xfs_trans_binval is
> > fine for metadata blocks, but creating a stale metadata buffer and
> > logging a buffer cancel for user data blocks doesn't make any sense
> > to me....
> 
> exlist is only supposed to contain references to metadata blocks that we
> collected from the rmapbt.  The current iteration of the repair code
> shouldn't ever dump file data blocks.

That needs explanation, then. I had no idea that exlist was only for
metadata blocks...

> > > +/* Remove all the blocks in sublist from exlist. */
> > > +#define LEFT_CONTIG	(1 << 0)
> > > +#define RIGHT_CONTIG	(1 << 1)
> > 
> > Namespace, please.
> > 
> > > +int
> > > +xfs_repair_subtract_extents(
> > > +	struct xfs_scrub_context	*sc,
> > > +	struct xfs_repair_extent_list	*exlist,
> > > +	struct xfs_repair_extent_list	*sublist)
> > 
> > What is this function supposed to do? I can't really review the code
> > > (looks complex) because I don't know what its function is supposed
> > to be.
> 
> The intent is that we iterate the rmapbt for all of its records for a
> given owner to generate exlist.  Then we iterate all blocks of metadata
> structures that we are not rebuilding but have the same rmapbt owner to
> generate sublist.  This routine subtracts all the blocks mentioned in
> sublist from all the extents linked in exlist, which leaves exlist as
> the list of all blocks owned by the old version of btree that we're
> rebuilding.  The list is fed into _reap_btree_extents to free the old
> btree blocks (or merely remove the rmap if the block is crosslinked).

Comment, please :)

> > > +
> > > +	if (delta_icount) {
> > > +		error = xfs_mod_icount(mp, delta_icount);
> > > +		if (error)
> > > +			goto out;
> > > +	}
> > > +	if (delta_ifree) {
> > > +		error = xfs_mod_ifree(mp, delta_ifree);
> > > +		if (error)
> > > +			goto out;
> > > +	}
> > > +	if (delta_fdblocks) {
> > > +		error = xfs_mod_fdblocks(mp, delta_fdblocks, false);
> > > +		if (error)
> > > +			goto out;
> > > +	}
> > > +
> > > +out:
> > > +	xfs_trans_cancel(tp);
> > > +	return error;
> > 
> > Ummmm - why do we do all this superblock modification work in a
> > transaction context, and then just cancel it?
> 
> It's an empty transaction, so nothing gets dirty and nothing needs
> committing.  TBH I don't even think this is necessary, since we only use
> it for reading the agi/agf, and for that we can certainly _buf_relse
> instead of letting the _trans_cancel do it.

Yeah, it'd be better to get rid of the transaction context if we
can. It should also explain how it avoids races with ongoing
superblock counter updates...

> > > +		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
> > > +				XFS_INODES_PER_CHUNK);
> > > +	if (xfs_sb_version_hasfinobt(&mp->m_sb))
> > > +		inobt_sz *= 2;
> > > +	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
> > > +		rmapbt_sz = xfs_rmapbt_calc_size(mp, aglen);
> > > +		refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
> > > +	} else {
> > > +		rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
> > > +		refcbt_sz = 0;
> > > +	}
> > > +	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
> > > +		rmapbt_sz = 0;
> > > +
> > > +	trace_xfs_repair_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
> > > +			inobt_sz, rmapbt_sz, refcbt_sz);
> > > +
> > > +	return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
> > 
> > Why are we only returning the resblks for a single tree rebuild?
> > > Shouldn't it return the total blocks required to rebuild all the
> > trees?
> 
> A repair call only ever rebuilds one btree in one AG, so I don't see why
> we'd need to reserve space to rebuild all the btrees in the AG (or the
> same btree in all AGs)... though it's possible that I don't understand
> the question?

It's not clear that we are only doing a reservation for a single
tree in an AG. It needs a comment to explain the context the
reservation is being made for.
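
e.g. something like this above the final max() would do:

	/*
	 * A repair call only ever rebuilds one btree in one AG, so the
	 * worst case reservation is the largest single tree rebuild,
	 * not the sum of all the trees.
	 */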

Cheers,

Dave.
Darrick J. Wong April 9, 2018, 5:19 p.m. UTC | #4
On Mon, Apr 09, 2018 at 10:23:42AM +1000, Dave Chinner wrote:
> On Sat, Apr 07, 2018 at 10:55:42AM -0700, Darrick J. Wong wrote:
> > On Fri, Apr 06, 2018 at 10:52:51AM +1000, Dave Chinner wrote:
> > > On Mon, Apr 02, 2018 at 12:57:18PM -0700, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > 
> > > > Add some helper functions for repair functions that will help us to
> > > > allocate and initialize new metadata blocks for btrees that we're
> > > > rebuilding.
> > > 
> > > This is more than "helper functions" - my replay is almost 700 lines
> > > long!
> > > 
> > > i.e. This is a stack of extent invalidation, freeing and rmap/free
> > > space rebuild code. Needs a lot more description and context than
> > > "helper functions".
> > 
> > I agree now; this patch has become overly long and incohesive.  It could
> > be broken into the following pieces:
> > 
> > - modifying transaction allocations to take resblks
> > 	or "ensuring sufficient free space to rebuild"
> > 
> > - rolling transactions
> > 
> > - allocation and reinit functions
> > 
> > - fixing/putting on the freelist
> > 
> > - dealing with leftover blocks after we rebuild a tree
> > 
> > - finding btree roots
> > 
> > - resetting superblock counters
> > 
> > So I'll break this up into seven smaller pieces.
> 
> Thanks!
> 
> > 
> > > .....
> > > > @@ -574,7 +575,10 @@ xfs_scrub_setup_fs(
> > > >  	struct xfs_scrub_context	*sc,
> > > >  	struct xfs_inode		*ip)
> > > >  {
> > > > -	return xfs_scrub_trans_alloc(sc->sm, sc->mp, &sc->tp);
> > > > +	uint				resblks;
> > > > +
> > > > +	resblks = xfs_repair_calc_ag_resblks(sc);
> > > > +	return xfs_scrub_trans_alloc(sc->sm, sc->mp, resblks, &sc->tp);
> > > >  }
> > > 
> > > What's the block reservation for?
> > 
> > We rebuild a metadata structure by constructing a completely new btree
> > with blocks we got from the free space and switching the root when we're
> > done.  This isn't treated any differently from any other btree shape
> > change, so we need to reserve blocks in the transaction.
> 
> Can you put this in a comment?

Added to the comment above xfs_scrub_trans_alloc.

> > > > +	args.tp = sc->tp;
> > > > +	args.mp = sc->mp;
> > > > +	args.oinfo = *oinfo;
> > > > +	args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.agno, 0);
> > > 
> > > So our target BNO is the start of the AG.
> > > 
> > > > +	args.minlen = 1;
> > > > +	args.maxlen = 1;
> > > > +	args.prod = 1;
> > > > +	args.type = XFS_ALLOCTYPE_NEAR_BNO;
> > > 
> > > And we do a "near" search" for a single block. i.e. we are looking
> > > for a single block near to the start of the AG. Basically, we are
> > > looking for the extent at the left edge of the by-bno tree, which
> > > may not be a single block.
> > > 
> > > Would it be better to do a XFS_ALLOCTYPE_THIS_AG allocation here so
> > > we look up the by-size btree for a single block extent? The left
> > > edge of that tree will always be the smallest free extent closest to
> > > the start of the AG, and so using THIS_AG will tend to fill
> > > small freespace holes rather than tear up larger free spaces for
> > > single block allocations.....
> > 
> > Makes sense.  Originally I was thinking that we try to put the rebuilt
> > btree as close to the start of the AG as possible, but there's little
> > point in doing that so long as regular btree block allocation doesn't
> > bother.
> > 
> > TBH I've been wondering on the side if it makes more sense for us to
> > detect things like dm-{thinp,cache} which have large(ish) underlying
> > blocks and try to consolidate metadata into a smaller number of those
> > underlying blocks at the start (or the end) of the AG?  It could be
> > useful to pack the metadata into a smaller number of underlying blocks
> > so that if one piece of metadata gets hot enough the rest will end up in
> > the fast cache as well.
> 
> Separate issue, probably can be done with an in-memory threshold
> for metadata allocation (e.g. search for BNO < 5% from start of AG).
> I've thought about doing somthing like that (reserved area for
> metadata) for some time, but I've never really seen signficant
> advantages to doing it...

<nod> I suppose on a spinny disk it might cut down scrub/repair time,
though maybe for pmem it'll prove advantageous to try to keep free space
as unfragmented as possible to increase the likelihood of large page
use.  Oh well, that's a research question for another thread. :)

> > > > +/* Put a block back on the AGFL. */
> > > > +int
> > > > +xfs_repair_put_freelist(
> > > > +	struct xfs_scrub_context	*sc,
> > > > +	xfs_agblock_t			agbno)
> > > > +{
> > > > +	struct xfs_owner_info		oinfo;
> > > > +	int				error;
> > > > +
> > > > +	/*
> > > > +	 * Since we're "freeing" a lost block onto the AGFL, we have to
> > > > +	 * create an rmap for the block prior to merging it or else other
> > > > +	 * parts will break.
> > > > +	 */
> > > > +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> > > > +	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1,
> > > > +			&oinfo);
> > > > +	if (error)
> > > > +		return error;
> > > > +
> > > > +	/* Put the block on the AGFL. */
> > > > +	error = xfs_alloc_put_freelist(sc->tp, sc->sa.agf_bp, sc->sa.agfl_bp,
> > > > +			agbno, 0);
> > > 
> > > At what point do we check there's actually room in the AGFL for this
> > > block?
> > > 
> > > > +	if (error)
> > > > +		return error;
> > > > +	xfs_extent_busy_insert(sc->tp, sc->sa.agno, agbno, 1,
> > > > +			XFS_EXTENT_BUSY_SKIP_DISCARD);
> > > > +
> > > > +	/* Make sure the AGFL doesn't overfill. */
> > > > +	return xfs_repair_fix_freelist(sc, true);
> > > 
> > > i.e. shouldn't this be done first?
> > 
> > 
> >
> 
> ???

That was supposed to be "Yes".

> 
> > > > +		else if (resv == XFS_AG_RESV_AGFL)
> > > > +			error = xfs_repair_put_freelist(sc, agbno);
> > > > +		else
> > > > +			error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv);
> > > > +		if (error)
> > > > +			break;
> > > > +
> > > > +		if (sc->ip)
> > > > +			error = xfs_trans_roll_inode(&sc->tp, sc->ip);
> > > > +		else
> > > > +			error = xfs_repair_roll_ag_trans(sc);
> > > 
> > > why don't we break on error here?
> > 
> > We do, but it's up in the for loop.
> 
> I missed that completely. I'd much prefer we don't hide "break on
> error" conditions inside the for loop machinery because it is so
> easy to miss...

I refactored the loop body into a separate helper, which decreased the
indentation and made the loop control clearer.

> 
> > > > +
> > > > +	trace_xfs_repair_collect_btree_extent(sc->mp,
> > > > +			XFS_FSB_TO_AGNO(sc->mp, fsbno),
> > > > +			XFS_FSB_TO_AGBNO(sc->mp, fsbno), len);
> > > > +
> > > > +	rae = kmem_alloc(sizeof(struct xfs_repair_extent),
> > > > +			KM_MAYFAIL | KM_NOFS);
> > > 
> > > Why KM_NOFS?
> > 
> > The caller holds a transaction and (potentially) has locked an entire
> > AG, so we want to avoid recursing into the filesystem to free up memory.
> 
> If we're in transaction context, we are already under a KM_NOFS
> allocation contexts. SO it shouldn't be needed here - if we are
> called outside transaction context, then we need a comment
> explaining it (as opposed to using KM_NOFS to shut up lockdep).

Ok, I'll get rid of the KM_NOFS in this and all the other patches.

(I admit I probably /have/ been stuffing in KM_NOFS to shut up
lockdep...)

> > > > +	for_each_xfs_repair_extent_safe(rae, n, exlist) {
> > > > +		agno = XFS_FSB_TO_AGNO(sc->mp, rae->fsbno);
> > > > +		agbno = XFS_FSB_TO_AGBNO(sc->mp, rae->fsbno);
> > > > +		for (i = 0; i < rae->len; i++) {
> > > > +			bp = xfs_btree_get_bufs(sc->mp, sc->tp, agno,
> > > > +					agbno + i, 0);
> > > > +			xfs_trans_binval(sc->tp, bp);
> > > > +		}
> > > > +	}
> > > 
> > > What if they are data blocks we are dumping? xfs_trans_binval is
> > > fine for metadata blocks, but creating a stale metadata buffer and
> > > logging a buffer cancel for user data blocks doesn't make any sense
> > > to me....
> > 
> > exlist is only supposed to contain references to metadata blocks that we
> > collected from the rmapbt.  The current iteration of the repair code
> > shouldn't ever dump file data blocks.
> 
> That needs explanation, then. I had no idea that exlist was only for
> metadata blocks...

Ok, comment changed to:

/*
 * Invalidate buffers for per-AG btree blocks we're dumping.  We assume
 * that exlist points only to metadata blocks.
 */

> > > > +/* Remove all the blocks in sublist from exlist. */
> > > > +#define LEFT_CONTIG	(1 << 0)
> > > > +#define RIGHT_CONTIG	(1 << 1)
> > > 
> > > Namespace, please.
> > > 
> > > > +int
> > > > +xfs_repair_subtract_extents(
> > > > +	struct xfs_scrub_context	*sc,
> > > > +	struct xfs_repair_extent_list	*exlist,
> > > > +	struct xfs_repair_extent_list	*sublist)
> > > 
> > > What is this function supposed to do? I can't really review the code
> > > (looks complex) because I don't know what it's function is supposed
> > > to be.
> > 
> > The intent is that we iterate the rmapbt for all of its records for a
> > given owner to generate exlist.  Then we iterate all blocks of metadata
> > structures that we are not rebuilding but have the same rmapbt owner to
> > generate sublist.  This routine subtracts all the blocks mentioned in
> > sublist from all the extents linked in exlist, which leaves exlist as
> > the list of all blocks owned by the old version of btree that we're
> > rebuilding.  The list is fed into _reap_btree_extents to free the old
> > btree blocks (or merely remove the rmap if the block is crosslinked).
> 
> Comment, please :)

Added.

> > > > +
> > > > +	if (delta_icount) {
> > > > +		error = xfs_mod_icount(mp, delta_icount);
> > > > +		if (error)
> > > > +			goto out;
> > > > +	}
> > > > +	if (delta_ifree) {
> > > > +		error = xfs_mod_ifree(mp, delta_ifree);
> > > > +		if (error)
> > > > +			goto out;
> > > > +	}
> > > > +	if (delta_fdblocks) {
> > > > +		error = xfs_mod_fdblocks(mp, delta_fdblocks, false);
> > > > +		if (error)
> > > > +			goto out;
> > > > +	}
> > > > +
> > > > +out:
> > > > +	xfs_trans_cancel(tp);
> > > > +	return error;
> > > 
> > > Ummmm - why do we do all this superblock modification work in a
> > > transaction context, and then just cancel it?
> > 
> > It's an empty transaction, so nothing gets dirty and nothing needs
> > committing.  TBH I don't even think this is necessary, since we only use
> > it for reading the agi/agf, and for that we can certainly _buf_relse
> > instead of letting the _trans_cancel do it.
> 
> Yeah, it'd be better to get rid of the transaction context if we
> can. It should also explain how it avoids races with ongoing
> superblock counter updates...

<nod>  AFAICT it's a simple matter of taking m_sb_lock to update the
m_sb counters, after which we unlock and do the percpu counters.

> > > > +		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
> > > > +				XFS_INODES_PER_CHUNK);
> > > > +	if (xfs_sb_version_hasfinobt(&mp->m_sb))
> > > > +		inobt_sz *= 2;
> > > > +	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
> > > > +		rmapbt_sz = xfs_rmapbt_calc_size(mp, aglen);
> > > > +		refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
> > > > +	} else {
> > > > +		rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
> > > > +		refcbt_sz = 0;
> > > > +	}
> > > > +	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
> > > > +		rmapbt_sz = 0;
> > > > +
> > > > +	trace_xfs_repair_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
> > > > +			inobt_sz, rmapbt_sz, refcbt_sz);
> > > > +
> > > > +	return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
> > > 
> > > Why are we only returning the resblks for a single tree rebuild?
> > > Shouldn't it return the total blocks require to rebuild all all the
> > > trees?
> > 
> > A repair call only ever rebuilds one btree in one AG, so I don't see why
> > we'd need to reserve space to rebuild all the btrees in the AG (or the
> > same btree in all AGs)... though it's possible that I don't understand
> > the question?
> 
> It's not clear that we are only doing a reservation for a single
> tree in an AG. needs a comment to explain the context the
> reservation is being made for.

/*
 * Figure out how many blocks to reserve for an AG repair.  We calculate
 * the worst case estimate for the number of blocks we'd need to rebuild
 * one of any type of per-AG btree.
 */

--D

> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 639d14b..cecc447 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -74,8 +74,7 @@  xfs_scrub_setup_inode_bmap(
 			goto out;
 	}
 
-	/* Got the inode, lock it and we're ready to go. */
-	error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
+	error = xfs_scrub_trans_alloc(sc->sm, mp, 0, &sc->tp);
 	if (error)
 		goto out;
 	sc->ilock_flags |= XFS_ILOCK_EXCL;
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index ddb0ff6..056d46e 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -49,6 +49,7 @@ 
 #include "scrub/common.h"
 #include "scrub/trace.h"
 #include "scrub/btree.h"
+#include "scrub/repair.h"
 
 /* Common code for the metadata scrubbers. */
 
@@ -574,7 +575,10 @@  xfs_scrub_setup_fs(
 	struct xfs_scrub_context	*sc,
 	struct xfs_inode		*ip)
 {
-	return xfs_scrub_trans_alloc(sc->sm, sc->mp, &sc->tp);
+	uint				resblks;
+
+	resblks = xfs_repair_calc_ag_resblks(sc);
+	return xfs_scrub_trans_alloc(sc->sm, sc->mp, resblks, &sc->tp);
 }
 
 /* Set us up with AG headers and btree cursors. */
@@ -705,7 +709,7 @@  xfs_scrub_setup_inode_contents(
 	/* Got the inode, lock it and we're ready to go. */
 	sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
 	xfs_ilock(sc->ip, sc->ilock_flags);
-	error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
+	error = xfs_scrub_trans_alloc(sc->sm, mp, resblks, &sc->tp);
 	if (error)
 		goto out;
 	sc->ilock_flags |= XFS_ILOCK_EXCL;
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 140e90c..e6875f8 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -41,13 +41,22 @@  xfs_scrub_should_terminate(
 /*
  * Grab an empty transaction so that we can re-grab locked buffers if
  * one of our btrees turns out to be cyclic.
+ *
+ * If we're going to repair something, we need to ask for the largest
+ * possible log reservation so that we can handle the worst case
+ * scenario for rebuilding a metadata item.
  */
 static inline int
 xfs_scrub_trans_alloc(
 	struct xfs_scrub_metadata	*sm,
 	struct xfs_mount		*mp,
+	uint				blocks,
 	struct xfs_trans		**tpp)
 {
+	if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+		return xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
+				blocks, 0, 0, tpp);
+
 	return xfs_trans_alloc_empty(mp, tpp);
 }
 
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index df14930..28a0d92 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -68,7 +68,7 @@  xfs_scrub_setup_inode(
 		break;
 	case -EFSCORRUPTED:
 	case -EFSBADCRC:
-		return xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
+		return xfs_scrub_trans_alloc(sc->sm, mp, 0, &sc->tp);
 	default:
 		return error;
 	}
@@ -76,7 +76,7 @@  xfs_scrub_setup_inode(
 	/* Got the inode, lock it and we're ready to go. */
 	sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
 	xfs_ilock(sc->ip, sc->ilock_flags);
-	error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
+	error = xfs_scrub_trans_alloc(sc->sm, mp, 0, &sc->tp);
 	if (error)
 		goto out;
 	sc->ilock_flags |= XFS_ILOCK_EXCL;
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 69ffb94..11c2257 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -63,3 +63,819 @@  xfs_repair_probe(
 
 	return 0;
 }
+
+/*
+ * Roll a transaction, keeping the AG headers locked and reinitializing
+ * the btree cursors.
+ */
+int
+xfs_repair_roll_ag_trans(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_trans		*tp;
+	int				error;
+
+	/* Keep the AG header buffers locked so we can keep going. */
+	xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
+	xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
+	xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
+
+	/* Roll the transaction. */
+	tp = sc->tp;
+	error = xfs_trans_roll(&sc->tp);
+	if (error)
+		return error;
+
+	/* Join the buffer to the new transaction or release the hold. */
+	if (sc->tp != tp) {
+		xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
+		xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
+		xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
+	} else {
+		xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
+		xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
+		xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
+	}
+
+	return error;
+}
+
+/*
+ * Does the given AG have enough space to rebuild a btree?  Neither AG
+ * reservation can be critical, and we must have enough space (factoring
+ * in AG reservations) to construct a whole btree.
+ */
+bool
+xfs_repair_ag_has_space(
+	struct xfs_perag		*pag,
+	xfs_extlen_t			nr_blocks,
+	enum xfs_ag_resv_type		type)
+{
+	return  !xfs_ag_resv_critical(pag, XFS_AG_RESV_AGFL) &&
+		!xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA) &&
+		pag->pagf_freeblks > xfs_ag_resv_needed(pag, type) + nr_blocks;
+}
+
+/* Allocate a block in an AG. */
+int
+xfs_repair_alloc_ag_block(
+	struct xfs_scrub_context	*sc,
+	struct xfs_owner_info		*oinfo,
+	xfs_fsblock_t			*fsbno,
+	enum xfs_ag_resv_type		resv)
+{
+	struct xfs_alloc_arg		args = {0};
+	xfs_agblock_t			bno;
+	int				error;
+
+	if (resv == XFS_AG_RESV_AGFL) {
+		error = xfs_alloc_get_freelist(sc->tp, sc->sa.agf_bp, &bno, 1);
+		if (error)
+			return error;
+		if (bno == NULLAGBLOCK)
+			return -ENOSPC;
+		xfs_extent_busy_reuse(sc->mp, sc->sa.agno, bno,
+				1, false);
+		*fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, bno);
+		return 0;
+	}
+
+	args.tp = sc->tp;
+	args.mp = sc->mp;
+	args.oinfo = *oinfo;
+	args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.agno, 0);
+	args.minlen = 1;
+	args.maxlen = 1;
+	args.prod = 1;
+	args.type = XFS_ALLOCTYPE_NEAR_BNO;
+	args.resv = resv;
+
+	error = xfs_alloc_vextent(&args);
+	if (error)
+		return error;
+	if (args.fsbno == NULLFSBLOCK)
+		return -ENOSPC;
+	ASSERT(args.len == 1);
+	*fsbno = args.fsbno;
+
+	return 0;
+}
+
+/* Initialize an AG block to a zeroed out btree header. */
+int
+xfs_repair_init_btblock(
+	struct xfs_scrub_context	*sc,
+	xfs_fsblock_t			fsb,
+	struct xfs_buf			**bpp,
+	xfs_btnum_t			btnum,
+	const struct xfs_buf_ops	*ops)
+{
+	struct xfs_trans		*tp = sc->tp;
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_buf			*bp;
+
+	trace_xfs_repair_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb),
+			XFS_FSB_TO_AGBNO(mp, fsb), btnum);
+
+	ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.agno);
+	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, fsb),
+			XFS_FSB_TO_BB(mp, 1), 0);
+	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+	xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno,
+			XFS_BTREE_CRC_BLOCKS);
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
+	xfs_trans_log_buf(tp, bp, 0, bp->b_length);
+	bp->b_ops = ops;
+	*bpp = bp;
+
+	return 0;
+}
+
+/* Ensure the freelist is full. */
+int
+xfs_repair_fix_freelist(
+	struct xfs_scrub_context	*sc,
+	bool				can_shrink)
+{
+	struct xfs_alloc_arg		args = {0};
+	int				error;
+
+	args.mp = sc->mp;
+	args.tp = sc->tp;
+	args.agno = sc->sa.agno;
+	args.alignment = 1;
+	args.pag = xfs_perag_get(args.mp, sc->sa.agno);
+	args.resv = XFS_AG_RESV_AGFL;
+
+	error = xfs_alloc_fix_freelist(&args,
+			can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
+	xfs_perag_put(args.pag);
+
+	return error;
+}
+
+/* Put a block back on the AGFL. */
+int
+xfs_repair_put_freelist(
+	struct xfs_scrub_context	*sc,
+	xfs_agblock_t			agbno)
+{
+	struct xfs_owner_info		oinfo;
+	int				error;
+
+	/*
+	 * Since we're "freeing" a lost block onto the AGFL, we have to
+	 * create an rmap for the block prior to merging it or else other
+	 * parts will break.
+	 */
+	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
+	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1,
+			&oinfo);
+	if (error)
+		return error;
+
+	/* Put the block on the AGFL. */
+	error = xfs_alloc_put_freelist(sc->tp, sc->sa.agf_bp, sc->sa.agfl_bp,
+			agbno, 0);
+	if (error)
+		return error;
+	xfs_extent_busy_insert(sc->tp, sc->sa.agno, agbno, 1,
+			XFS_EXTENT_BUSY_SKIP_DISCARD);
+
+	/* Make sure the AGFL doesn't overfill. */
+	return xfs_repair_fix_freelist(sc, true);
+}
+
+/*
+ * For a given metadata extent and owner, delete the associated rmap.
+ * If the block has no other owners, free it.
+ */
+STATIC int
+xfs_repair_free_or_unmap_extent(
+	struct xfs_scrub_context	*sc,
+	xfs_fsblock_t			fsbno,
+	xfs_extlen_t			len,
+	struct xfs_owner_info		*oinfo,
+	enum xfs_ag_resv_type		resv)
+{
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_btree_cur		*rmap_cur;
+	struct xfs_buf			*agf_bp = NULL;
+	xfs_agnumber_t			agno;
+	xfs_agblock_t			agbno;
+	bool				has_other_rmap;
+	int				error = 0;
+
+	ASSERT(xfs_sb_version_hasrmapbt(&mp->m_sb));
+	agno = XFS_FSB_TO_AGNO(mp, fsbno);
+	agbno = XFS_FSB_TO_AGBNO(mp, fsbno);
+
+	trace_xfs_repair_free_or_unmap_extent(mp, agno, agbno, len);
+
+	for (; len > 0 && !error; len--, agbno++, fsbno++) {
+		ASSERT(sc->ip != NULL || agno == sc->sa.agno);
+
+		/* Can we find any other rmappings? */
+		if (sc->ip) {
+			error = xfs_alloc_read_agf(mp, sc->tp, agno, 0,
+					&agf_bp);
+			if (error)
+				break;
+			if (!agf_bp) {
+				error = -ENOMEM;
+				break;
+			}
+		}
+		rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp,
+				agf_bp ? agf_bp : sc->sa.agf_bp, agno);
+		error = xfs_rmap_has_other_keys(rmap_cur, agbno, 1, oinfo,
+				&has_other_rmap);
+		if (error)
+			goto out_cur;
+		xfs_btree_del_cursor(rmap_cur, XFS_BTREE_NOERROR);
+		if (agf_bp)
+			xfs_trans_brelse(sc->tp, agf_bp);
+
+		/*
+		 * If there are other rmappings, this block is cross
+		 * linked and must not be freed.  Remove the reverse
+		 * mapping and move on.  Otherwise, we were the only
+		 * owner of the block, so free the extent, which will
+		 * also remove the rmap.
+		 */
+		if (has_other_rmap)
+			error = xfs_rmap_free(sc->tp, agf_bp, agno, agbno, 1,
+					oinfo);
+		else if (resv == XFS_AG_RESV_AGFL)
+			error = xfs_repair_put_freelist(sc, agbno);
+		else
+			error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv);
+		if (error)
+			break;
+
+		if (sc->ip)
+			error = xfs_trans_roll_inode(&sc->tp, sc->ip);
+		else
+			error = xfs_repair_roll_ag_trans(sc);
+	}
+
+	return error;
+out_cur:
+	xfs_btree_del_cursor(rmap_cur, XFS_BTREE_ERROR);
+	if (agf_bp)
+		xfs_trans_brelse(sc->tp, agf_bp);
+	return error;
+}
+
+/* Collect a dead btree extent for later disposal. */
+int
+xfs_repair_collect_btree_extent(
+	struct xfs_scrub_context	*sc,
+	struct xfs_repair_extent_list	*exlist,
+	xfs_fsblock_t			fsbno,
+	xfs_extlen_t			len)
+{
+	struct xfs_repair_extent	*rae;
+
+	trace_xfs_repair_collect_btree_extent(sc->mp,
+			XFS_FSB_TO_AGNO(sc->mp, fsbno),
+			XFS_FSB_TO_AGBNO(sc->mp, fsbno), len);
+
+	rae = kmem_alloc(sizeof(struct xfs_repair_extent),
+			KM_MAYFAIL | KM_NOFS);
+	if (!rae)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&rae->list);
+	rae->fsbno = fsbno;
+	rae->len = len;
+	list_add_tail(&rae->list, &exlist->list);
+
+	return 0;
+}
+
+/* Invalidate buffers for blocks we're dumping. */
+int
+xfs_repair_invalidate_blocks(
+	struct xfs_scrub_context	*sc,
+	struct xfs_repair_extent_list	*exlist)
+{
+	struct xfs_repair_extent	*rae;
+	struct xfs_repair_extent	*n;
+	struct xfs_buf			*bp;
+	xfs_agnumber_t			agno;
+	xfs_agblock_t			agbno;
+	xfs_agblock_t			i;
+
+	for_each_xfs_repair_extent_safe(rae, n, exlist) {
+		agno = XFS_FSB_TO_AGNO(sc->mp, rae->fsbno);
+		agbno = XFS_FSB_TO_AGBNO(sc->mp, rae->fsbno);
+		for (i = 0; i < rae->len; i++) {
+			bp = xfs_btree_get_bufs(sc->mp, sc->tp, agno,
+					agbno + i, 0);
+			xfs_trans_binval(sc->tp, bp);
+		}
+	}
+
+	return 0;
+}
+
+/* Dispose of dead btree extents.  If oinfo is NULL, just delete the list. */
+int
+xfs_repair_reap_btree_extents(
+	struct xfs_scrub_context	*sc,
+	struct xfs_repair_extent_list	*exlist,
+	struct xfs_owner_info		*oinfo,
+	enum xfs_ag_resv_type		type)
+{
+	struct xfs_repair_extent	*rae;
+	struct xfs_repair_extent	*n;
+	int				error = 0;
+
+	for_each_xfs_repair_extent_safe(rae, n, exlist) {
+		if (oinfo) {
+			error = xfs_repair_free_or_unmap_extent(sc, rae->fsbno,
+					rae->len, oinfo, type);
+			if (error)
+				oinfo = NULL;
+		}
+		list_del(&rae->list);
+		kmem_free(rae);
+	}
+
+	return error;
+}
+
+/* Errors happened, just delete the dead btree extent list. */
+void
+xfs_repair_cancel_btree_extents(
+	struct xfs_scrub_context	*sc,
+	struct xfs_repair_extent_list	*exlist)
+{
+	xfs_repair_reap_btree_extents(sc, exlist, NULL, XFS_AG_RESV_NONE);
+}
+
+/* Compare two btree extents. */
+static int
+xfs_repair_btree_extent_cmp(
+	void				*priv,
+	struct list_head		*a,
+	struct list_head		*b)
+{
+	struct xfs_repair_extent	*ap;
+	struct xfs_repair_extent	*bp;
+
+	ap = container_of(a, struct xfs_repair_extent, list);
+	bp = container_of(b, struct xfs_repair_extent, list);
+
+	if (ap->fsbno > bp->fsbno)
+		return 1;
+	else if (ap->fsbno < bp->fsbno)
+		return -1;
+	return 0;
+}
+
+/* Remove all the blocks in sublist from exlist. */
+#define LEFT_CONTIG	(1 << 0)
+#define RIGHT_CONTIG	(1 << 1)
+int
+xfs_repair_subtract_extents(
+	struct xfs_scrub_context	*sc,
+	struct xfs_repair_extent_list	*exlist,
+	struct xfs_repair_extent_list	*sublist)
+{
+	struct list_head		*lp;
+	struct xfs_repair_extent	*ex;
+	struct xfs_repair_extent	*newex;
+	struct xfs_repair_extent	*subex;
+	xfs_fsblock_t			sub_fsb;
+	xfs_extlen_t			sub_len;
+	int				state;
+	int				error = 0;
+
+	if (list_empty(&exlist->list) || list_empty(&sublist->list))
+		return 0;
+	ASSERT(!list_empty(&sublist->list));
+
+	list_sort(NULL, &exlist->list, xfs_repair_btree_extent_cmp);
+	list_sort(NULL, &sublist->list, xfs_repair_btree_extent_cmp);
+
+	subex = list_first_entry(&sublist->list, struct xfs_repair_extent,
+			list);
+	lp = exlist->list.next;
+	while (lp != &exlist->list) {
+		ex = list_entry(lp, struct xfs_repair_extent, list);
+
+		/*
+		 * Advance subex and/or ex until we find a pair that
+		 * intersect or we run out of extents.
+		 */
+		while (subex->fsbno + subex->len <= ex->fsbno) {
+			if (list_is_last(&subex->list, &sublist->list))
+				goto out;
+			subex = list_next_entry(subex, list);
+		}
+		if (subex->fsbno >= ex->fsbno + ex->len) {
+			lp = lp->next;
+			continue;
+		}
+
+		/* trim subex to fit the extent we have */
+		sub_fsb = subex->fsbno;
+		sub_len = subex->len;
+		if (subex->fsbno < ex->fsbno) {
+			sub_len -= ex->fsbno - subex->fsbno;
+			sub_fsb = ex->fsbno;
+		}
+		if (sub_len > ex->len)
+			sub_len = ex->len;
+
+		state = 0;
+		if (sub_fsb == ex->fsbno)
+			state |= LEFT_CONTIG;
+		if (sub_fsb + sub_len == ex->fsbno + ex->len)
+			state |= RIGHT_CONTIG;
+		switch (state) {
+		case LEFT_CONTIG:
+			/* Coincides with only the left. */
+			ex->fsbno += sub_len;
+			ex->len -= sub_len;
+			break;
+		case RIGHT_CONTIG:
+			/* Coincides with only the right. */
+			ex->len -= sub_len;
+			lp = lp->next;
+			break;
+		case LEFT_CONTIG | RIGHT_CONTIG:
+			/* Total overlap, just delete ex. */
+			lp = lp->next;
+			list_del(&ex->list);
+			kmem_free(ex);
+			break;
+		case 0:
+			/*
+			 * Deleting from the middle: add the new right extent
+			 * and then shrink the left extent.
+			 */
+			newex = kmem_alloc(
+					sizeof(struct xfs_repair_extent),
+					KM_MAYFAIL | KM_NOFS);
+			if (!newex) {
+				error = -ENOMEM;
+				goto out;
+			}
+			INIT_LIST_HEAD(&newex->list);
+			newex->fsbno = sub_fsb + sub_len;
+			newex->len = ex->len - (sub_fsb - ex->fsbno) - sub_len;
+			list_add(&newex->list, &ex->list);
+			ex->len = sub_fsb - ex->fsbno;
+			lp = lp->next;
+			break;
+		default:
+			ASSERT(0);
+			break;
+		}
+	}
+
+out:
+	return error;
+}
+#undef LEFT_CONTIG
+#undef RIGHT_CONTIG
+
+struct xfs_repair_find_ag_btree_roots_info {
+	struct xfs_buf			*agfl_bp;
+	struct xfs_repair_find_ag_btree	*btree_info;
+};
+
+/* Is this an OWN_AG block in the AGFL? */
+STATIC bool
+xfs_repair_is_block_in_agfl(
+	struct xfs_mount		*mp,
+	uint64_t			rmap_owner,
+	xfs_agblock_t			agbno,
+	struct xfs_buf			*agf_bp,
+	struct xfs_buf			*agfl_bp)
+{
+	struct xfs_agf			*agf;
+	__be32				*agfl_bno;
+	unsigned int			flfirst;
+	unsigned int			fllast;
+	int				i;
+
+	if (rmap_owner != XFS_RMAP_OWN_AG)
+		return false;
+
+	agf = XFS_BUF_TO_AGF(agf_bp);
+	agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agfl_bp);
+	flfirst = be32_to_cpu(agf->agf_flfirst);
+	fllast = be32_to_cpu(agf->agf_fllast);
+
+	/* Skip an empty AGFL. */
+	if (agf->agf_flcount == cpu_to_be32(0))
+		return false;
+
+	/* first to last is a consecutive list. */
+	if (fllast >= flfirst) {
+		for (i = flfirst; i <= fllast; i++) {
+			if (be32_to_cpu(agfl_bno[i]) == agbno)
+				return true;
+		}
+
+		return false;
+	}
+
+	/* first to the end */
+	for (i = flfirst; i < xfs_agfl_size(mp); i++) {
+		if (be32_to_cpu(agfl_bno[i]) == agbno)
+			return true;
+	}
+
+	/* the start to last. */
+	for (i = 0; i <= fllast; i++) {
+		if (be32_to_cpu(agfl_bno[i]) == agbno)
+			return true;
+	}
+
+	return false;
+}
+
+/* Find btree roots from the AGF. */
+STATIC int
+xfs_repair_find_ag_btree_roots_helper(
+	struct xfs_btree_cur		*cur,
+	struct xfs_rmap_irec		*rec,
+	void				*priv)
+{
+	struct xfs_mount		*mp = cur->bc_mp;
+	struct xfs_repair_find_ag_btree_roots_info	*ri = priv;
+	struct xfs_repair_find_ag_btree	*fab;
+	struct xfs_buf			*bp;
+	struct xfs_btree_block		*btblock;
+	xfs_daddr_t			daddr;
+	xfs_agblock_t			agbno;
+	int				error = 0;
+
+	if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
+		return 0;
+
+	for (agbno = 0; agbno < rec->rm_blockcount; agbno++) {
+		daddr = XFS_AGB_TO_DADDR(mp, cur->bc_private.a.agno,
+				rec->rm_startblock + agbno);
+		for (fab = ri->btree_info; fab->buf_ops; fab++) {
+			if (rec->rm_owner != fab->rmap_owner)
+				continue;
+
+			/*
+			 * Blocks in the AGFL have stale contents that
+			 * might just happen to have a matching magic
+			 * and uuid.  We don't want to pull these blocks
+			 * in as part of a tree root, so we have to
+			 * filter out the AGFL stuff here.  If the AGFL
+			 * looks insane we'll just refuse to repair.
+			 */
+			if (xfs_repair_is_block_in_agfl(mp, rec->rm_owner,
+					rec->rm_startblock + agbno,
+					cur->bc_private.a.agbp, ri->agfl_bp))
+				continue;
+
+			error = xfs_trans_read_buf(mp, cur->bc_tp,
+					mp->m_ddev_targp, daddr, mp->m_bsize,
+					0, &bp, NULL);
+			if (error)
+				return error;
+
+			/* Does this look like a block we want? */
+			btblock = XFS_BUF_TO_BLOCK(bp);
+			if (be32_to_cpu(btblock->bb_magic) != fab->magic)
+				goto next_fab;
+			if (xfs_sb_version_hascrc(&mp->m_sb) &&
+			    !uuid_equal(&btblock->bb_u.s.bb_uuid,
+					&mp->m_sb.sb_meta_uuid))
+				goto next_fab;
+			if (fab->root != NULLAGBLOCK &&
+			    xfs_btree_get_level(btblock) <= fab->level)
+				goto next_fab;
+
+			/* Make sure we pass the verifiers. */
+			bp->b_ops = fab->buf_ops;
+			bp->b_ops->verify_read(bp);
+			if (bp->b_error)
+				goto next_fab;
+			fab->root = rec->rm_startblock + agbno;
+			fab->level = xfs_btree_get_level(btblock);
+
+			trace_xfs_repair_find_ag_btree_roots_helper(mp,
+					cur->bc_private.a.agno,
+					rec->rm_startblock + agbno,
+					be32_to_cpu(btblock->bb_magic),
+					fab->level);
+next_fab:
+			xfs_trans_brelse(cur->bc_tp, bp);
+			if (be32_to_cpu(btblock->bb_magic) == fab->magic)
+				break;
+		}
+	}
+
+	return error;
+}
+
+/* Find the roots of the given btrees from the rmap info. */
+int
+xfs_repair_find_ag_btree_roots(
+	struct xfs_scrub_context	*sc,
+	struct xfs_buf			*agf_bp,
+	struct xfs_repair_find_ag_btree	*btree_info,
+	struct xfs_buf			*agfl_bp)
+{
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_repair_find_ag_btree_roots_info	ri;
+	struct xfs_repair_find_ag_btree	*fab;
+	struct xfs_btree_cur		*cur;
+	int				error;
+
+	ri.btree_info = btree_info;
+	ri.agfl_bp = agfl_bp;
+	for (fab = btree_info; fab->buf_ops; fab++) {
+		ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG);
+		fab->root = NULLAGBLOCK;
+		fab->level = 0;
+	}
+
+	cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
+	error = xfs_rmap_query_all(cur, xfs_repair_find_ag_btree_roots_helper,
+			&ri);
+	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+
+	for (fab = btree_info; !error && fab->buf_ops; fab++)
+		if (fab->root != NULLAGBLOCK)
+			fab->level++;
+
+	return error;
+}
+
+/* Reset the superblock counters from the AGF/AGI. */
+int
+xfs_repair_reset_counters(
+	struct xfs_mount	*mp)
+{
+	struct xfs_trans	*tp;
+	struct xfs_buf		*agi_bp;
+	struct xfs_buf		*agf_bp;
+	struct xfs_agi		*agi;
+	struct xfs_agf		*agf;
+	xfs_agnumber_t		agno;
+	xfs_ino_t		icount = 0;
+	xfs_ino_t		ifree = 0;
+	xfs_filblks_t		fdblocks = 0;
+	int64_t			delta_icount;
+	int64_t			delta_ifree;
+	int64_t			delta_fdblocks;
+	int			error;
+
+	trace_xfs_repair_reset_counters(mp);
+
+	error = xfs_trans_alloc_empty(mp, &tp);
+	if (error)
+		return error;
+
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		/* Count all the inodes... */
+		error = xfs_ialloc_read_agi(mp, tp, agno, &agi_bp);
+		if (error)
+			goto out;
+		agi = XFS_BUF_TO_AGI(agi_bp);
+		icount += be32_to_cpu(agi->agi_count);
+		ifree += be32_to_cpu(agi->agi_freecount);
+
+		/* Add up the free/freelist/bnobt/cntbt blocks... */
+		error = xfs_alloc_read_agf(mp, tp, agno, 0, &agf_bp);
+		if (error)
+			goto out;
+		if (!agf_bp) {
+			error = -ENOMEM;
+			goto out;
+		}
+		agf = XFS_BUF_TO_AGF(agf_bp);
+		fdblocks += be32_to_cpu(agf->agf_freeblks);
+		fdblocks += be32_to_cpu(agf->agf_flcount);
+		fdblocks += be32_to_cpu(agf->agf_btreeblks);
+	}
+
+	/*
+	 * Reinitialize the counters.  The on-disk and in-core counters
+	 * differ by the number of inodes/blocks reserved by the admin,
+	 * the per-AG reservation, and any transactions in progress, so
+	 * we have to account for that.
+	 */
+	spin_lock(&mp->m_sb_lock);
+	delta_icount = (int64_t)mp->m_sb.sb_icount - icount;
+	delta_ifree = (int64_t)mp->m_sb.sb_ifree - ifree;
+	delta_fdblocks = (int64_t)mp->m_sb.sb_fdblocks - fdblocks;
+	mp->m_sb.sb_icount = icount;
+	mp->m_sb.sb_ifree = ifree;
+	mp->m_sb.sb_fdblocks = fdblocks;
+	spin_unlock(&mp->m_sb_lock);
+
+	if (delta_icount) {
+		error = xfs_mod_icount(mp, delta_icount);
+		if (error)
+			goto out;
+	}
+	if (delta_ifree) {
+		error = xfs_mod_ifree(mp, delta_ifree);
+		if (error)
+			goto out;
+	}
+	if (delta_fdblocks) {
+		error = xfs_mod_fdblocks(mp, delta_fdblocks, false);
+		if (error)
+			goto out;
+	}
+
+out:
+	xfs_trans_cancel(tp);
+	return error;
+}
+
+/* Figure out how many blocks to reserve for an AG repair. */
+xfs_extlen_t
+xfs_repair_calc_ag_resblks(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_scrub_metadata	*sm = sc->sm;
+	struct xfs_agi			*agi;
+	struct xfs_agf			*agf;
+	struct xfs_buf			*bp;
+	xfs_agino_t			icount;
+	xfs_extlen_t			aglen;
+	xfs_extlen_t			usedlen;
+	xfs_extlen_t			freelen;
+	xfs_extlen_t			bnobt_sz;
+	xfs_extlen_t			inobt_sz;
+	xfs_extlen_t			rmapbt_sz;
+	xfs_extlen_t			refcbt_sz;
+	int				error;
+
+	if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
+		return 0;
+
+	/*
+	 * Try to get the actual counters from disk; if not, make
+	 * some worst case assumptions.
+	 */
+	error = xfs_read_agi(mp, NULL, sm->sm_agno, &bp);
+	if (!error) {
+		agi = XFS_BUF_TO_AGI(bp);
+		icount = be32_to_cpu(agi->agi_count);
+		xfs_trans_brelse(NULL, bp);
+	} else {
+		icount = mp->m_sb.sb_agblocks / mp->m_sb.sb_inopblock;
+	}
+
+	error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp);
+	if (!error && bp) {
+		agf = XFS_BUF_TO_AGF(bp);
+		aglen = be32_to_cpu(agf->agf_length);
+		freelen = be32_to_cpu(agf->agf_freeblks);
+		usedlen = aglen - freelen;
+		xfs_trans_brelse(NULL, bp);
+	} else {
+		aglen = mp->m_sb.sb_agblocks;
+		freelen = aglen;
+		usedlen = aglen;
+	}
+
+	trace_xfs_repair_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
+			freelen, usedlen);
+
+	/*
+	 * Figure out how many blocks we'd need worst case to rebuild
+	 * each type of btree.  Note that we can only rebuild the
+	 * bnobt/cntbt or inobt/finobt as pairs.
+	 */
+	bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
+	if (xfs_sb_version_hassparseinodes(&mp->m_sb))
+		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
+				XFS_INODES_PER_HOLEMASK_BIT);
+	else
+		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
+				XFS_INODES_PER_CHUNK);
+	if (xfs_sb_version_hasfinobt(&mp->m_sb))
+		inobt_sz *= 2;
+	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+		rmapbt_sz = xfs_rmapbt_calc_size(mp, aglen);
+		refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
+	} else {
+		rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
+		refcbt_sz = 0;
+	}
+	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+		rmapbt_sz = 0;
+
+	trace_xfs_repair_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
+			inobt_sz, rmapbt_sz, refcbt_sz);
+
+	return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 660ab3a..6872359 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -24,10 +24,88 @@ 
 
 int xfs_repair_probe(struct xfs_scrub_context *sc);
 
+/* Repair helpers */
+
+struct xfs_repair_find_ag_btree {
+	uint64_t			rmap_owner;
+	const struct xfs_buf_ops	*buf_ops;
+	uint32_t			magic;
+	xfs_agblock_t			root;
+	unsigned int			level;
+};
+
+struct xfs_repair_extent {
+	struct list_head		list;
+	xfs_fsblock_t			fsbno;
+	xfs_extlen_t			len;
+};
+
+struct xfs_repair_extent_list {
+	struct list_head		list;
+};
+
+static inline void
+xfs_repair_init_extent_list(
+	struct xfs_repair_extent_list	*exlist)
+{
+	INIT_LIST_HEAD(&exlist->list);
+}
+
+#define for_each_xfs_repair_extent_safe(rbe, n, exlist) \
+	list_for_each_entry_safe((rbe), (n), &(exlist)->list, list)
+
+int xfs_repair_roll_ag_trans(struct xfs_scrub_context *sc);
+bool xfs_repair_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks,
+		enum xfs_ag_resv_type type);
+int xfs_repair_alloc_ag_block(struct xfs_scrub_context *sc,
+		struct xfs_owner_info *oinfo, xfs_fsblock_t *fsbno,
+		enum xfs_ag_resv_type resv);
+int xfs_repair_init_btblock(struct xfs_scrub_context *sc, xfs_fsblock_t fsb,
+		struct xfs_buf **bpp, xfs_btnum_t btnum,
+		const struct xfs_buf_ops *ops);
+int xfs_repair_fix_freelist(struct xfs_scrub_context *sc, bool can_shrink);
+int xfs_repair_put_freelist(struct xfs_scrub_context *sc, xfs_agblock_t agbno);
+int xfs_repair_collect_btree_extent(struct xfs_scrub_context *sc,
+		struct xfs_repair_extent_list *btlist, xfs_fsblock_t fsbno,
+		xfs_extlen_t len);
+int xfs_repair_invalidate_blocks(struct xfs_scrub_context *sc,
+		struct xfs_repair_extent_list *btlist);
+int xfs_repair_reap_btree_extents(struct xfs_scrub_context *sc,
+		struct xfs_repair_extent_list *btlist,
+		struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
+void xfs_repair_cancel_btree_extents(struct xfs_scrub_context *sc,
+		struct xfs_repair_extent_list *btlist);
+int xfs_repair_subtract_extents(struct xfs_scrub_context *sc,
+		struct xfs_repair_extent_list *exlist,
+		struct xfs_repair_extent_list *sublist);
+int xfs_repair_find_ag_btree_roots(struct xfs_scrub_context *sc,
+		struct xfs_buf *agf_bp,
+		struct xfs_repair_find_ag_btree *btree_info,
+		struct xfs_buf *agfl_bp);
+int xfs_repair_reset_counters(struct xfs_mount *mp);
+xfs_extlen_t xfs_repair_calc_ag_resblks(struct xfs_scrub_context *sc);
+int xfs_repair_setup_btree_extent_collection(struct xfs_scrub_context *sc);
+
+/* Metadata repairers */
+
 #else
 
 #define xfs_repair_probe		(NULL)
 
+static inline int xfs_repair_reset_counters(struct xfs_mount *mp)
+{
+	ASSERT(0);
+	return -EIO;
+}
+
+static inline xfs_extlen_t
+xfs_repair_calc_ag_resblks(
+	struct xfs_scrub_context	*sc)
+{
+	ASSERT(!(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR));
+	return 0;
+}
+
 #endif /* CONFIG_XFS_ONLINE_REPAIR */
 
 #endif	/* __XFS_SCRUB_REPAIR_H__ */
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 888ab2a..9a8fa78 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -196,6 +196,8 @@  xfs_scrub_teardown(
 		kmem_free(sc->buf);
 		sc->buf = NULL;
 	}
+	if (sc->reset_counters && !error)
+		error = xfs_repair_reset_counters(sc->mp);
 	return error;
 }
 
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 9ab8ef8..340d6227 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -76,6 +76,7 @@  struct xfs_scrub_context {
 	void				*buf;
 	uint				ilock_flags;
 	bool				try_harder;
+	bool				reset_counters;
 
 	/* State tracking for single-AG operations. */
 	struct xfs_scrub_ag		sa;