diff mbox series

[04/16] xfs: repair the AGF

Message ID 153256439500.29021.5265670898665301362.stgit@magnolia (mailing list archive)
State Superseded
Headers show
Series xfs-4.19: online repair support | expand

Commit Message

Darrick J. Wong July 26, 2018, 12:19 a.m. UTC
From: Darrick J. Wong <darrick.wong@oracle.com>

Regenerate the AGF from the rmap data.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/scrub/agheader_repair.c |  366 ++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/repair.c          |   27 ++-
 fs/xfs/scrub/repair.h          |    4 
 fs/xfs/scrub/scrub.c           |    2 
 4 files changed, 389 insertions(+), 10 deletions(-)



--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Brian Foster July 27, 2018, 2:23 p.m. UTC | #1
On Wed, Jul 25, 2018 at 05:19:55PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Regenerate the AGF from the rmap data.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---

Mostly seems sane to me. I still need to come up to speed on the broader
xfs_scrub context. A few comments in the meantime..

>  fs/xfs/scrub/agheader_repair.c |  366 ++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/scrub/repair.c          |   27 ++-
>  fs/xfs/scrub/repair.h          |    4 
>  fs/xfs/scrub/scrub.c           |    2 
>  4 files changed, 389 insertions(+), 10 deletions(-)
> 
> 
> diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
> index 1e96621ece3a..938af216cb1c 100644
> --- a/fs/xfs/scrub/agheader_repair.c
> +++ b/fs/xfs/scrub/agheader_repair.c
...
> @@ -54,3 +61,362 @@ xrep_superblock(
>  	xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
>  	return error;
>  }
...
> +/* Update all AGF fields which derive from btree contents. */
> +STATIC int
> +xrep_agf_calc_from_btrees(
> +	struct xfs_scrub	*sc,
> +	struct xfs_buf		*agf_bp)
> +{
> +	struct xrep_agf_allocbt	raa = { .sc = sc };
> +	struct xfs_btree_cur	*cur = NULL;
> +	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agf_bp);
> +	struct xfs_mount	*mp = sc->mp;
> +	xfs_agblock_t		btreeblks;
> +	xfs_agblock_t		blocks;
> +	int			error;
> +
> +	/* Update the AGF counters from the bnobt. */
> +	cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
> +			XFS_BTNUM_BNO);
> +	error = xfs_alloc_query_all(cur, xrep_agf_walk_allocbt, &raa);
> +	if (error)
> +		goto err;
> +	error = xfs_btree_count_blocks(cur, &blocks);
> +	if (error)
> +		goto err;
> +	xfs_btree_del_cursor(cur, error);
> +	btreeblks = blocks - 1;

Why the -1? We don't count the root or something?

> +	agf->agf_freeblks = cpu_to_be32(raa.freeblks);
> +	agf->agf_longest = cpu_to_be32(raa.longest);
> +
> +	/* Update the AGF counters from the cntbt. */
> +	cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
> +			XFS_BTNUM_CNT);
> +	error = xfs_btree_count_blocks(cur, &blocks);
> +	if (error)
> +		goto err;
> +	xfs_btree_del_cursor(cur, error);
> +	btreeblks += blocks - 1;
> +
> +	/* Update the AGF counters from the rmapbt. */
> +	cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
> +	error = xfs_btree_count_blocks(cur, &blocks);
> +	if (error)
> +		goto err;
> +	xfs_btree_del_cursor(cur, error);
> +	agf->agf_rmap_blocks = cpu_to_be32(blocks);
> +	btreeblks += blocks - 1;
> +
> +	agf->agf_btreeblks = cpu_to_be32(btreeblks);
> +
> +	/* Update the AGF counters from the refcountbt. */
> +	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
> +		cur = xfs_refcountbt_init_cursor(mp, sc->tp, agf_bp,
> +				sc->sa.agno, NULL);

FYI this fails to compile on for-next (dfops param has been removed).

> +		error = xfs_btree_count_blocks(cur, &blocks);
> +		if (error)
> +			goto err;
> +		xfs_btree_del_cursor(cur, error);
> +		agf->agf_refcount_blocks = cpu_to_be32(blocks);
> +	}
> +
> +	return 0;
> +err:
> +	xfs_btree_del_cursor(cur, error);
> +	return error;
> +}
> +
> +/* Commit the new AGF and reinitialize the incore state. */
> +STATIC int
> +xrep_agf_commit_new(
> +	struct xfs_scrub	*sc,
> +	struct xfs_buf		*agf_bp)
> +{
> +	struct xfs_perag	*pag;
> +	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agf_bp);
> +
> +	/* Trigger fdblocks recalculation */
> +	xfs_force_summary_recalc(sc->mp);
> +
> +	/* Write this to disk. */
> +	xfs_trans_buf_set_type(sc->tp, agf_bp, XFS_BLFT_AGF_BUF);
> +	xfs_trans_log_buf(sc->tp, agf_bp, 0, BBTOB(agf_bp->b_length) - 1);
> +
> +	/* Now reinitialize the in-core counters we changed. */
> +	pag = sc->sa.pag;
> +	sc->sa.pag->pagf_init = 1;

Nit: can probably do 'pag->pagf_init = 1' here since we just initialized
pag on the line above.

That aside, is ordering important at all here? I'm wondering if somebody
can grab the pag right after we set this and see pagf_init == 1 before
we've updated the values below. Perhaps it doesn't really matter since
we have the agf buffer.

> +	pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
> +	pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
> +	pag->pagf_longest = be32_to_cpu(agf->agf_longest);
> +	pag->pagf_levels[XFS_BTNUM_BNOi] =
> +			be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
> +	pag->pagf_levels[XFS_BTNUM_CNTi] =
> +			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
> +	pag->pagf_levels[XFS_BTNUM_RMAPi] =
> +			be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
> +	pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
> +
> +	return 0;
> +}
> +
> +/* Repair the AGF. v5 filesystems only. */
> +int
> +xrep_agf(
> +	struct xfs_scrub		*sc)
> +{
> +	struct xrep_find_ag_btree	fab[XREP_AGF_MAX] = {
> +		[XREP_AGF_BNOBT] = {
> +			.rmap_owner = XFS_RMAP_OWN_AG,
> +			.buf_ops = &xfs_allocbt_buf_ops,
> +			.magic = XFS_ABTB_CRC_MAGIC,
> +		},
> +		[XREP_AGF_CNTBT] = {
> +			.rmap_owner = XFS_RMAP_OWN_AG,
> +			.buf_ops = &xfs_allocbt_buf_ops,
> +			.magic = XFS_ABTC_CRC_MAGIC,
> +		},
> +		[XREP_AGF_RMAPBT] = {
> +			.rmap_owner = XFS_RMAP_OWN_AG,
> +			.buf_ops = &xfs_rmapbt_buf_ops,
> +			.magic = XFS_RMAP_CRC_MAGIC,
> +		},
> +		[XREP_AGF_REFCOUNTBT] = {
> +			.rmap_owner = XFS_RMAP_OWN_REFC,
> +			.buf_ops = &xfs_refcountbt_buf_ops,
> +			.magic = XFS_REFC_CRC_MAGIC,
> +		},
> +		[XREP_AGF_END] = {
> +			.buf_ops = NULL,
> +		},
> +	};
> +	struct xfs_agf			old_agf;
> +	struct xfs_mount		*mp = sc->mp;
> +	struct xfs_buf			*agf_bp;
> +	struct xfs_buf			*agfl_bp;
> +	struct xfs_agf			*agf;
> +	int				error;
> +
> +	/* We require the rmapbt to rebuild anything. */
> +	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
> +		return -EOPNOTSUPP;
> +
> +	xchk_perag_get(sc->mp, &sc->sa);
> +	error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
> +			XFS_AG_DADDR(mp, sc->sa.agno, XFS_AGF_DADDR(mp)),
> +			XFS_FSS_TO_BB(mp, 1), 0, &agf_bp, NULL);
> +	if (error)
> +		return error;
> +	agf_bp->b_ops = &xfs_agf_buf_ops;

Any reason we don't call xfs_read_agf() here? It looks like we use the
similar helper for the agfl below.

> +	agf = XFS_BUF_TO_AGF(agf_bp);
> +
> +	/*
> +	 * Load the AGFL so that we can screen out OWN_AG blocks that are on
> +	 * the AGFL now; these blocks might have once been part of the
> +	 * bno/cnt/rmap btrees but are not now.  This is a chicken and egg
> +	 * problem: the AGF is corrupt, so we have to trust the AGFL contents
> +	 * because we can't do any serious cross-referencing with any of the
> +	 * btrees rooted in the AGF.  If the AGFL contents are obviously bad
> +	 * then we'll bail out.
> +	 */
> +	error = xfs_alloc_read_agfl(mp, sc->tp, sc->sa.agno, &agfl_bp);
> +	if (error)
> +		return error;
> +
> +	/*
> +	 * Spot-check the AGFL blocks; if they're obviously corrupt then
> +	 * there's nothing we can do but bail out.
> +	 */

Why? Can't we reset the agfl, or is that handled elsewhere?

Brian

> +	error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(agf_bp), agfl_bp,
> +			xrep_agf_check_agfl_block, sc);
> +	if (error)
> +		return error;
> +
> +	/*
> +	 * Find the AGF btree roots.  This is also a chicken-and-egg situation;
> +	 * see the function for more details.
> +	 */
> +	error = xrep_agf_find_btrees(sc, agf_bp, fab, agfl_bp);
> +	if (error)
> +		return error;
> +
> +	/* Start rewriting the header and implant the btrees we found. */
> +	xrep_agf_init_header(sc, agf_bp, &old_agf);
> +	xrep_agf_set_roots(sc, agf, fab);
> +	error = xrep_agf_calc_from_btrees(sc, agf_bp);
> +	if (error)
> +		goto out_revert;
> +
> +	/* Commit the changes and reinitialize incore state. */
> +	return xrep_agf_commit_new(sc, agf_bp);
> +
> +out_revert:
> +	/* Mark the incore AGF state stale and revert the AGF. */
> +	sc->sa.pag->pagf_init = 0;
> +	memcpy(agf, &old_agf, sizeof(old_agf));
> +	return error;
> +}
> diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> index 85b048b341a0..17cf48564390 100644
> --- a/fs/xfs/scrub/repair.c
> +++ b/fs/xfs/scrub/repair.c
> @@ -128,9 +128,12 @@ xrep_roll_ag_trans(
>  	int			error;
>  
>  	/* Keep the AG header buffers locked so we can keep going. */
> -	xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
> -	xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
> -	xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
> +	if (sc->sa.agi_bp)
> +		xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
> +	if (sc->sa.agf_bp)
> +		xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
> +	if (sc->sa.agfl_bp)
> +		xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
>  
>  	/* Roll the transaction. */
>  	error = xfs_trans_roll(&sc->tp);
> @@ -138,9 +141,12 @@ xrep_roll_ag_trans(
>  		goto out_release;
>  
>  	/* Join AG headers to the new transaction. */
> -	xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
> -	xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
> -	xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
> +	if (sc->sa.agi_bp)
> +		xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
> +	if (sc->sa.agf_bp)
> +		xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
> +	if (sc->sa.agfl_bp)
> +		xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
>  
>  	return 0;
>  
> @@ -150,9 +156,12 @@ xrep_roll_ag_trans(
>  	 * buffers will be released during teardown on our way out
>  	 * of the kernel.
>  	 */
> -	xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
> -	xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
> -	xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
> +	if (sc->sa.agi_bp)
> +		xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
> +	if (sc->sa.agf_bp)
> +		xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
> +	if (sc->sa.agfl_bp)
> +		xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
>  
>  	return error;
>  }
> diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
> index 5a4e92221916..1d283360b5ab 100644
> --- a/fs/xfs/scrub/repair.h
> +++ b/fs/xfs/scrub/repair.h
> @@ -58,6 +58,8 @@ int xrep_ino_dqattach(struct xfs_scrub *sc);
>  
>  int xrep_probe(struct xfs_scrub *sc);
>  int xrep_superblock(struct xfs_scrub *sc);
> +int xrep_agf(struct xfs_scrub *sc);
> +int xrep_agfl(struct xfs_scrub *sc);
>  
>  #else
>  
> @@ -81,6 +83,8 @@ xrep_calc_ag_resblks(
>  
>  #define xrep_probe			xrep_notsupported
>  #define xrep_superblock			xrep_notsupported
> +#define xrep_agf			xrep_notsupported
> +#define xrep_agfl			xrep_notsupported
>  
>  #endif /* CONFIG_XFS_ONLINE_REPAIR */
>  
> diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
> index 6efb926f3cf8..1e8a17c8e2b9 100644
> --- a/fs/xfs/scrub/scrub.c
> +++ b/fs/xfs/scrub/scrub.c
> @@ -214,7 +214,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
>  		.type	= ST_PERAG,
>  		.setup	= xchk_setup_fs,
>  		.scrub	= xchk_agf,
> -		.repair	= xrep_notsupported,
> +		.repair	= xrep_agf,
>  	},
>  	[XFS_SCRUB_TYPE_AGFL]= {	/* agfl */
>  		.type	= ST_PERAG,
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong July 27, 2018, 4:02 p.m. UTC | #2
On Fri, Jul 27, 2018 at 10:23:48AM -0400, Brian Foster wrote:
> On Wed, Jul 25, 2018 at 05:19:55PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Regenerate the AGF from the rmap data.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> 
> Mostly seems sane to me. I still need to come up to speed on the broader
> xfs_scrub context. A few comments in the meantime..

<nod> Thanks for taking a look at this series. :)

> >  fs/xfs/scrub/agheader_repair.c |  366 ++++++++++++++++++++++++++++++++++++++++
> >  fs/xfs/scrub/repair.c          |   27 ++-
> >  fs/xfs/scrub/repair.h          |    4 
> >  fs/xfs/scrub/scrub.c           |    2 
> >  4 files changed, 389 insertions(+), 10 deletions(-)
> > 
> > 
> > diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
> > index 1e96621ece3a..938af216cb1c 100644
> > --- a/fs/xfs/scrub/agheader_repair.c
> > +++ b/fs/xfs/scrub/agheader_repair.c
> ...
> > @@ -54,3 +61,362 @@ xrep_superblock(
> >  	xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
> >  	return error;
> >  }
> ...
> > +/* Update all AGF fields which derive from btree contents. */
> > +STATIC int
> > +xrep_agf_calc_from_btrees(
> > +	struct xfs_scrub	*sc,
> > +	struct xfs_buf		*agf_bp)
> > +{
> > +	struct xrep_agf_allocbt	raa = { .sc = sc };
> > +	struct xfs_btree_cur	*cur = NULL;
> > +	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agf_bp);
> > +	struct xfs_mount	*mp = sc->mp;
> > +	xfs_agblock_t		btreeblks;
> > +	xfs_agblock_t		blocks;
> > +	int			error;
> > +
> > +	/* Update the AGF counters from the bnobt. */
> > +	cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
> > +			XFS_BTNUM_BNO);
> > +	error = xfs_alloc_query_all(cur, xrep_agf_walk_allocbt, &raa);
> > +	if (error)
> > +		goto err;
> > +	error = xfs_btree_count_blocks(cur, &blocks);
> > +	if (error)
> > +		goto err;
> > +	xfs_btree_del_cursor(cur, error);
> > +	btreeblks = blocks - 1;
> 
> Why the -1? We don't count the root or something?

The AGF btreeblks field only counts the number of blocks added to the
bno/cnt/rmapbt since they were initialized (each with a single root
block).  I find it a little strange not to count the root, but oh well.

> > +	agf->agf_freeblks = cpu_to_be32(raa.freeblks);
> > +	agf->agf_longest = cpu_to_be32(raa.longest);
> > +
> > +	/* Update the AGF counters from the cntbt. */
> > +	cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
> > +			XFS_BTNUM_CNT);
> > +	error = xfs_btree_count_blocks(cur, &blocks);
> > +	if (error)
> > +		goto err;
> > +	xfs_btree_del_cursor(cur, error);
> > +	btreeblks += blocks - 1;
> > +
> > +	/* Update the AGF counters from the rmapbt. */
> > +	cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
> > +	error = xfs_btree_count_blocks(cur, &blocks);
> > +	if (error)
> > +		goto err;
> > +	xfs_btree_del_cursor(cur, error);
> > +	agf->agf_rmap_blocks = cpu_to_be32(blocks);
> > +	btreeblks += blocks - 1;
> > +
> > +	agf->agf_btreeblks = cpu_to_be32(btreeblks);
> > +
> > +	/* Update the AGF counters from the refcountbt. */
> > +	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
> > +		cur = xfs_refcountbt_init_cursor(mp, sc->tp, agf_bp,
> > +				sc->sa.agno, NULL);
> 
> FYI this fails to compile on for-next (dfops param has been removed).

Yeah, I'm working on a rebase to for-next (once I settle on the locking
question in hch's "reduce cow lookups" series).

> > +		error = xfs_btree_count_blocks(cur, &blocks);
> > +		if (error)
> > +			goto err;
> > +		xfs_btree_del_cursor(cur, error);
> > +		agf->agf_refcount_blocks = cpu_to_be32(blocks);
> > +	}
> > +
> > +	return 0;
> > +err:
> > +	xfs_btree_del_cursor(cur, error);
> > +	return error;
> > +}
> > +
> > +/* Commit the new AGF and reinitialize the incore state. */
> > +STATIC int
> > +xrep_agf_commit_new(
> > +	struct xfs_scrub	*sc,
> > +	struct xfs_buf		*agf_bp)
> > +{
> > +	struct xfs_perag	*pag;
> > +	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agf_bp);
> > +
> > +	/* Trigger fdblocks recalculation */
> > +	xfs_force_summary_recalc(sc->mp);
> > +
> > +	/* Write this to disk. */
> > +	xfs_trans_buf_set_type(sc->tp, agf_bp, XFS_BLFT_AGF_BUF);
> > +	xfs_trans_log_buf(sc->tp, agf_bp, 0, BBTOB(agf_bp->b_length) - 1);
> > +
> > +	/* Now reinitialize the in-core counters we changed. */
> > +	pag = sc->sa.pag;
> > +	sc->sa.pag->pagf_init = 1;
> 
> Nit: can probably do 'pag->pagf_init = 1' here since we just initialized
> pag on the line above.

Ok.

> That aside, is ordering important at all here? I'm wondering if somebody
> can grab the pag right after we set this and see pagf_init == 1 before
> we've updated the values below. Perhaps it doesn't really matter since
> we have the agf buffer.

Hmm, I'll move it to the end to minimize the wtf factor. :)

> > +	pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
> > +	pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
> > +	pag->pagf_longest = be32_to_cpu(agf->agf_longest);
> > +	pag->pagf_levels[XFS_BTNUM_BNOi] =
> > +			be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
> > +	pag->pagf_levels[XFS_BTNUM_CNTi] =
> > +			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
> > +	pag->pagf_levels[XFS_BTNUM_RMAPi] =
> > +			be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
> > +	pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
> > +
> > +	return 0;
> > +}
> > +
> > +/* Repair the AGF. v5 filesystems only. */
> > +int
> > +xrep_agf(
> > +	struct xfs_scrub		*sc)
> > +{
> > +	struct xrep_find_ag_btree	fab[XREP_AGF_MAX] = {
> > +		[XREP_AGF_BNOBT] = {
> > +			.rmap_owner = XFS_RMAP_OWN_AG,
> > +			.buf_ops = &xfs_allocbt_buf_ops,
> > +			.magic = XFS_ABTB_CRC_MAGIC,
> > +		},
> > +		[XREP_AGF_CNTBT] = {
> > +			.rmap_owner = XFS_RMAP_OWN_AG,
> > +			.buf_ops = &xfs_allocbt_buf_ops,
> > +			.magic = XFS_ABTC_CRC_MAGIC,
> > +		},
> > +		[XREP_AGF_RMAPBT] = {
> > +			.rmap_owner = XFS_RMAP_OWN_AG,
> > +			.buf_ops = &xfs_rmapbt_buf_ops,
> > +			.magic = XFS_RMAP_CRC_MAGIC,
> > +		},
> > +		[XREP_AGF_REFCOUNTBT] = {
> > +			.rmap_owner = XFS_RMAP_OWN_REFC,
> > +			.buf_ops = &xfs_refcountbt_buf_ops,
> > +			.magic = XFS_REFC_CRC_MAGIC,
> > +		},
> > +		[XREP_AGF_END] = {
> > +			.buf_ops = NULL,
> > +		},
> > +	};
> > +	struct xfs_agf			old_agf;
> > +	struct xfs_mount		*mp = sc->mp;
> > +	struct xfs_buf			*agf_bp;
> > +	struct xfs_buf			*agfl_bp;
> > +	struct xfs_agf			*agf;
> > +	int				error;
> > +
> > +	/* We require the rmapbt to rebuild anything. */
> > +	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
> > +		return -EOPNOTSUPP;
> > +
> > +	xchk_perag_get(sc->mp, &sc->sa);
> > +	error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
> > +			XFS_AG_DADDR(mp, sc->sa.agno, XFS_AGF_DADDR(mp)),
> > +			XFS_FSS_TO_BB(mp, 1), 0, &agf_bp, NULL);
> > +	if (error)
> > +		return error;
> > +	agf_bp->b_ops = &xfs_agf_buf_ops;
> 
> Any reason we don't call xfs_read_agf() here? It looks like we use the
> similar helper for the agfl below.

We're grabbing the agf buffer without read verifiers so that we can
reinitialize it.  Note that scrub tries xfs_read_agf, and if it fails
with -EFSCORRUPTED/-EFSBADCRC it marks the agf as corrupt, so it's
possible that sc->sa.sa_agf is still null.

This probably could have been trans_get_buf though...

> > +	agf = XFS_BUF_TO_AGF(agf_bp);
> > +
> > +	/*
> > +	 * Load the AGFL so that we can screen out OWN_AG blocks that are on
> > +	 * the AGFL now; these blocks might have once been part of the
> > +	 * bno/cnt/rmap btrees but are not now.  This is a chicken and egg
> > +	 * problem: the AGF is corrupt, so we have to trust the AGFL contents
> > +	 * because we can't do any serious cross-referencing with any of the
> > +	 * btrees rooted in the AGF.  If the AGFL contents are obviously bad
> > +	 * then we'll bail out.
> > +	 */
> > +	error = xfs_alloc_read_agfl(mp, sc->tp, sc->sa.agno, &agfl_bp);
> > +	if (error)
> > +		return error;
> > +
> > +	/*
> > +	 * Spot-check the AGFL blocks; if they're obviously corrupt then
> > +	 * there's nothing we can do but bail out.
> > +	 */
> 
> Why? Can't we reset the agfl, or is that handled elsewhere?

It's handled in xrep_agfl, but userspace will have to call us again to
fix the agfl and then call us a third time about the agf repair.

(xfs_scrub does this, naturally...)

--D

> Brian
> 
> > +	error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(agf_bp), agfl_bp,
> > +			xrep_agf_check_agfl_block, sc);
> > +	if (error)
> > +		return error;
> > +
> > +	/*
> > +	 * Find the AGF btree roots.  This is also a chicken-and-egg situation;
> > +	 * see the function for more details.
> > +	 */
> > +	error = xrep_agf_find_btrees(sc, agf_bp, fab, agfl_bp);
> > +	if (error)
> > +		return error;
> > +
> > +	/* Start rewriting the header and implant the btrees we found. */
> > +	xrep_agf_init_header(sc, agf_bp, &old_agf);
> > +	xrep_agf_set_roots(sc, agf, fab);
> > +	error = xrep_agf_calc_from_btrees(sc, agf_bp);
> > +	if (error)
> > +		goto out_revert;
> > +
> > +	/* Commit the changes and reinitialize incore state. */
> > +	return xrep_agf_commit_new(sc, agf_bp);
> > +
> > +out_revert:
> > +	/* Mark the incore AGF state stale and revert the AGF. */
> > +	sc->sa.pag->pagf_init = 0;
> > +	memcpy(agf, &old_agf, sizeof(old_agf));
> > +	return error;
> > +}
> > diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> > index 85b048b341a0..17cf48564390 100644
> > --- a/fs/xfs/scrub/repair.c
> > +++ b/fs/xfs/scrub/repair.c
> > @@ -128,9 +128,12 @@ xrep_roll_ag_trans(
> >  	int			error;
> >  
> >  	/* Keep the AG header buffers locked so we can keep going. */
> > -	xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
> > -	xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
> > -	xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
> > +	if (sc->sa.agi_bp)
> > +		xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
> > +	if (sc->sa.agf_bp)
> > +		xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
> > +	if (sc->sa.agfl_bp)
> > +		xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
> >  
> >  	/* Roll the transaction. */
> >  	error = xfs_trans_roll(&sc->tp);
> > @@ -138,9 +141,12 @@ xrep_roll_ag_trans(
> >  		goto out_release;
> >  
> >  	/* Join AG headers to the new transaction. */
> > -	xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
> > -	xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
> > -	xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
> > +	if (sc->sa.agi_bp)
> > +		xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
> > +	if (sc->sa.agf_bp)
> > +		xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
> > +	if (sc->sa.agfl_bp)
> > +		xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
> >  
> >  	return 0;
> >  
> > @@ -150,9 +156,12 @@ xrep_roll_ag_trans(
> >  	 * buffers will be released during teardown on our way out
> >  	 * of the kernel.
> >  	 */
> > -	xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
> > -	xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
> > -	xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
> > +	if (sc->sa.agi_bp)
> > +		xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
> > +	if (sc->sa.agf_bp)
> > +		xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
> > +	if (sc->sa.agfl_bp)
> > +		xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
> >  
> >  	return error;
> >  }
> > diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
> > index 5a4e92221916..1d283360b5ab 100644
> > --- a/fs/xfs/scrub/repair.h
> > +++ b/fs/xfs/scrub/repair.h
> > @@ -58,6 +58,8 @@ int xrep_ino_dqattach(struct xfs_scrub *sc);
> >  
> >  int xrep_probe(struct xfs_scrub *sc);
> >  int xrep_superblock(struct xfs_scrub *sc);
> > +int xrep_agf(struct xfs_scrub *sc);
> > +int xrep_agfl(struct xfs_scrub *sc);
> >  
> >  #else
> >  
> > @@ -81,6 +83,8 @@ xrep_calc_ag_resblks(
> >  
> >  #define xrep_probe			xrep_notsupported
> >  #define xrep_superblock			xrep_notsupported
> > +#define xrep_agf			xrep_notsupported
> > +#define xrep_agfl			xrep_notsupported
> >  
> >  #endif /* CONFIG_XFS_ONLINE_REPAIR */
> >  
> > diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
> > index 6efb926f3cf8..1e8a17c8e2b9 100644
> > --- a/fs/xfs/scrub/scrub.c
> > +++ b/fs/xfs/scrub/scrub.c
> > @@ -214,7 +214,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
> >  		.type	= ST_PERAG,
> >  		.setup	= xchk_setup_fs,
> >  		.scrub	= xchk_agf,
> > -		.repair	= xrep_notsupported,
> > +		.repair	= xrep_agf,
> >  	},
> >  	[XFS_SCRUB_TYPE_AGFL]= {	/* agfl */
> >  		.type	= ST_PERAG,
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Brian Foster July 27, 2018, 4:25 p.m. UTC | #3
On Fri, Jul 27, 2018 at 09:02:38AM -0700, Darrick J. Wong wrote:
> On Fri, Jul 27, 2018 at 10:23:48AM -0400, Brian Foster wrote:
> > On Wed, Jul 25, 2018 at 05:19:55PM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > Regenerate the AGF from the rmap data.
> > > 
> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > ---
> > 
> > Mostly seems sane to me. I still need to come up to speed on the broader
> > xfs_scrub context. A few comments in the meantime..
> 
> <nod> Thanks for taking a look at this series. :)
> 
> > >  fs/xfs/scrub/agheader_repair.c |  366 ++++++++++++++++++++++++++++++++++++++++
> > >  fs/xfs/scrub/repair.c          |   27 ++-
> > >  fs/xfs/scrub/repair.h          |    4 
> > >  fs/xfs/scrub/scrub.c           |    2 
> > >  4 files changed, 389 insertions(+), 10 deletions(-)
> > > 
> > > 
> > > diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
> > > index 1e96621ece3a..938af216cb1c 100644
> > > --- a/fs/xfs/scrub/agheader_repair.c
> > > +++ b/fs/xfs/scrub/agheader_repair.c
> > ...
> > > @@ -54,3 +61,362 @@ xrep_superblock(
> > >  	xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
> > >  	return error;
> > >  }
> > ...
> > > +/* Update all AGF fields which derive from btree contents. */
> > > +STATIC int
> > > +xrep_agf_calc_from_btrees(
> > > +	struct xfs_scrub	*sc,
> > > +	struct xfs_buf		*agf_bp)
> > > +{
> > > +	struct xrep_agf_allocbt	raa = { .sc = sc };
> > > +	struct xfs_btree_cur	*cur = NULL;
> > > +	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agf_bp);
> > > +	struct xfs_mount	*mp = sc->mp;
> > > +	xfs_agblock_t		btreeblks;
> > > +	xfs_agblock_t		blocks;
> > > +	int			error;
> > > +
> > > +	/* Update the AGF counters from the bnobt. */
> > > +	cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
> > > +			XFS_BTNUM_BNO);
> > > +	error = xfs_alloc_query_all(cur, xrep_agf_walk_allocbt, &raa);
> > > +	if (error)
> > > +		goto err;
> > > +	error = xfs_btree_count_blocks(cur, &blocks);
> > > +	if (error)
> > > +		goto err;
> > > +	xfs_btree_del_cursor(cur, error);
> > > +	btreeblks = blocks - 1;
> > 
> > Why the -1? We don't count the root or something?
> 
> The AGF btreeblks field only counts the number of blocks added to the
> bno/cnt/rmapbt since they were initialized (each with a single root
> block).  I find it a little strange not to count the root, but oh well.
> 

Got it.

> > > +	agf->agf_freeblks = cpu_to_be32(raa.freeblks);
> > > +	agf->agf_longest = cpu_to_be32(raa.longest);
> > > +
> > > +	/* Update the AGF counters from the cntbt. */
> > > +	cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
> > > +			XFS_BTNUM_CNT);
> > > +	error = xfs_btree_count_blocks(cur, &blocks);
> > > +	if (error)
> > > +		goto err;
> > > +	xfs_btree_del_cursor(cur, error);
> > > +	btreeblks += blocks - 1;
> > > +
> > > +	/* Update the AGF counters from the rmapbt. */
> > > +	cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
> > > +	error = xfs_btree_count_blocks(cur, &blocks);
> > > +	if (error)
> > > +		goto err;
> > > +	xfs_btree_del_cursor(cur, error);
> > > +	agf->agf_rmap_blocks = cpu_to_be32(blocks);
> > > +	btreeblks += blocks - 1;
> > > +
> > > +	agf->agf_btreeblks = cpu_to_be32(btreeblks);
> > > +
> > > +	/* Update the AGF counters from the refcountbt. */
> > > +	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
> > > +		cur = xfs_refcountbt_init_cursor(mp, sc->tp, agf_bp,
> > > +				sc->sa.agno, NULL);
> > 
> > FYI this fails to compile on for-next (dfops param has been removed).
> 
> Yeah, I'm working on a rebase to for-next (once I settle on the locking
> question in hch's "reduce cow lookups" series).
> 
> > > +		error = xfs_btree_count_blocks(cur, &blocks);
> > > +		if (error)
> > > +			goto err;
> > > +		xfs_btree_del_cursor(cur, error);
> > > +		agf->agf_refcount_blocks = cpu_to_be32(blocks);
> > > +	}
> > > +
> > > +	return 0;
> > > +err:
> > > +	xfs_btree_del_cursor(cur, error);
> > > +	return error;
> > > +}
> > > +
...
> > > +/* Repair the AGF. v5 filesystems only. */
> > > +int
> > > +xrep_agf(
> > > +	struct xfs_scrub		*sc)
> > > +{
> > > +	struct xrep_find_ag_btree	fab[XREP_AGF_MAX] = {
> > > +		[XREP_AGF_BNOBT] = {
> > > +			.rmap_owner = XFS_RMAP_OWN_AG,
> > > +			.buf_ops = &xfs_allocbt_buf_ops,
> > > +			.magic = XFS_ABTB_CRC_MAGIC,
> > > +		},
> > > +		[XREP_AGF_CNTBT] = {
> > > +			.rmap_owner = XFS_RMAP_OWN_AG,
> > > +			.buf_ops = &xfs_allocbt_buf_ops,
> > > +			.magic = XFS_ABTC_CRC_MAGIC,
> > > +		},
> > > +		[XREP_AGF_RMAPBT] = {
> > > +			.rmap_owner = XFS_RMAP_OWN_AG,
> > > +			.buf_ops = &xfs_rmapbt_buf_ops,
> > > +			.magic = XFS_RMAP_CRC_MAGIC,
> > > +		},
> > > +		[XREP_AGF_REFCOUNTBT] = {
> > > +			.rmap_owner = XFS_RMAP_OWN_REFC,
> > > +			.buf_ops = &xfs_refcountbt_buf_ops,
> > > +			.magic = XFS_REFC_CRC_MAGIC,
> > > +		},
> > > +		[XREP_AGF_END] = {
> > > +			.buf_ops = NULL,
> > > +		},
> > > +	};
> > > +	struct xfs_agf			old_agf;
> > > +	struct xfs_mount		*mp = sc->mp;
> > > +	struct xfs_buf			*agf_bp;
> > > +	struct xfs_buf			*agfl_bp;
> > > +	struct xfs_agf			*agf;
> > > +	int				error;
> > > +
> > > +	/* We require the rmapbt to rebuild anything. */
> > > +	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
> > > +		return -EOPNOTSUPP;
> > > +
> > > +	xchk_perag_get(sc->mp, &sc->sa);
> > > +	error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
> > > +			XFS_AG_DADDR(mp, sc->sa.agno, XFS_AGF_DADDR(mp)),
> > > +			XFS_FSS_TO_BB(mp, 1), 0, &agf_bp, NULL);
> > > +	if (error)
> > > +		return error;
> > > +	agf_bp->b_ops = &xfs_agf_buf_ops;
> > 
> > Any reason we don't call xfs_read_agf() here? It looks like we use the
> > similar helper for the agfl below.
> 
> We're grabbing the agf buffer without read verifiers so that we can
> reinitialize it.  Note that scrub tries xfs_read_agf, and if it fails
> with -EFSCORRUPTED/-EFSBADCRC it marks the agf as corrupt, so it's
> possible that sc->sa.sa_agf is still null.
> 

Ah, makes sense. I missed the NULL b_ops..

> This probably could have been trans_get_buf though...
> 
> > > +	agf = XFS_BUF_TO_AGF(agf_bp);
> > > +
> > > +	/*
> > > +	 * Load the AGFL so that we can screen out OWN_AG blocks that are on
> > > +	 * the AGFL now; these blocks might have once been part of the
> > > +	 * bno/cnt/rmap btrees but are not now.  This is a chicken and egg
> > > +	 * problem: the AGF is corrupt, so we have to trust the AGFL contents
> > > +	 * because we can't do any serious cross-referencing with any of the
> > > +	 * btrees rooted in the AGF.  If the AGFL contents are obviously bad
> > > +	 * then we'll bail out.
> > > +	 */
> > > +	error = xfs_alloc_read_agfl(mp, sc->tp, sc->sa.agno, &agfl_bp);
> > > +	if (error)
> > > +		return error;
> > > +
> > > +	/*
> > > +	 * Spot-check the AGFL blocks; if they're obviously corrupt then
> > > +	 * there's nothing we can do but bail out.
> > > +	 */
> > 
> > Why? Can't we reset the agfl, or is that handled elsewhere?
> 
> It's handled in xrep_agfl, but userspace will have to call us again to
> fix the agfl and then call us a third time about the agf repair.
> 

Ok, this was one of the things I feel like I don't have enough context
on wrt to online repair: in general, what dependent structures we expect
to be consistent in order to repair some other interrelated structure.
Userspace repair is straightforward in this regard since we slurp the
whole fs into memory, adjust the global state as we go, then essentially
regenerate new metadata based on the finalized state.

For online repair, it sounds like we're potentially limited because
repair of one structure may always depend on some other subset of
metadata being consistent, right? If so, is the goal of online repair to
essentially "do the best we can but otherwise the most severe
corruptions may have to always fall back to xfs_repair?"

So in this particular case, we expect a sane agfl and otherwise buzz off
because 1.) this is a targeted agf repair request and 2.) we have a
separate request to deal with the agfl. It sounds like the smarts to
understand how we might have to jump back and forth between them is in
userspace, so the end-user doesn't necessarily have to understand the
dependency.

Brian

> (xfs_scrub does this, naturally...)
> 
> --D
> 
> > Brian
> > 
> > > +	error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(agf_bp), agfl_bp,
> > > +			xrep_agf_check_agfl_block, sc);
> > > +	if (error)
> > > +		return error;
> > > +
> > > +	/*
> > > +	 * Find the AGF btree roots.  This is also a chicken-and-egg situation;
> > > +	 * see the function for more details.
> > > +	 */
> > > +	error = xrep_agf_find_btrees(sc, agf_bp, fab, agfl_bp);
> > > +	if (error)
> > > +		return error;
> > > +
> > > +	/* Start rewriting the header and implant the btrees we found. */
> > > +	xrep_agf_init_header(sc, agf_bp, &old_agf);
> > > +	xrep_agf_set_roots(sc, agf, fab);
> > > +	error = xrep_agf_calc_from_btrees(sc, agf_bp);
> > > +	if (error)
> > > +		goto out_revert;
> > > +
> > > +	/* Commit the changes and reinitialize incore state. */
> > > +	return xrep_agf_commit_new(sc, agf_bp);
> > > +
> > > +out_revert:
> > > +	/* Mark the incore AGF state stale and revert the AGF. */
> > > +	sc->sa.pag->pagf_init = 0;
> > > +	memcpy(agf, &old_agf, sizeof(old_agf));
> > > +	return error;
> > > +}
> > > diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> > > index 85b048b341a0..17cf48564390 100644
> > > --- a/fs/xfs/scrub/repair.c
> > > +++ b/fs/xfs/scrub/repair.c
> > > @@ -128,9 +128,12 @@ xrep_roll_ag_trans(
> > >  	int			error;
> > >  
> > >  	/* Keep the AG header buffers locked so we can keep going. */
> > > -	xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
> > > -	xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
> > > -	xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
> > > +	if (sc->sa.agi_bp)
> > > +		xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
> > > +	if (sc->sa.agf_bp)
> > > +		xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
> > > +	if (sc->sa.agfl_bp)
> > > +		xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
> > >  
> > >  	/* Roll the transaction. */
> > >  	error = xfs_trans_roll(&sc->tp);
> > > @@ -138,9 +141,12 @@ xrep_roll_ag_trans(
> > >  		goto out_release;
> > >  
> > >  	/* Join AG headers to the new transaction. */
> > > -	xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
> > > -	xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
> > > -	xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
> > > +	if (sc->sa.agi_bp)
> > > +		xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
> > > +	if (sc->sa.agf_bp)
> > > +		xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
> > > +	if (sc->sa.agfl_bp)
> > > +		xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
> > >  
> > >  	return 0;
> > >  
> > > @@ -150,9 +156,12 @@ xrep_roll_ag_trans(
> > >  	 * buffers will be released during teardown on our way out
> > >  	 * of the kernel.
> > >  	 */
> > > -	xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
> > > -	xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
> > > -	xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
> > > +	if (sc->sa.agi_bp)
> > > +		xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
> > > +	if (sc->sa.agf_bp)
> > > +		xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
> > > +	if (sc->sa.agfl_bp)
> > > +		xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
> > >  
> > >  	return error;
> > >  }
> > > diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
> > > index 5a4e92221916..1d283360b5ab 100644
> > > --- a/fs/xfs/scrub/repair.h
> > > +++ b/fs/xfs/scrub/repair.h
> > > @@ -58,6 +58,8 @@ int xrep_ino_dqattach(struct xfs_scrub *sc);
> > >  
> > >  int xrep_probe(struct xfs_scrub *sc);
> > >  int xrep_superblock(struct xfs_scrub *sc);
> > > +int xrep_agf(struct xfs_scrub *sc);
> > > +int xrep_agfl(struct xfs_scrub *sc);
> > >  
> > >  #else
> > >  
> > > @@ -81,6 +83,8 @@ xrep_calc_ag_resblks(
> > >  
> > >  #define xrep_probe			xrep_notsupported
> > >  #define xrep_superblock			xrep_notsupported
> > > +#define xrep_agf			xrep_notsupported
> > > +#define xrep_agfl			xrep_notsupported
> > >  
> > >  #endif /* CONFIG_XFS_ONLINE_REPAIR */
> > >  
> > > diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
> > > index 6efb926f3cf8..1e8a17c8e2b9 100644
> > > --- a/fs/xfs/scrub/scrub.c
> > > +++ b/fs/xfs/scrub/scrub.c
> > > @@ -214,7 +214,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
> > >  		.type	= ST_PERAG,
> > >  		.setup	= xchk_setup_fs,
> > >  		.scrub	= xchk_agf,
> > > -		.repair	= xrep_notsupported,
> > > +		.repair	= xrep_agf,
> > >  	},
> > >  	[XFS_SCRUB_TYPE_AGFL]= {	/* agfl */
> > >  		.type	= ST_PERAG,
> > > 
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong July 27, 2018, 6:19 p.m. UTC | #4
On Fri, Jul 27, 2018 at 12:25:56PM -0400, Brian Foster wrote:
> On Fri, Jul 27, 2018 at 09:02:38AM -0700, Darrick J. Wong wrote:
> > On Fri, Jul 27, 2018 at 10:23:48AM -0400, Brian Foster wrote:
> > > On Wed, Jul 25, 2018 at 05:19:55PM -0700, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > 
> > > > Regenerate the AGF from the rmap data.
> > > > 
> > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > ---
> > > 
> > > Mostly seems sane to me. I still need to come up to speed on the broader
> > > xfs_scrub context. A few comments in the meantime..
> > 
> > <nod> Thanks for taking a look at this series. :)
> > 
> > > >  fs/xfs/scrub/agheader_repair.c |  366 ++++++++++++++++++++++++++++++++++++++++
> > > >  fs/xfs/scrub/repair.c          |   27 ++-
> > > >  fs/xfs/scrub/repair.h          |    4 
> > > >  fs/xfs/scrub/scrub.c           |    2 
> > > >  4 files changed, 389 insertions(+), 10 deletions(-)
> > > > 
> > > > 
> > > > diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
> > > > index 1e96621ece3a..938af216cb1c 100644
> > > > --- a/fs/xfs/scrub/agheader_repair.c
> > > > +++ b/fs/xfs/scrub/agheader_repair.c
> > > ...
> > > > @@ -54,3 +61,362 @@ xrep_superblock(
> > > >  	xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
> > > >  	return error;
> > > >  }
> > > ...
> > > > +/* Update all AGF fields which derive from btree contents. */
> > > > +STATIC int
> > > > +xrep_agf_calc_from_btrees(
> > > > +	struct xfs_scrub	*sc,
> > > > +	struct xfs_buf		*agf_bp)
> > > > +{
> > > > +	struct xrep_agf_allocbt	raa = { .sc = sc };
> > > > +	struct xfs_btree_cur	*cur = NULL;
> > > > +	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agf_bp);
> > > > +	struct xfs_mount	*mp = sc->mp;
> > > > +	xfs_agblock_t		btreeblks;
> > > > +	xfs_agblock_t		blocks;
> > > > +	int			error;
> > > > +
> > > > +	/* Update the AGF counters from the bnobt. */
> > > > +	cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
> > > > +			XFS_BTNUM_BNO);
> > > > +	error = xfs_alloc_query_all(cur, xrep_agf_walk_allocbt, &raa);
> > > > +	if (error)
> > > > +		goto err;
> > > > +	error = xfs_btree_count_blocks(cur, &blocks);
> > > > +	if (error)
> > > > +		goto err;
> > > > +	xfs_btree_del_cursor(cur, error);
> > > > +	btreeblks = blocks - 1;
> > > 
> > > Why the -1? We don't count the root or something?
> > 
> > The AGF btreeblks field only counts the number of blocks added to the
> > bno/cnt/rmapbt since they were initialized (each with a single root
> > block).  I find it a little strange not to count the root, but oh well.
> > 
> 
> Got it.
> 
> > > > +	agf->agf_freeblks = cpu_to_be32(raa.freeblks);
> > > > +	agf->agf_longest = cpu_to_be32(raa.longest);
> > > > +
> > > > +	/* Update the AGF counters from the cntbt. */
> > > > +	cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
> > > > +			XFS_BTNUM_CNT);
> > > > +	error = xfs_btree_count_blocks(cur, &blocks);
> > > > +	if (error)
> > > > +		goto err;
> > > > +	xfs_btree_del_cursor(cur, error);
> > > > +	btreeblks += blocks - 1;
> > > > +
> > > > +	/* Update the AGF counters from the rmapbt. */
> > > > +	cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
> > > > +	error = xfs_btree_count_blocks(cur, &blocks);
> > > > +	if (error)
> > > > +		goto err;
> > > > +	xfs_btree_del_cursor(cur, error);
> > > > +	agf->agf_rmap_blocks = cpu_to_be32(blocks);
> > > > +	btreeblks += blocks - 1;
> > > > +
> > > > +	agf->agf_btreeblks = cpu_to_be32(btreeblks);
> > > > +
> > > > +	/* Update the AGF counters from the refcountbt. */
> > > > +	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
> > > > +		cur = xfs_refcountbt_init_cursor(mp, sc->tp, agf_bp,
> > > > +				sc->sa.agno, NULL);
> > > 
> > > FYI this fails to compile on for-next (dfops param has been removed).
> > 
> > Yeah, I'm working on a rebase to for-next (once I settle on the locking
> > question in hch's "reduce cow lookups" series).
> > 
> > > > +		error = xfs_btree_count_blocks(cur, &blocks);
> > > > +		if (error)
> > > > +			goto err;
> > > > +		xfs_btree_del_cursor(cur, error);
> > > > +		agf->agf_refcount_blocks = cpu_to_be32(blocks);
> > > > +	}
> > > > +
> > > > +	return 0;
> > > > +err:
> > > > +	xfs_btree_del_cursor(cur, error);
> > > > +	return error;
> > > > +}
> > > > +
> ...
> > > > +/* Repair the AGF. v5 filesystems only. */
> > > > +int
> > > > +xrep_agf(
> > > > +	struct xfs_scrub		*sc)
> > > > +{
> > > > +	struct xrep_find_ag_btree	fab[XREP_AGF_MAX] = {
> > > > +		[XREP_AGF_BNOBT] = {
> > > > +			.rmap_owner = XFS_RMAP_OWN_AG,
> > > > +			.buf_ops = &xfs_allocbt_buf_ops,
> > > > +			.magic = XFS_ABTB_CRC_MAGIC,
> > > > +		},
> > > > +		[XREP_AGF_CNTBT] = {
> > > > +			.rmap_owner = XFS_RMAP_OWN_AG,
> > > > +			.buf_ops = &xfs_allocbt_buf_ops,
> > > > +			.magic = XFS_ABTC_CRC_MAGIC,
> > > > +		},
> > > > +		[XREP_AGF_RMAPBT] = {
> > > > +			.rmap_owner = XFS_RMAP_OWN_AG,
> > > > +			.buf_ops = &xfs_rmapbt_buf_ops,
> > > > +			.magic = XFS_RMAP_CRC_MAGIC,
> > > > +		},
> > > > +		[XREP_AGF_REFCOUNTBT] = {
> > > > +			.rmap_owner = XFS_RMAP_OWN_REFC,
> > > > +			.buf_ops = &xfs_refcountbt_buf_ops,
> > > > +			.magic = XFS_REFC_CRC_MAGIC,
> > > > +		},
> > > > +		[XREP_AGF_END] = {
> > > > +			.buf_ops = NULL,
> > > > +		},
> > > > +	};
> > > > +	struct xfs_agf			old_agf;
> > > > +	struct xfs_mount		*mp = sc->mp;
> > > > +	struct xfs_buf			*agf_bp;
> > > > +	struct xfs_buf			*agfl_bp;
> > > > +	struct xfs_agf			*agf;
> > > > +	int				error;
> > > > +
> > > > +	/* We require the rmapbt to rebuild anything. */
> > > > +	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
> > > > +		return -EOPNOTSUPP;
> > > > +
> > > > +	xchk_perag_get(sc->mp, &sc->sa);
> > > > +	error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
> > > > +			XFS_AG_DADDR(mp, sc->sa.agno, XFS_AGF_DADDR(mp)),
> > > > +			XFS_FSS_TO_BB(mp, 1), 0, &agf_bp, NULL);
> > > > +	if (error)
> > > > +		return error;
> > > > +	agf_bp->b_ops = &xfs_agf_buf_ops;
> > > 
> > > Any reason we don't call xfs_read_agf() here? It looks like we use the
> > > similar helper for the agfl below.
> > 
> > We're grabbing the agf buffer without read verifiers so that we can
> > reinitialize it.  Note that scrub tries xfs_read_agf, and if it fails
> > with -EFSCORRUPTED/-EFSBADCRC it marks the agf as corrupt, so it's
> > possible that sc->sa.sa_agf is still null.
> > 
> 
> Ah, makes sense. I missed the NULL b_ops..
> 
> > This probably could have been trans_get_buf though...
> > 
> > > > +	agf = XFS_BUF_TO_AGF(agf_bp);
> > > > +
> > > > +	/*
> > > > +	 * Load the AGFL so that we can screen out OWN_AG blocks that are on
> > > > +	 * the AGFL now; these blocks might have once been part of the
> > > > +	 * bno/cnt/rmap btrees but are not now.  This is a chicken and egg
> > > > +	 * problem: the AGF is corrupt, so we have to trust the AGFL contents
> > > > +	 * because we can't do any serious cross-referencing with any of the
> > > > +	 * btrees rooted in the AGF.  If the AGFL contents are obviously bad
> > > > +	 * then we'll bail out.
> > > > +	 */
> > > > +	error = xfs_alloc_read_agfl(mp, sc->tp, sc->sa.agno, &agfl_bp);
> > > > +	if (error)
> > > > +		return error;
> > > > +
> > > > +	/*
> > > > +	 * Spot-check the AGFL blocks; if they're obviously corrupt then
> > > > +	 * there's nothing we can do but bail out.
> > > > +	 */
> > > 
> > > Why? Can't we reset the agfl, or is that handled elsewhere?
> > 
> > It's handled in xrep_agfl, but userspace will have to call us again to
> > fix the agfl and then call us a third time about the agf repair.
> > 
> 
> Ok, this was one of the things I feel like I don't have enough context
> on wrt to online repair: in general, what dependent structures we expect
> to be consistent in order to repair some other interrelated structure.
> Userspace repair is straightforward in this regard since we slurp the
> whole fs into memory, adjust the global state as we go, then essentially
> regenerate new metadata based on the finalized state.

<nod>

> For online repair, it sounds like we're potentially limited because
> repair of one structure may always depend on some other subset of
> metadata being consistent, right?

Correct.  There's an implicit assumption here (which is coded into a
warning message in xfs_scrub) that if the primary and secondary metadata
(rmapbt usually) are corrupt then the fs may be unrecoverable online...

> If so, is the goal of online repair to essentially "do the best we can
> but otherwise the most severe corruptions may have to always fall back
> to xfs_repair?"

...and so the only recourse is xfs_repair, with the usual caveat that a
severely damaged filesystem might be a total loss.  If the online repair
fails then the fs will be shut down (either due to cancelling a dirty
repair transaction or because xfs_scrub can call FS_IOC_SHUTDOWN after a
failure).

> So in this particular case, we expect a sane agfl and otherwise buzz off
> because 1.) this is a targeted agf repair request and 2.) we have a
> separate request to deal with the agfl. It sounds like the smarts to
> understand how we might have to jump back and forth between them is in
> userspace, so the end-user doesn't necessarily have to understand the
> dependency.

Correct.  Userspace should be running xfs_scrub, which knows in which
order metadata has to be checked, and what dependencies must be
satisfied for repairs to succeed.  While it's possible to invoke it
manually via xfs_io, in practice nobody but xfstests should be using
that method of invocation.

--D

> Brian
> 
> > (xfs_scrub does this, naturally...)
> > 
> > --D
> > 
> > > Brian
> > > 
> > > > +	error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(agf_bp), agfl_bp,
> > > > +			xrep_agf_check_agfl_block, sc);
> > > > +	if (error)
> > > > +		return error;
> > > > +
> > > > +	/*
> > > > +	 * Find the AGF btree roots.  This is also a chicken-and-egg situation;
> > > > +	 * see the function for more details.
> > > > +	 */
> > > > +	error = xrep_agf_find_btrees(sc, agf_bp, fab, agfl_bp);
> > > > +	if (error)
> > > > +		return error;
> > > > +
> > > > +	/* Start rewriting the header and implant the btrees we found. */
> > > > +	xrep_agf_init_header(sc, agf_bp, &old_agf);
> > > > +	xrep_agf_set_roots(sc, agf, fab);
> > > > +	error = xrep_agf_calc_from_btrees(sc, agf_bp);
> > > > +	if (error)
> > > > +		goto out_revert;
> > > > +
> > > > +	/* Commit the changes and reinitialize incore state. */
> > > > +	return xrep_agf_commit_new(sc, agf_bp);
> > > > +
> > > > +out_revert:
> > > > +	/* Mark the incore AGF state stale and revert the AGF. */
> > > > +	sc->sa.pag->pagf_init = 0;
> > > > +	memcpy(agf, &old_agf, sizeof(old_agf));
> > > > +	return error;
> > > > +}
> > > > diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> > > > index 85b048b341a0..17cf48564390 100644
> > > > --- a/fs/xfs/scrub/repair.c
> > > > +++ b/fs/xfs/scrub/repair.c
> > > > @@ -128,9 +128,12 @@ xrep_roll_ag_trans(
> > > >  	int			error;
> > > >  
> > > >  	/* Keep the AG header buffers locked so we can keep going. */
> > > > -	xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
> > > > -	xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
> > > > -	xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
> > > > +	if (sc->sa.agi_bp)
> > > > +		xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
> > > > +	if (sc->sa.agf_bp)
> > > > +		xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
> > > > +	if (sc->sa.agfl_bp)
> > > > +		xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
> > > >  
> > > >  	/* Roll the transaction. */
> > > >  	error = xfs_trans_roll(&sc->tp);
> > > > @@ -138,9 +141,12 @@ xrep_roll_ag_trans(
> > > >  		goto out_release;
> > > >  
> > > >  	/* Join AG headers to the new transaction. */
> > > > -	xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
> > > > -	xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
> > > > -	xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
> > > > +	if (sc->sa.agi_bp)
> > > > +		xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
> > > > +	if (sc->sa.agf_bp)
> > > > +		xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
> > > > +	if (sc->sa.agfl_bp)
> > > > +		xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
> > > >  
> > > >  	return 0;
> > > >  
> > > > @@ -150,9 +156,12 @@ xrep_roll_ag_trans(
> > > >  	 * buffers will be released during teardown on our way out
> > > >  	 * of the kernel.
> > > >  	 */
> > > > -	xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
> > > > -	xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
> > > > -	xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
> > > > +	if (sc->sa.agi_bp)
> > > > +		xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
> > > > +	if (sc->sa.agf_bp)
> > > > +		xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
> > > > +	if (sc->sa.agfl_bp)
> > > > +		xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
> > > >  
> > > >  	return error;
> > > >  }
> > > > diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
> > > > index 5a4e92221916..1d283360b5ab 100644
> > > > --- a/fs/xfs/scrub/repair.h
> > > > +++ b/fs/xfs/scrub/repair.h
> > > > @@ -58,6 +58,8 @@ int xrep_ino_dqattach(struct xfs_scrub *sc);
> > > >  
> > > >  int xrep_probe(struct xfs_scrub *sc);
> > > >  int xrep_superblock(struct xfs_scrub *sc);
> > > > +int xrep_agf(struct xfs_scrub *sc);
> > > > +int xrep_agfl(struct xfs_scrub *sc);
> > > >  
> > > >  #else
> > > >  
> > > > @@ -81,6 +83,8 @@ xrep_calc_ag_resblks(
> > > >  
> > > >  #define xrep_probe			xrep_notsupported
> > > >  #define xrep_superblock			xrep_notsupported
> > > > +#define xrep_agf			xrep_notsupported
> > > > +#define xrep_agfl			xrep_notsupported
> > > >  
> > > >  #endif /* CONFIG_XFS_ONLINE_REPAIR */
> > > >  
> > > > diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
> > > > index 6efb926f3cf8..1e8a17c8e2b9 100644
> > > > --- a/fs/xfs/scrub/scrub.c
> > > > +++ b/fs/xfs/scrub/scrub.c
> > > > @@ -214,7 +214,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
> > > >  		.type	= ST_PERAG,
> > > >  		.setup	= xchk_setup_fs,
> > > >  		.scrub	= xchk_agf,
> > > > -		.repair	= xrep_notsupported,
> > > > +		.repair	= xrep_agf,
> > > >  	},
> > > >  	[XFS_SCRUB_TYPE_AGFL]= {	/* agfl */
> > > >  		.type	= ST_PERAG,
> > > > 
> > > > --
> > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > the body of a message to majordomo@vger.kernel.org
> > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox series

Patch

diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 1e96621ece3a..938af216cb1c 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -17,12 +17,19 @@ 
 #include "xfs_sb.h"
 #include "xfs_inode.h"
 #include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
 #include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
 #include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_refcount_btree.h"
 #include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
 
 /* Superblock */
 
@@ -54,3 +61,362 @@  xrep_superblock(
 	xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
 	return error;
 }
+
+/* AGF */
+
+struct xrep_agf_allocbt {
+	struct xfs_scrub	*sc;
+	xfs_agblock_t		freeblks;
+	xfs_agblock_t		longest;
+};
+
+/* Record free space shape information. */
+STATIC int
+xrep_agf_walk_allocbt(
+	struct xfs_btree_cur		*cur,
+	struct xfs_alloc_rec_incore	*rec,
+	void				*priv)
+{
+	struct xrep_agf_allocbt		*raa = priv;
+	int				error = 0;
+
+	if (xchk_should_terminate(raa->sc, &error))
+		return error;
+
+	raa->freeblks += rec->ar_blockcount;
+	if (rec->ar_blockcount > raa->longest)
+		raa->longest = rec->ar_blockcount;
+	return error;
+}
+
+/* Does this AGFL block look sane? */
+STATIC int
+xrep_agf_check_agfl_block(
+	struct xfs_mount	*mp,
+	xfs_agblock_t		agbno,
+	void			*priv)
+{
+	struct xfs_scrub	*sc = priv;
+
+	if (!xfs_verify_agbno(mp, sc->sa.agno, agbno))
+		return -EFSCORRUPTED;
+	return 0;
+}
+
+/*
+ * Offset within the xrep_find_ag_btree array for each btree type.  Avoid the
+ * XFS_BTNUM_ names here to avoid creating a sparse array.
+ */
+enum {
+	XREP_AGF_BNOBT = 0,
+	XREP_AGF_CNTBT,
+	XREP_AGF_RMAPBT,
+	XREP_AGF_REFCOUNTBT,
+	XREP_AGF_END,
+	XREP_AGF_MAX
+};
+
+/*
+ * Given the btree roots described by *fab, find the roots, check them for
+ * sanity, and pass the root data back out via *fab.
+ *
+ * This is /also/ a chicken and egg problem because we have to use the rmapbt
+ * (rooted in the AGF) to find the btrees rooted in the AGF.  We also have no
+ * idea if the btrees make any sense.  If we hit obvious corruptions in those
+ * btrees we'll bail out.
+ */
+STATIC int
+xrep_agf_find_btrees(
+	struct xfs_scrub		*sc,
+	struct xfs_buf			*agf_bp,
+	struct xrep_find_ag_btree	*fab,
+	struct xfs_buf			*agfl_bp)
+{
+	struct xfs_agf			*old_agf = XFS_BUF_TO_AGF(agf_bp);
+	int				error;
+
+	/* Go find the root data. */
+	error = xrep_find_ag_btree_roots(sc, agf_bp, fab, agfl_bp);
+	if (error)
+		return error;
+
+	/* We must find the bnobt, cntbt, and rmapbt roots. */
+	if (fab[XREP_AGF_BNOBT].root == NULLAGBLOCK ||
+	    fab[XREP_AGF_BNOBT].height > XFS_BTREE_MAXLEVELS ||
+	    fab[XREP_AGF_CNTBT].root == NULLAGBLOCK ||
+	    fab[XREP_AGF_CNTBT].height > XFS_BTREE_MAXLEVELS ||
+	    fab[XREP_AGF_RMAPBT].root == NULLAGBLOCK ||
+	    fab[XREP_AGF_RMAPBT].height > XFS_BTREE_MAXLEVELS)
+		return -EFSCORRUPTED;
+
+	/*
+	 * We relied on the rmapbt to reconstruct the AGF.  If we get a
+	 * different root then something's seriously wrong.
+	 */
+	if (fab[XREP_AGF_RMAPBT].root !=
+	    be32_to_cpu(old_agf->agf_roots[XFS_BTNUM_RMAPi]))
+		return -EFSCORRUPTED;
+
+	/* We must find the refcountbt root if that feature is enabled. */
+	if (xfs_sb_version_hasreflink(&sc->mp->m_sb) &&
+	    (fab[XREP_AGF_REFCOUNTBT].root == NULLAGBLOCK ||
+	     fab[XREP_AGF_REFCOUNTBT].height > XFS_BTREE_MAXLEVELS))
+		return -EFSCORRUPTED;
+
+	return 0;
+}
+
+/*
+ * Reinitialize the AGF header, making an in-core copy of the old contents so
+ * that we know which in-core state needs to be reinitialized.
+ */
+STATIC void
+xrep_agf_init_header(
+	struct xfs_scrub	*sc,
+	struct xfs_buf		*agf_bp,
+	struct xfs_agf		*old_agf)
+{
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agf_bp);
+
+	memcpy(old_agf, agf, sizeof(*old_agf));
+	memset(agf, 0, BBTOB(agf_bp->b_length));
+	agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
+	agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
+	agf->agf_seqno = cpu_to_be32(sc->sa.agno);
+	agf->agf_length = cpu_to_be32(xfs_ag_block_count(mp, sc->sa.agno));
+	agf->agf_flfirst = old_agf->agf_flfirst;
+	agf->agf_fllast = old_agf->agf_fllast;
+	agf->agf_flcount = old_agf->agf_flcount;
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
+
+	/* Mark the incore AGF data stale until we're done fixing things. */
+	ASSERT(sc->sa.pag->pagf_init);
+	sc->sa.pag->pagf_init = 0;
+}
+
+/* Set btree root information in an AGF. */
+STATIC void
+xrep_agf_set_roots(
+	struct xfs_scrub		*sc,
+	struct xfs_agf			*agf,
+	struct xrep_find_ag_btree	*fab)
+{
+	agf->agf_roots[XFS_BTNUM_BNOi] =
+			cpu_to_be32(fab[XREP_AGF_BNOBT].root);
+	agf->agf_levels[XFS_BTNUM_BNOi] =
+			cpu_to_be32(fab[XREP_AGF_BNOBT].height);
+
+	agf->agf_roots[XFS_BTNUM_CNTi] =
+			cpu_to_be32(fab[XREP_AGF_CNTBT].root);
+	agf->agf_levels[XFS_BTNUM_CNTi] =
+			cpu_to_be32(fab[XREP_AGF_CNTBT].height);
+
+	agf->agf_roots[XFS_BTNUM_RMAPi] =
+			cpu_to_be32(fab[XREP_AGF_RMAPBT].root);
+	agf->agf_levels[XFS_BTNUM_RMAPi] =
+			cpu_to_be32(fab[XREP_AGF_RMAPBT].height);
+
+	if (xfs_sb_version_hasreflink(&sc->mp->m_sb)) {
+		agf->agf_refcount_root =
+				cpu_to_be32(fab[XREP_AGF_REFCOUNTBT].root);
+		agf->agf_refcount_level =
+				cpu_to_be32(fab[XREP_AGF_REFCOUNTBT].height);
+	}
+}
+
+/* Update all AGF fields which derive from btree contents. */
+STATIC int
+xrep_agf_calc_from_btrees(
+	struct xfs_scrub	*sc,
+	struct xfs_buf		*agf_bp)
+{
+	struct xrep_agf_allocbt	raa = { .sc = sc };
+	struct xfs_btree_cur	*cur = NULL;
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agf_bp);
+	struct xfs_mount	*mp = sc->mp;
+	xfs_agblock_t		btreeblks;
+	xfs_agblock_t		blocks;
+	int			error;
+
+	/* Update the AGF counters from the bnobt. */
+	cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
+			XFS_BTNUM_BNO);
+	error = xfs_alloc_query_all(cur, xrep_agf_walk_allocbt, &raa);
+	if (error)
+		goto err;
+	error = xfs_btree_count_blocks(cur, &blocks);
+	if (error)
+		goto err;
+	xfs_btree_del_cursor(cur, error);
+	btreeblks = blocks - 1;
+	agf->agf_freeblks = cpu_to_be32(raa.freeblks);
+	agf->agf_longest = cpu_to_be32(raa.longest);
+
+	/* Update the AGF counters from the cntbt. */
+	cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
+			XFS_BTNUM_CNT);
+	error = xfs_btree_count_blocks(cur, &blocks);
+	if (error)
+		goto err;
+	xfs_btree_del_cursor(cur, error);
+	btreeblks += blocks - 1;
+
+	/* Update the AGF counters from the rmapbt. */
+	cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
+	error = xfs_btree_count_blocks(cur, &blocks);
+	if (error)
+		goto err;
+	xfs_btree_del_cursor(cur, error);
+	agf->agf_rmap_blocks = cpu_to_be32(blocks);
+	btreeblks += blocks - 1;
+
+	agf->agf_btreeblks = cpu_to_be32(btreeblks);
+
+	/* Update the AGF counters from the refcountbt. */
+	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+		cur = xfs_refcountbt_init_cursor(mp, sc->tp, agf_bp,
+				sc->sa.agno, NULL);
+		error = xfs_btree_count_blocks(cur, &blocks);
+		if (error)
+			goto err;
+		xfs_btree_del_cursor(cur, error);
+		agf->agf_refcount_blocks = cpu_to_be32(blocks);
+	}
+
+	return 0;
+err:
+	xfs_btree_del_cursor(cur, error);
+	return error;
+}
+
+/* Commit the new AGF and reinitialize the incore state. */
+STATIC int
+xrep_agf_commit_new(
+	struct xfs_scrub	*sc,
+	struct xfs_buf		*agf_bp)
+{
+	struct xfs_perag	*pag;
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agf_bp);
+
+	/* Trigger fdblocks recalculation */
+	xfs_force_summary_recalc(sc->mp);
+
+	/* Write this to disk. */
+	xfs_trans_buf_set_type(sc->tp, agf_bp, XFS_BLFT_AGF_BUF);
+	xfs_trans_log_buf(sc->tp, agf_bp, 0, BBTOB(agf_bp->b_length) - 1);
+
+	/* Now reinitialize the in-core counters we changed. */
+	pag = sc->sa.pag;
+	sc->sa.pag->pagf_init = 1;
+	pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
+	pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
+	pag->pagf_longest = be32_to_cpu(agf->agf_longest);
+	pag->pagf_levels[XFS_BTNUM_BNOi] =
+			be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
+	pag->pagf_levels[XFS_BTNUM_CNTi] =
+			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
+	pag->pagf_levels[XFS_BTNUM_RMAPi] =
+			be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
+	pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
+
+	return 0;
+}
+
+/* Repair the AGF. v5 filesystems only. */
+int
+xrep_agf(
+	struct xfs_scrub		*sc)
+{
+	struct xrep_find_ag_btree	fab[XREP_AGF_MAX] = {
+		[XREP_AGF_BNOBT] = {
+			.rmap_owner = XFS_RMAP_OWN_AG,
+			.buf_ops = &xfs_allocbt_buf_ops,
+			.magic = XFS_ABTB_CRC_MAGIC,
+		},
+		[XREP_AGF_CNTBT] = {
+			.rmap_owner = XFS_RMAP_OWN_AG,
+			.buf_ops = &xfs_allocbt_buf_ops,
+			.magic = XFS_ABTC_CRC_MAGIC,
+		},
+		[XREP_AGF_RMAPBT] = {
+			.rmap_owner = XFS_RMAP_OWN_AG,
+			.buf_ops = &xfs_rmapbt_buf_ops,
+			.magic = XFS_RMAP_CRC_MAGIC,
+		},
+		[XREP_AGF_REFCOUNTBT] = {
+			.rmap_owner = XFS_RMAP_OWN_REFC,
+			.buf_ops = &xfs_refcountbt_buf_ops,
+			.magic = XFS_REFC_CRC_MAGIC,
+		},
+		[XREP_AGF_END] = {
+			.buf_ops = NULL,
+		},
+	};
+	struct xfs_agf			old_agf;
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_buf			*agf_bp;
+	struct xfs_buf			*agfl_bp;
+	struct xfs_agf			*agf;
+	int				error;
+
+	/* We require the rmapbt to rebuild anything. */
+	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+		return -EOPNOTSUPP;
+
+	xchk_perag_get(sc->mp, &sc->sa);
+	error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
+			XFS_AG_DADDR(mp, sc->sa.agno, XFS_AGF_DADDR(mp)),
+			XFS_FSS_TO_BB(mp, 1), 0, &agf_bp, NULL);
+	if (error)
+		return error;
+	agf_bp->b_ops = &xfs_agf_buf_ops;
+	agf = XFS_BUF_TO_AGF(agf_bp);
+
+	/*
+	 * Load the AGFL so that we can screen out OWN_AG blocks that are on
+	 * the AGFL now; these blocks might have once been part of the
+	 * bno/cnt/rmap btrees but are not now.  This is a chicken and egg
+	 * problem: the AGF is corrupt, so we have to trust the AGFL contents
+	 * because we can't do any serious cross-referencing with any of the
+	 * btrees rooted in the AGF.  If the AGFL contents are obviously bad
+	 * then we'll bail out.
+	 */
+	error = xfs_alloc_read_agfl(mp, sc->tp, sc->sa.agno, &agfl_bp);
+	if (error)
+		return error;
+
+	/*
+	 * Spot-check the AGFL blocks; if they're obviously corrupt then
+	 * there's nothing we can do but bail out.
+	 */
+	error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(agf_bp), agfl_bp,
+			xrep_agf_check_agfl_block, sc);
+	if (error)
+		return error;
+
+	/*
+	 * Find the AGF btree roots.  This is also a chicken-and-egg situation;
+	 * see the function for more details.
+	 */
+	error = xrep_agf_find_btrees(sc, agf_bp, fab, agfl_bp);
+	if (error)
+		return error;
+
+	/* Start rewriting the header and implant the btrees we found. */
+	xrep_agf_init_header(sc, agf_bp, &old_agf);
+	xrep_agf_set_roots(sc, agf, fab);
+	error = xrep_agf_calc_from_btrees(sc, agf_bp);
+	if (error)
+		goto out_revert;
+
+	/* Commit the changes and reinitialize incore state. */
+	return xrep_agf_commit_new(sc, agf_bp);
+
+out_revert:
+	/* Mark the incore AGF state stale and revert the AGF. */
+	sc->sa.pag->pagf_init = 0;
+	memcpy(agf, &old_agf, sizeof(old_agf));
+	return error;
+}
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 85b048b341a0..17cf48564390 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -128,9 +128,12 @@  xrep_roll_ag_trans(
 	int			error;
 
 	/* Keep the AG header buffers locked so we can keep going. */
-	xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
-	xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
-	xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
+	if (sc->sa.agi_bp)
+		xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
+	if (sc->sa.agf_bp)
+		xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
+	if (sc->sa.agfl_bp)
+		xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
 
 	/* Roll the transaction. */
 	error = xfs_trans_roll(&sc->tp);
@@ -138,9 +141,12 @@  xrep_roll_ag_trans(
 		goto out_release;
 
 	/* Join AG headers to the new transaction. */
-	xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
-	xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
-	xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
+	if (sc->sa.agi_bp)
+		xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
+	if (sc->sa.agf_bp)
+		xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
+	if (sc->sa.agfl_bp)
+		xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
 
 	return 0;
 
@@ -150,9 +156,12 @@  xrep_roll_ag_trans(
 	 * buffers will be released during teardown on our way out
 	 * of the kernel.
 	 */
-	xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
-	xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
-	xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
+	if (sc->sa.agi_bp)
+		xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
+	if (sc->sa.agf_bp)
+		xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
+	if (sc->sa.agfl_bp)
+		xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
 
 	return error;
 }
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 5a4e92221916..1d283360b5ab 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -58,6 +58,8 @@  int xrep_ino_dqattach(struct xfs_scrub *sc);
 
 int xrep_probe(struct xfs_scrub *sc);
 int xrep_superblock(struct xfs_scrub *sc);
+int xrep_agf(struct xfs_scrub *sc);
+int xrep_agfl(struct xfs_scrub *sc);
 
 #else
 
@@ -81,6 +83,8 @@  xrep_calc_ag_resblks(
 
 #define xrep_probe			xrep_notsupported
 #define xrep_superblock			xrep_notsupported
+#define xrep_agf			xrep_notsupported
+#define xrep_agfl			xrep_notsupported
 
 #endif /* CONFIG_XFS_ONLINE_REPAIR */
 
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 6efb926f3cf8..1e8a17c8e2b9 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -214,7 +214,7 @@  static const struct xchk_meta_ops meta_scrub_ops[] = {
 		.type	= ST_PERAG,
 		.setup	= xchk_setup_fs,
 		.scrub	= xchk_agf,
-		.repair	= xrep_notsupported,
+		.repair	= xrep_agf,
 	},
 	[XFS_SCRUB_TYPE_AGFL]= {	/* agfl */
 		.type	= ST_PERAG,