
[v2,06/22] xfs: add a repair helper to reset superblock counters

Message ID 20180518035623.GD23858@magnolia (mailing list archive)
State New, archived

Commit Message

Darrick J. Wong May 18, 2018, 3:56 a.m. UTC
From: Darrick J. Wong <darrick.wong@oracle.com>

Add a helper function to reset the superblock inode and block counters.
The AG rebuilding functions will need this to adjust the counts if they
need to change as part of recovering from corruption.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
---
v2: improve documentation
---
 fs/xfs/scrub/repair.c |   89 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/repair.h |    7 ++++
 fs/xfs/scrub/scrub.c  |    2 +
 fs/xfs/scrub/scrub.h  |    1 +
 4 files changed, 99 insertions(+)

--

Comments

Dave Chinner May 29, 2018, 3:28 a.m. UTC | #1
On Thu, May 17, 2018 at 08:56:23PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Add a helper function to reset the superblock inode and block counters.
> The AG rebuilding functions will need this to adjust the counts if they
> need to change as part of recovering from corruption.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
> ---
> v2: improve documentation
> ---
>  fs/xfs/scrub/repair.c |   89 +++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/scrub/repair.h |    7 ++++
>  fs/xfs/scrub/scrub.c  |    2 +
>  fs/xfs/scrub/scrub.h  |    1 +
>  4 files changed, 99 insertions(+)
> 
> diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> index 877488ce4bc8..4b95a15c0bd0 100644
> --- a/fs/xfs/scrub/repair.c
> +++ b/fs/xfs/scrub/repair.c
> @@ -1026,3 +1026,92 @@ xfs_repair_find_ag_btree_roots(
>  
>  	return error;
>  }
> +
> +/*
> + * Reset the superblock counters.
> + *
> + * If a repair function changes the inode or free block counters, it must set
> + * reset_counters to have this function reset the global counters.  Repair
> + * functions are responsible for resetting all other in-core state.  This
> + * function runs outside of transaction context after the repair context has
> + * been torn down, so if there's further filesystem corruption we'll error out
> + * to userspace and give userspace a chance to call back to fix the further
> + * errors.
> + */
> +int
> +xfs_repair_reset_counters(
> +	struct xfs_mount	*mp)
> +{
> +	struct xfs_buf		*agi_bp;
> +	struct xfs_buf		*agf_bp;
> +	struct xfs_agi		*agi;
> +	struct xfs_agf		*agf;
> +	xfs_agnumber_t		agno;
> +	xfs_ino_t		icount = 0;
> +	xfs_ino_t		ifree = 0;
> +	xfs_filblks_t		fdblocks = 0;
> +	int64_t			delta_icount;
> +	int64_t			delta_ifree;
> +	int64_t			delta_fdblocks;
> +	int			error;
> +
> +	trace_xfs_repair_reset_counters(mp);
> +
> +	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
> +		/* Count all the inodes... */
> +		error = xfs_ialloc_read_agi(mp, NULL, agno, &agi_bp);
> +		if (error)
> +			return error;
> +		agi = XFS_BUF_TO_AGI(agi_bp);
> +		icount += be32_to_cpu(agi->agi_count);
> +		ifree += be32_to_cpu(agi->agi_freecount);
> +		xfs_buf_relse(agi_bp);
> +
> +		/* Add up the free/freelist/bnobt/cntbt blocks... */
> +		error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp);
> +		if (error)
> +			return error;
> +		if (!agf_bp)
> +			return -ENOMEM;
> +		agf = XFS_BUF_TO_AGF(agf_bp);
> +		fdblocks += be32_to_cpu(agf->agf_freeblks);
> +		fdblocks += be32_to_cpu(agf->agf_flcount);
> +		fdblocks += be32_to_cpu(agf->agf_btreeblks);
> +		xfs_buf_relse(agf_bp);
> +	}
> +
> +	/*
> +	 * Reinitialize the counters.  The on-disk and in-core counters differ
> +	 * by the number of inodes/blocks reserved by the admin, the per-AG
> +	 * reservation, and any transactions in progress, so we have to
> +	 * account for that.  First we take the sb lock and update its
> +	 * counters...
> +	 */
> +	spin_lock(&mp->m_sb_lock);
> +	delta_icount = (int64_t)mp->m_sb.sb_icount - icount;
> +	delta_ifree = (int64_t)mp->m_sb.sb_ifree - ifree;
> +	delta_fdblocks = (int64_t)mp->m_sb.sb_fdblocks - fdblocks;
> +	mp->m_sb.sb_icount = icount;
> +	mp->m_sb.sb_ifree = ifree;
> +	mp->m_sb.sb_fdblocks = fdblocks;
> +	spin_unlock(&mp->m_sb_lock);

This seems racy to me? i.e. the per-ag counters can change while
we are summing them, and once we've summed them then sb counters
can change while we are waiting for the m_sb_lock. It looks to me
like the summed per-ag counters are not in any way coherent
with the superblock or the in-core per-CPU counters, so I'm
struggling to understand why this is safe?
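
To make the window concrete, here is a toy user-space sketch (hypothetical
throughout, not XFS code): a "mover" thread shifts one free block from AG 0
to AG 1 in two separate updates while the main thread sums the AGs one at a
time, the way the repair loop reads one AG header at a time, so the walk can
observe a total that never existed on disk:

/* Build with: cc -pthread -o agsum-race agsum-race.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NAGS	4

static atomic_long ag_free[NAGS];	/* stand-ins for agf_freeblks */
static atomic_bool done;

/* Move one free "block" from AG 0 to AG 1, over and over. */
static void *mover(void *arg)
{
	while (!atomic_load(&done)) {
		atomic_fetch_sub(&ag_free[0], 1);
		atomic_fetch_add(&ag_free[1], 1);
	}
	return NULL;
}

int main(void)
{
	pthread_t	tid;
	long		total, expected = 0;
	int		ag, i;

	for (ag = 0; ag < NAGS; ag++) {
		atomic_store(&ag_free[ag], 1000);
		expected += 1000;
	}

	pthread_create(&tid, NULL, mover, NULL);
	for (i = 0; i < 1000000; i++) {
		/* Sum the AGs one at a time, like the repair loop does. */
		total = 0;
		for (ag = 0; ag < NAGS; ag++)
			total += atomic_load(&ag_free[ag]);
		if (total != expected) {
			printf("incoherent sum %ld, expected %ld\n",
					total, expected);
			break;
		}
	}
	atomic_store(&done, true);
	pthread_join(tid, NULL);
	return 0;
}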

We can do this sort of summation at mount time (in
xfs_initialize_perag_data()) because the filesystem is running
single threaded while the summation is taking place and so nothing
is changing during the summation. The filesystem is active in this
case, so I don't think we can do the same thing here.

Also, it brought a question to mind because I haven't clearly noted
it happening yet: when do the xfs_perag counters get corrected? And
if they are already correct, why not just iterate the perag
counters?

Cheers,

Dave.
Darrick J. Wong May 29, 2018, 10:07 p.m. UTC | #2
On Tue, May 29, 2018 at 01:28:10PM +1000, Dave Chinner wrote:
> On Thu, May 17, 2018 at 08:56:23PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Add a helper function to reset the superblock inode and block counters.
> > The AG rebuilding functions will need this to adjust the counts if they
> > need to change as part of recovering from corruption.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
> > ---
> > v2: improve documentation
> > ---
> >  fs/xfs/scrub/repair.c |   89 +++++++++++++++++++++++++++++++++++++++++++++++++
> >  fs/xfs/scrub/repair.h |    7 ++++
> >  fs/xfs/scrub/scrub.c  |    2 +
> >  fs/xfs/scrub/scrub.h  |    1 +
> >  4 files changed, 99 insertions(+)
> > 
> > diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> > index 877488ce4bc8..4b95a15c0bd0 100644
> > --- a/fs/xfs/scrub/repair.c
> > +++ b/fs/xfs/scrub/repair.c
> > @@ -1026,3 +1026,92 @@ xfs_repair_find_ag_btree_roots(
> >  
> >  	return error;
> >  }
> > +
> > +/*
> > + * Reset the superblock counters.
> > + *
> > + * If a repair function changes the inode or free block counters, it must set
> > + * reset_counters to have this function reset the global counters.  Repair
> > + * functions are responsible for resetting all other in-core state.  This
> > + * function runs outside of transaction context after the repair context has
> > + * been torn down, so if there's further filesystem corruption we'll error out
> > + * to userspace and give userspace a chance to call back to fix the further
> > + * errors.
> > + */
> > +int
> > +xfs_repair_reset_counters(
> > +	struct xfs_mount	*mp)
> > +{
> > +	struct xfs_buf		*agi_bp;
> > +	struct xfs_buf		*agf_bp;
> > +	struct xfs_agi		*agi;
> > +	struct xfs_agf		*agf;
> > +	xfs_agnumber_t		agno;
> > +	xfs_ino_t		icount = 0;
> > +	xfs_ino_t		ifree = 0;
> > +	xfs_filblks_t		fdblocks = 0;
> > +	int64_t			delta_icount;
> > +	int64_t			delta_ifree;
> > +	int64_t			delta_fdblocks;
> > +	int			error;
> > +
> > +	trace_xfs_repair_reset_counters(mp);
> > +
> > +	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
> > +		/* Count all the inodes... */
> > +		error = xfs_ialloc_read_agi(mp, NULL, agno, &agi_bp);
> > +		if (error)
> > +			return error;
> > +		agi = XFS_BUF_TO_AGI(agi_bp);
> > +		icount += be32_to_cpu(agi->agi_count);
> > +		ifree += be32_to_cpu(agi->agi_freecount);
> > +		xfs_buf_relse(agi_bp);
> > +
> > +		/* Add up the free/freelist/bnobt/cntbt blocks... */
> > +		error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp);
> > +		if (error)
> > +			return error;
> > +		if (!agf_bp)
> > +			return -ENOMEM;
> > +		agf = XFS_BUF_TO_AGF(agf_bp);
> > +		fdblocks += be32_to_cpu(agf->agf_freeblks);
> > +		fdblocks += be32_to_cpu(agf->agf_flcount);
> > +		fdblocks += be32_to_cpu(agf->agf_btreeblks);
> > +		xfs_buf_relse(agf_bp);
> > +	}
> > +
> > +	/*
> > +	 * Reinitialize the counters.  The on-disk and in-core counters differ
> > +	 * by the number of inodes/blocks reserved by the admin, the per-AG
> > +	 * reservation, and any transactions in progress, so we have to
> > +	 * account for that.  First we take the sb lock and update its
> > +	 * counters...
> > +	 */
> > +	spin_lock(&mp->m_sb_lock);
> > +	delta_icount = (int64_t)mp->m_sb.sb_icount - icount;
> > +	delta_ifree = (int64_t)mp->m_sb.sb_ifree - ifree;
> > +	delta_fdblocks = (int64_t)mp->m_sb.sb_fdblocks - fdblocks;
> > +	mp->m_sb.sb_icount = icount;
> > +	mp->m_sb.sb_ifree = ifree;
> > +	mp->m_sb.sb_fdblocks = fdblocks;
> > +	spin_unlock(&mp->m_sb_lock);
> 
> This seems racy to me? i.e. the per-ag counters can change while
> we are summing them, and once we've summed them then sb counters
> can change while we are waiting for the m_sb_lock. It looks to me
> like the summed per-ag counters are not in any way coherent
> with the superblock or the in-core per-CPU counters, so I'm
> struggling to understand why this is safe?

Hmm, yes, I think this is racy too.  The purpose of this code is to
recompute the global counters from the AG counters after any operation
that modifies anything that would affect the icount/ifreecount/fdblocks
counters...

> We can do this sort of summation at mount time (in
> xfs_initialize_perag_data()) because the filesystem is running
> single threaded while the summation is taking place and so nothing
> is changing during the summation. The filesystem is active in this
> case, so I don't think we can do the same thing here.

...however, you're correct to point out that the fs must be quiesced
before we can actually do this.  In other words, I think the filesystem
has to be completely frozen before we can do this.  Perhaps it's better
to have the per-ag rebuilders fix only the per-ag counters and leave the
global counters alone.  Then add a new scrubber that checks the summary
counters and fixes them if necessary.
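
A rough sketch of the shape such a scrubber might take -- everything marked
hypothetical below is an assumption, not code from this series, and whether
scrub can safely call freeze_super() from its own context is precisely the
open question in this thread:

/*
 * Hedged sketch of a summary-counter scrubber.  xfs_scrub_fscounters,
 * xfs_scrub_sum_ag_counters, and xfs_scrub_set_corrupt are hypothetical
 * names; freeze_super/thaw_super and the per-cpu counters in struct
 * xfs_mount are real kernel interfaces.
 */
STATIC int
xfs_scrub_fscounters(
	struct xfs_scrub_context	*sc)
{
	struct xfs_mount		*mp = sc->mp;
	xfs_ino_t			icount, ifree;
	xfs_filblks_t			fdblocks;
	int				error;

	/* Freeze the fs so the counters cannot change underneath us. */
	error = freeze_super(mp->m_super);
	if (error)
		return error;

	/* Same AGI/AGF walk as in xfs_repair_reset_counters above. */
	error = xfs_scrub_sum_ag_counters(mp, &icount, &ifree, &fdblocks);
	if (error)
		goto out_thaw;

	/*
	 * With the fs quiesced the in-core counters should match the AG
	 * headers.  (This sketch glosses over the reserved pool and
	 * per-AG reservations that the real comparison must add back.)
	 */
	if (percpu_counter_sum(&mp->m_icount) != icount ||
	    percpu_counter_sum(&mp->m_ifree) != ifree ||
	    percpu_counter_sum(&mp->m_fdblocks) != fdblocks)
		xfs_scrub_set_corrupt(sc);

out_thaw:
	thaw_super(mp->m_super);
	return error;
}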

> Also, it brought a question to mind because I haven't clearly noted
> it happening yet: when do the xfs_perag counters get corrected? And
> if they are already correct, why not just iterate the perag
> counters?

The xfs_perag counters are updated by the AGF/AGI/inobt rebuild code.

--D

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Dave Chinner May 29, 2018, 10:24 p.m. UTC | #3
On Tue, May 29, 2018 at 03:07:16PM -0700, Darrick J. Wong wrote:
> On Tue, May 29, 2018 at 01:28:10PM +1000, Dave Chinner wrote:
> > On Thu, May 17, 2018 at 08:56:23PM -0700, Darrick J. Wong wrote:
> > > +	/*
> > > +	 * Reinitialize the counters.  The on-disk and in-core counters differ
> > > +	 * by the number of inodes/blocks reserved by the admin, the per-AG
> > > +	 * reservation, and any transactions in progress, so we have to
> > > +	 * account for that.  First we take the sb lock and update its
> > > +	 * counters...
> > > +	 */
> > > +	spin_lock(&mp->m_sb_lock);
> > > +	delta_icount = (int64_t)mp->m_sb.sb_icount - icount;
> > > +	delta_ifree = (int64_t)mp->m_sb.sb_ifree - ifree;
> > > +	delta_fdblocks = (int64_t)mp->m_sb.sb_fdblocks - fdblocks;
> > > +	mp->m_sb.sb_icount = icount;
> > > +	mp->m_sb.sb_ifree = ifree;
> > > +	mp->m_sb.sb_fdblocks = fdblocks;
> > > +	spin_unlock(&mp->m_sb_lock);
> > 
> > This seems racy to me? i.e. the per-ag counters can change while
> > we are summing them, and once we've summed them then sb counters
> > can change while we are waiting for the m_sb_lock. It looks to me
> > like the summed per-ag counters are not in any way coherent
> > with the superblock or the in-core per-CPU counters, so I'm
> > struggling to understand why this is safe?
> 
> Hmm, yes, I think this is racy too.  The purpose of this code is to
> recompute the global counters from the AG counters after any operation
> that modifies anything that would affect the icount/ifreecount/fdblocks
> counters...

*nod*

> > We can do this sort of summation at mount time (in
> > xfs_initialize_perag_data()) because the filesystem is running
> > single threaded while the summation is taking place and so nothing
> > is changing during the summation. The filesystem is active in this
> > case, so I don't think we can do the same thing here.
> 
> ...however, you're correct to point out that the fs must be quiesced
> before we can actually do this.  In other words, I think the filesystem
> has to be completely frozen before we can do this.  Perhaps it's better
> to have the per-ag rebuilders fix only the per-ag counters and leave the
> global counters alone.  Then add a new scrubber that checks the summary
> counters and fixes them if necessary.

So the question here is whether we actually need to accurately
correct the global superblock counters? We know that if we have a
dirty unmount, the counters will be re-initialised on mount from the
AG header information, so perhaps what we need here is a flag to
tell unmount to dirty the log again after it has written the unmount
record (like we currently do for quiesce).

That way we can do a racy "near enough" update here to get us out of
the worst of the space accounting mismatches, knowing that on the
next mount it will be accurately rebuilt.
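
Something along these lines, illustrative only -- the flag and the helper
name are assumptions, not code from this series:

/*
 * Hypothetical sketch: repair sets a flag after a racy "near enough"
 * update; unmount then declines to log a clean unmount record, so the
 * next mount re-derives the counters from the AG headers in
 * xfs_initialize_perag_data().
 */
static void
xfs_force_summary_recalc(
	struct xfs_mount	*mp)
{
	/* Only lazysbcount filesystems recompute counters at mount. */
	if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
		return;
	mp->m_flags |= XFS_MOUNT_BAD_SUMMARY;	/* hypothetical flag */
}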

Thoughts?

Cheers,

Dave.
Darrick J. Wong May 29, 2018, 10:43 p.m. UTC | #4
On Wed, May 30, 2018 at 08:24:28AM +1000, Dave Chinner wrote:
> On Tue, May 29, 2018 at 03:07:16PM -0700, Darrick J. Wong wrote:
> > On Tue, May 29, 2018 at 01:28:10PM +1000, Dave Chinner wrote:
> > > On Thu, May 17, 2018 at 08:56:23PM -0700, Darrick J. Wong wrote:
> > > > +	/*
> > > > +	 * Reinitialize the counters.  The on-disk and in-core counters differ
> > > > +	 * by the number of inodes/blocks reserved by the admin, the per-AG
> > > > +	 * reservation, and any transactions in progress, so we have to
> > > > +	 * account for that.  First we take the sb lock and update its
> > > > +	 * counters...
> > > > +	 */
> > > > +	spin_lock(&mp->m_sb_lock);
> > > > +	delta_icount = (int64_t)mp->m_sb.sb_icount - icount;
> > > > +	delta_ifree = (int64_t)mp->m_sb.sb_ifree - ifree;
> > > > +	delta_fdblocks = (int64_t)mp->m_sb.sb_fdblocks - fdblocks;
> > > > +	mp->m_sb.sb_icount = icount;
> > > > +	mp->m_sb.sb_ifree = ifree;
> > > > +	mp->m_sb.sb_fdblocks = fdblocks;
> > > > +	spin_unlock(&mp->m_sb_lock);
> > > 
> > > This seems racy to me? i.e. the per-ag counters can change while
> > > we are summing them, and once we've summed them then sb counters
> > > can change while we are waiting for the m_sb_lock. It looks to me
> > > like the summed per-ag counters are not in any way coherent
> > > with the superblock or the in-core per-CPU counters, so I'm
> > > struggling to understand why this is safe?
> > 
> > Hmm, yes, I think this is racy too.  The purpose of this code is to
> > recompute the global counters from the AG counters after any operation
> > that modifies anything that would affect the icount/ifreecount/fdblocks
> > counters...
> 
> *nod*
> 
> > > We can do this sort of summation at mount time (in
> > > xfs_initialize_perag_data()) because the filesystem is running
> > > single threaded while the summation is taking place and so nothing
> > > is changing during the summation. The filesystem is active in this
> > > case, so I don't think we can do the same thing here.
> > 
> > ...however, you're correct to point out that the fs must be quiesced
> > before we can actually do this.  In other words, I think the filesystem
> > has to be completely frozen before we can do this.  Perhaps it's better
> > to have the per-ag rebuilders fix only the per-ag counters and leave the
> > global counters alone.  Then add a new scrubber that checks the summary
> > counters and fixes them if necessary.
> 
> So the question here is whether we actually need to accurately
> correct the global superblock counters?

I think so, because what happens if the superblock counter is
artificially high but the AGs do not actually have the free space?
xfs_trans_reserve won't ENOSPC like it should, so we could end up
blowing out of transactions and shutting down because some allocation
that has to succeed ("because trans_reserve said there was space!")
fails...

> We know that if we have a dirty unmount, the counters will be
> re-initialised on mount from the AG header information, so perhaps
> what we need here is a flag to tell unmount to dirty the log again
> after it has written the unmount record (like we currently do for
> quiesce).

...but now that we've repaired the filesystem, it could potentially run
for a very long time until the next unmount.  During that run, we'd be
misleading users about the real amount of free space and risking a hard
shutdown.  I prefer that online repair try not to leave any weird state
around after xfs_scrub exits.

> That way we can do a racy "near enough" update here to get us out of
> the worst of the space accounting mismatches, knowing that on the
> next mount it will be accurately rebuilt.
>
> Thoughts?

Well, I think the best solution is to have the AGF/AGI/inobt rebuilders
adjust the global counters by the same amount that they're adjusting the
counters in the AGF/AGI, then add a new scrubber that runs at the end to
freeze the fs and check/repair the global counter state. :)

--D

> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Dave Chinner May 30, 2018, 1:23 a.m. UTC | #5
On Tue, May 29, 2018 at 03:43:32PM -0700, Darrick J. Wong wrote:
> On Wed, May 30, 2018 at 08:24:28AM +1000, Dave Chinner wrote:
> > On Tue, May 29, 2018 at 03:07:16PM -0700, Darrick J. Wong wrote:
> > > On Tue, May 29, 2018 at 01:28:10PM +1000, Dave Chinner wrote:
> > > > On Thu, May 17, 2018 at 08:56:23PM -0700, Darrick J. Wong wrote:
> > > > > +	/*
> > > > > +	 * Reinitialize the counters.  The on-disk and in-core counters differ
> > > > > +	 * by the number of inodes/blocks reserved by the admin, the per-AG
> > > > > +	 * reservation, and any transactions in progress, so we have to
> > > > > +	 * account for that.  First we take the sb lock and update its
> > > > > +	 * counters...
> > > > > +	 */
> > > > > +	spin_lock(&mp->m_sb_lock);
> > > > > +	delta_icount = (int64_t)mp->m_sb.sb_icount - icount;
> > > > > +	delta_ifree = (int64_t)mp->m_sb.sb_ifree - ifree;
> > > > > +	delta_fdblocks = (int64_t)mp->m_sb.sb_fdblocks - fdblocks;
> > > > > +	mp->m_sb.sb_icount = icount;
> > > > > +	mp->m_sb.sb_ifree = ifree;
> > > > > +	mp->m_sb.sb_fdblocks = fdblocks;
> > > > > +	spin_unlock(&mp->m_sb_lock);
> > > > 
> > > > This seems racy to me? i.e. the per-ag counters can change while
> > > > we are summing them, and once we've summed them then sb counters
> > > > can change while we are waiting for the m_sb_lock. It looks to me
> > > > like the summed per-ag counters are not in any way coherent
> > > > with the superblock or the in-core per-CPU counters, so I'm
> > > > struggling to understand why this is safe?
> > > 
> > > Hmm, yes, I think this is racy too.  The purpose of this code is to
> > > recompute the global counters from the AG counters after any operation
> > > that modifies anything that would affect the icount/ifreecount/fdblocks
> > > counters...
> > 
> > *nod*
> > 
> > > > We can do this sort of summation at mount time (in
> > > > xfs_initialize_perag_data()) because the filesystem is running
> > > > single threaded while the summation is taking place and so nothing
> > > > is changing during the summation. The filesystem is active in this
> > > > case, so I don't think we can do the same thing here.
> > > 
> > > ...however, you're correct to point out that the fs must be quiesced
> > > before we can actually do this.  In other words, I think the filesystem
> > > has to be completely frozen before we can do this.  Perhaps it's better
> > > to have the per-ag rebuilders fix only the per-ag counters and leave the
> > > global counters alone.  Then add a new scrubber that checks the summary
> > > counters and fixes them if necessary.
> > 
> > So the question here is whether we actually need to accurately
> > correct the global superblock counters?
> 
> I think so, because what happens if the superblock counter is
> artificially high but the AGs do not actually have the free space?
> xfs_trans_reserve won't ENOSPC like it should, so we could end up
> blowing out of transactions and shutting down because some allocation
> that has to succeed ("because trans_reserve said there was space!")
> fails...

Yes, but I would have thought the reset will get us close enough
that this wouldn't be an issue for the vast majority of people.

And the other side of it is that if we get close enough to ENOSPC
that it matters, we could freeze/sum/thaw to be fully accurate on
demand in xfs_trans_reserve(), right? We already slow down greatly
at ENOSPC, so at that point the perf overhead of a freeze/thaw cycle
just doesn't matter...

> > We know that if we have a dirty unmount, the counters will be
> > re-initialised on mount from the AG header information, so perhaps
> > what we need here is a flag to tell unmount to dirty the log again
> > after it has written the unmount record (like we currently do for
> > quiesce).
> 
> ...but now that we've repaired the filesystem, it could potentially run
> for a very long time until the next unmount.  During that run, we'd be
> misleading users about the real amount of free space and risking a hard
> shutdown.  I prefer that online repair try not to leave any weird state
> around after xfs_scrub exits.

Sure, but users may not want a freeze/read-all-ag-headers/thaw
cycle as part of repair if it can be avoided. If there are thousands
of AGs, this could take many seconds....

> > That way we can do a racy "near enough" update here to get us out of
> > the worst of the space accounting mismatches, knowing that on the
> > next mount it will be accurately rebuilt.
> >
> > Thoughts?
> 
> Well, I think the best solution is to have the AGF/AGI/inobt rebuilders
> adjust the global counters by the same amount that they're adjusting the
> counters in the AGF/AGI, then add a new scrubber that runs at the end to
> freeze the fs and check/repair the global counter state. :)

I'm just not convinced that we can get away with a global freeze to
do this summation without having noticeable impact on applications.

Cheers,

Dave.
Darrick J. Wong May 30, 2018, 3:22 a.m. UTC | #6
On Wed, May 30, 2018 at 11:23:33AM +1000, Dave Chinner wrote:
> On Tue, May 29, 2018 at 03:43:32PM -0700, Darrick J. Wong wrote:
> > On Wed, May 30, 2018 at 08:24:28AM +1000, Dave Chinner wrote:
> > > On Tue, May 29, 2018 at 03:07:16PM -0700, Darrick J. Wong wrote:
> > > > On Tue, May 29, 2018 at 01:28:10PM +1000, Dave Chinner wrote:
> > > > > On Thu, May 17, 2018 at 08:56:23PM -0700, Darrick J. Wong wrote:
> > > > > > +	/*
> > > > > > +	 * Reinitialize the counters.  The on-disk and in-core counters differ
> > > > > > +	 * by the number of inodes/blocks reserved by the admin, the per-AG
> > > > > > +	 * reservation, and any transactions in progress, so we have to
> > > > > > +	 * account for that.  First we take the sb lock and update its
> > > > > > +	 * counters...
> > > > > > +	 */
> > > > > > +	spin_lock(&mp->m_sb_lock);
> > > > > > +	delta_icount = (int64_t)mp->m_sb.sb_icount - icount;
> > > > > > +	delta_ifree = (int64_t)mp->m_sb.sb_ifree - ifree;
> > > > > > +	delta_fdblocks = (int64_t)mp->m_sb.sb_fdblocks - fdblocks;
> > > > > > +	mp->m_sb.sb_icount = icount;
> > > > > > +	mp->m_sb.sb_ifree = ifree;
> > > > > > +	mp->m_sb.sb_fdblocks = fdblocks;
> > > > > > +	spin_unlock(&mp->m_sb_lock);
> > > > > 
> > > > > This seems racy to me? i.e. the per-ag counters can change while
> > > > > we are summing them, and once we've summed them then sb counters
> > > > > can change while we are waiting for the m_sb_lock. It looks to me
> > > > > like the summed per-ag counters are not in any way coherent
> > > > > with the superblock or the in-core per-CPU counters, so I'm
> > > > > struggling to understand why this is safe?
> > > > 
> > > > Hmm, yes, I think this is racy too.  The purpose of this code is to
> > > > recompute the global counters from the AG counters after any operation
> > > > that modifies anything that would affect the icount/ifreecount/fdblocks
> > > > counters...
> > > 
> > > *nod*
> > > 
> > > > > We can do this sort of summation at mount time (in
> > > > > xfs_initialize_perag_data()) because the filesystem is running
> > > > > single threaded while the summation is taking place and so nothing
> > > > > is changing during the summation. The filesystem is active in this
> > > > > case, so I don't think we can do the same thing here.
> > > > 
> > > > ...however, you're correct to point out that the fs must be quiesced
> > > > before we can actually do this.  In other words, I think the filesystem
> > > > has to be completely frozen before we can do this.  Perhaps it's better
> > > > to have the per-ag rebuilders fix only the per-ag counters and leave the
> > > > global counters alone.  Then add a new scrubber that checks the summary
> > > > counters and fixes them if necessary.
> > > 
> > > So the question here is whether we actually need to accurately
> > > correct the global superblock counters?
> > 
> > I think so, because what happens if the superblock counter is
> > artificially high but the AGs do not actually have the free space?
> > xfs_trans_reserve won't ENOSPC like it should, so we could end up
> > blowing out of transactions and shutting down because some allocation
> > that has to succeed ("because trans_reserve said there was space!")
> > fails...
> 
> Yes, but I would have thought the reset will get us close enough
> that this wouldn't be an issue for the vast majority of people.

<nod> I'll adjust the sb counters based on the agf/agi/inobt adjustments
and we'll leave verifying and/or fixing the superblock counters as a
Future Research Topic(tm).
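
A minimal sketch of that delta-based adjustment, built from the xfs_mod_*()
helpers this patch already calls; the function name and signature are
assumptions:

/*
 * Hedged sketch: apply the same deltas to the global counters that a
 * rebuilder just applied to an AGF/AGI, instead of re-summing every AG.
 */
int
xfs_repair_mod_global_counters(
	struct xfs_mount	*mp,
	int64_t			delta_icount,
	int64_t			delta_ifree,
	int64_t			delta_fdblocks)
{
	int			error;

	if (delta_icount) {
		error = xfs_mod_icount(mp, delta_icount);
		if (error)
			return error;
	}
	if (delta_ifree) {
		error = xfs_mod_ifree(mp, delta_ifree);
		if (error)
			return error;
	}
	if (delta_fdblocks)
		return xfs_mod_fdblocks(mp, delta_fdblocks, false);
	return 0;
}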

> And the other side of it is that if we get close enough to ENOSPC
> that it matters, we could freeze/sum/thaw to be fully accurate on
> demand in xfs_trans_reserve(), right? We already slow down greatly
> at ENOSPC, so at that point the perf overhead of a freeze/thaw cycle
> just doesn't matter...
> 
> > > We know that if we have a dirty unmount, the counters will be
> > > re-initialised on mount from the AG header information, so perhaps
> > > what we need here is a flag to tell unmount to dirty the log again
> > > after it has written the unmount record (like we currently do for
> > > quiesce).
> > 
> > ...but now that we've repaired the filesystem, it could potentially run
> > for a very long time until the next unmount.  During that run, we'd be
> > misleading users about the real amount of free space and risking a hard
> > shutdown.  I prefer that online repair try not to leave any weird state
> > around after xfs_scrub exits.
> 
> Sure, but users may not want a freeze/read-all-ag-headers/thaw
> cycle as part of repair if it can be avoided. If there are thousands
> of AGs, this could take many seconds....
> 
> > > That way we can do a racy "near enough" update here to get us out of
> > > the worst of the space accounting mismatches, knowing that on the
> > > next mount it will be accurately rebuilt.
> > >
> > > Thoughts?
> > 
> > Well, I think the best solution is to have the AGF/AGI/inobt rebuilders
> > adjust the global counters by the same amount that they're adjusting the
> > counters in the AGF/AGI, then add a new scrubber that runs at the end to
> > freeze the fs and check/repair the global counter state. :)
> 
> I'm just not convinced that we can get away with a global freeze to
> do this summation without having noticeable impact on applications.

Admittedly, online repair has a semi-implicit design assumption that
either (a) it's running on fast enough storage that a bunch of random
IOs won't seriously harm performance or (b) whoever runs the client
program will throttle it to avoid starving regular operations, and (c)
repairs will not frequently be required.

Of course, the fsfreeze repairs totally blow (b) out of the water, which
means that in the long run I'm going to have to find a way to shorten the
runtime of those repair types (rmap, quota).  If (a) is true then maybe
we can parallelize some of the AG accesses to reduce freeze time.  I'm
hoping that will reduce the pain of such things, though.  $god help the
users on floppy disks.

--D

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com

Patch

diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 877488ce4bc8..4b95a15c0bd0 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -1026,3 +1026,92 @@  xfs_repair_find_ag_btree_roots(
 
 	return error;
 }
+
+/*
+ * Reset the superblock counters.
+ *
+ * If a repair function changes the inode or free block counters, it must set
+ * reset_counters to have this function reset the global counters.  Repair
+ * functions are responsible for resetting all other in-core state.  This
+ * function runs outside of transaction context after the repair context has
+ * been torn down, so if there's further filesystem corruption we'll error out
+ * to userspace and give userspace a chance to call back to fix the further
+ * errors.
+ */
+int
+xfs_repair_reset_counters(
+	struct xfs_mount	*mp)
+{
+	struct xfs_buf		*agi_bp;
+	struct xfs_buf		*agf_bp;
+	struct xfs_agi		*agi;
+	struct xfs_agf		*agf;
+	xfs_agnumber_t		agno;
+	xfs_ino_t		icount = 0;
+	xfs_ino_t		ifree = 0;
+	xfs_filblks_t		fdblocks = 0;
+	int64_t			delta_icount;
+	int64_t			delta_ifree;
+	int64_t			delta_fdblocks;
+	int			error;
+
+	trace_xfs_repair_reset_counters(mp);
+
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		/* Count all the inodes... */
+		error = xfs_ialloc_read_agi(mp, NULL, agno, &agi_bp);
+		if (error)
+			return error;
+		agi = XFS_BUF_TO_AGI(agi_bp);
+		icount += be32_to_cpu(agi->agi_count);
+		ifree += be32_to_cpu(agi->agi_freecount);
+		xfs_buf_relse(agi_bp);
+
+		/* Add up the free/freelist/bnobt/cntbt blocks... */
+		error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp);
+		if (error)
+			return error;
+		if (!agf_bp)
+			return -ENOMEM;
+		agf = XFS_BUF_TO_AGF(agf_bp);
+		fdblocks += be32_to_cpu(agf->agf_freeblks);
+		fdblocks += be32_to_cpu(agf->agf_flcount);
+		fdblocks += be32_to_cpu(agf->agf_btreeblks);
+		xfs_buf_relse(agf_bp);
+	}
+
+	/*
+	 * Reinitialize the counters.  The on-disk and in-core counters differ
+	 * by the number of inodes/blocks reserved by the admin, the per-AG
+	 * reservation, and any transactions in progress, so we have to
+	 * account for that.  First we take the sb lock and update its
+	 * counters...
+	 */
+	spin_lock(&mp->m_sb_lock);
+	delta_icount = (int64_t)mp->m_sb.sb_icount - icount;
+	delta_ifree = (int64_t)mp->m_sb.sb_ifree - ifree;
+	delta_fdblocks = (int64_t)mp->m_sb.sb_fdblocks - fdblocks;
+	mp->m_sb.sb_icount = icount;
+	mp->m_sb.sb_ifree = ifree;
+	mp->m_sb.sb_fdblocks = fdblocks;
+	spin_unlock(&mp->m_sb_lock);
+
+	/* ...and then update the per-cpu counters. */
+	if (delta_icount) {
+		error = xfs_mod_icount(mp, delta_icount);
+		if (error)
+			return error;
+	}
+	if (delta_ifree) {
+		error = xfs_mod_ifree(mp, delta_ifree);
+		if (error)
+			return error;
+	}
+	if (delta_fdblocks) {
+		error = xfs_mod_fdblocks(mp, delta_fdblocks, false);
+		if (error)
+			return error;
+	}
+
+	return 0;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index c922ef06b894..cc590312550a 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -96,6 +96,7 @@  int xfs_repair_find_ag_btree_roots(struct xfs_scrub_context *sc,
 		struct xfs_buf *agf_bp,
 		struct xfs_repair_find_ag_btree *btree_info,
 		struct xfs_buf *agfl_bp);
+int xfs_repair_reset_counters(struct xfs_mount *mp);
 
 /* Metadata repairers */
 
@@ -121,6 +122,12 @@  xfs_repair_calc_ag_resblks(
 	return 0;
 }
 
+static inline int xfs_repair_reset_counters(struct xfs_mount *mp)
+{
+	ASSERT(0);
+	return -EIO;
+}
+
 #define xfs_repair_probe		xfs_repair_notsupported
 
 #endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index c5999c28c20c..bf5e8dd66133 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -200,6 +200,8 @@  xfs_scrub_teardown(
 		kmem_free(sc->buf);
 		sc->buf = NULL;
 	}
+	if (sc->reset_counters && !error)
+		error = xfs_repair_reset_counters(sc->mp);
 	return error;
 }
 
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 636424d5e2ee..52b2be2df143 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -78,6 +78,7 @@  struct xfs_scrub_context {
 	uint				ilock_flags;
 	bool				try_harder;
 	bool				has_quotaofflock;
+	bool				reset_counters;
 
 	/* State tracking for single-AG operations. */
 	struct xfs_scrub_ag		sa;