[RFC] xfs: convert between packed and unpacked agfls on-demand
diff mbox

Message ID 20180312173552.GA30332@bfoster.bfoster
State New
Headers show

Commit Message

Brian Foster March 12, 2018, 5:35 p.m. UTC
On Mon, Mar 12, 2018 at 09:11:58AM -0400, Brian Foster wrote:
> On Fri, Mar 09, 2018 at 02:37:57PM -0800, Darrick J. Wong wrote:
> > On Fri, Mar 09, 2018 at 04:20:31PM -0500, Brian Foster wrote:
> > > On Fri, Mar 09, 2018 at 11:08:32AM -0800, Darrick J. Wong wrote:
> > > > On Fri, Mar 09, 2018 at 01:37:28PM -0500, Brian Foster wrote:
> > > > > On Fri, Mar 09, 2018 at 09:33:18AM -0800, Darrick J. Wong wrote:
...
> > <nod> I've been mulling over rewriting your previous rfc patch that put
> > all the scanning/lifting in {get,put}_freelist but having it reset the
> > agfl instead of doing the swizzling stuff.
> > 
> 
> Something to be careful of... emptying the agfl obviously means the
> subsequent fixup is a btree block allocation. That may limit the context
> of where the fixup can be performed. IOW, deferring it to
> _get_freelist() might no longer be safe. Instead, I think we'd have to
> implement it such that the on-disk flcount is completely untrusted when
> the mismatch is detected.
> 
...

Here's a variant of that patch that does a reset. It's definitely much
simpler. Thoughts?

Brian

--- 8< ---

--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Dave Chinner March 12, 2018, 9:14 p.m. UTC | #1
On Mon, Mar 12, 2018 at 01:35:53PM -0400, Brian Foster wrote:
> On Mon, Mar 12, 2018 at 09:11:58AM -0400, Brian Foster wrote:
> > On Fri, Mar 09, 2018 at 02:37:57PM -0800, Darrick J. Wong wrote:
> > > On Fri, Mar 09, 2018 at 04:20:31PM -0500, Brian Foster wrote:
> > > > On Fri, Mar 09, 2018 at 11:08:32AM -0800, Darrick J. Wong wrote:
> > > > > On Fri, Mar 09, 2018 at 01:37:28PM -0500, Brian Foster wrote:
> > > > > > On Fri, Mar 09, 2018 at 09:33:18AM -0800, Darrick J. Wong wrote:
> ...
> > > <nod> I've been mulling over rewriting your previous rfc patch that put
> > > all the scanning/lifting in {get,put}_freelist but having it reset the
> > > agfl instead of doing the swizzling stuff.
> > > 
> > 
> > Something to be careful of... emptying the agfl obviously means the
> > subsequent fixup is a btree block allocation. That may limit the context
> > of where the fixup can be performed. IOW, deferring it to
> > _get_freelist() might no longer be safe. Instead, I think we'd have to
> > implement it such that the on-disk flcount is completely untrusted when
> > the mismatch is detected.
> > 
> ...
> 
> Here's a variant of that patch that does a reset. It's definitely much
> simpler. Thoughts?

I like it - it is so much simpler than the other proposals, and it's
done on demand rather than by a brute-force mount/unmount scan.

> Brian
> 
> --- 8< ---
> 
> diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
> index c02781a4c091..7d313bb4677d 100644
> --- a/fs/xfs/libxfs/xfs_alloc.c
> +++ b/fs/xfs/libxfs/xfs_alloc.c
> @@ -2053,6 +2053,59 @@ xfs_alloc_space_available(
>  	return true;
>  }
>  
> +static bool
> +xfs_agf_verify_flcount(
> +	struct xfs_mount	*mp,
> +	struct xfs_agf		*agf)

Needs a comment explaining what it's checking and the return value.
Might actually read better as "xfs_agf_need_flcount_reset()"
returning true if fixup is required, especially at the caller
site...

> +{
> +	int			f = be32_to_cpu(agf->agf_flfirst);
> +	int			l = be32_to_cpu(agf->agf_fllast);
> +	int			c = be32_to_cpu(agf->agf_flcount);

These should probably be uint32_t - they are unsigned on disk.

> +	int			active = c;
> +	int			agfl_size = XFS_AGFL_SIZE(mp);
> +
> +	if (!xfs_sb_version_hascrc(&mp->m_sb))
> +		return true;
> +
> +	if (c && l >= f)
> +		active = l - f + 1;
> +	else if (c)
> +		active = agfl_size - f + l + 1;

	else
		active = 0;

To move all the initialisation of active into the one logic
statement and not make it dependent on the original value in the
AGF?

> +	if (active != c)
> +		return false;
> +	if (f >= agfl_size || l >= agfl_size)
> +		return false;

Shouldn't we be range checking these first? Also should probably
checking c the same way.

> +
> +	return true;
> +}
> +
> +static void
> +xfs_agfl_reset(
> +	struct xfs_trans	*tp,
> +	struct xfs_buf		*agbp,
> +	struct xfs_perag	*pag)
> +{
> +	struct xfs_mount	*mp = tp->t_mountp;
> +	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
> +
> +	if (!pag->pagf_needreset)
> +		return;
> +
> +	trace_xfs_agfl_reset(pag);
> +	xfs_warn(mp, "agfl reset agno %u flcount %d", pag->pag_agno,
> +		 pag->pagf_flcount);
> +
> +	agf->agf_flfirst = 0;
> +	agf->agf_fllast = cpu_to_be32(XFS_AGFL_SIZE(mp) - 1);
> +	agf->agf_flcount = 0;
> +	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
> +				    XFS_AGF_FLCOUNT);
> +
> +	pag->pagf_flcount = 0;
> +	pag->pagf_needreset = false;
> +}
> +
>  /*
>   * Decide whether to use this allocation group for this allocation.
>   * If so, fix up the btree freelist's size.
> @@ -2119,6 +2172,9 @@ xfs_alloc_fix_freelist(
>  	if (!xfs_alloc_space_available(args, need, flags))
>  		goto out_agbp_relse;
>  
> +	if (pag->pagf_needreset)
> +		xfs_agfl_reset(tp, agbp, pag);
> +
>  	/*
>  	 * Make the freelist shorter if it's too long.
>  	 *
> @@ -2588,6 +2644,7 @@ xfs_alloc_read_agf(
>  		pag->pagb_count = 0;
>  		pag->pagb_tree = RB_ROOT;
>  		pag->pagf_init = 1;
> +		pag->pagf_needreset = !xfs_agf_verify_flcount(mp, agf);
>  	}
>  #ifdef DEBUG
>  	else if (!XFS_FORCED_SHUTDOWN(mp)) {
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index e0792d036be2..5dd36920b7d6 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -353,6 +353,7 @@ typedef struct xfs_perag {
>  	char		pagi_inodeok;	/* The agi is ok for inodes */
>  	uint8_t		pagf_levels[XFS_BTNUM_AGF];
>  					/* # of levels in bno & cnt btree */
> +	bool		pagf_needreset;
>  	uint32_t	pagf_flcount;	/* count of blocks in freelist */
>  	xfs_extlen_t	pagf_freeblks;	/* total free blocks */
>  	xfs_extlen_t	pagf_longest;	/* longest free space */
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index 945de08af7ba..678d602dc400 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -3339,6 +3339,24 @@ TRACE_EVENT(xfs_trans_resv_calc,
>  		  __entry->logflags)
>  );
>  
> +TRACE_EVENT(xfs_agfl_reset,
> +	TP_PROTO(struct xfs_perag *pag),
> +	TP_ARGS(pag),
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__field(xfs_agnumber_t, agno)
> +		__field(int, flcount)
> +	),
> +	TP_fast_assign(
> +		__entry->dev = pag->pag_mount->m_super->s_dev;
> +		__entry->agno = pag->pag_agno;
> +		__entry->flcount = pag->pagf_flcount;
> +	),
> +	TP_printk("dev %d:%d agno %u flcount %d",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->agno, __entry->flcount)
> +);

Do we need a new tracepoint? Wouldn't it be better to just
call trace_xfs_agf() which will dump all the information in the AGF
just before we do the reset? We'll know it's a reset from the caller
information that is dumped by that tracepoint....

> +
>  #endif /* _TRACE_XFS_H */
>  
>  #undef TRACE_INCLUDE_PATH
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
Brian Foster March 13, 2018, 11:27 a.m. UTC | #2
On Tue, Mar 13, 2018 at 08:14:55AM +1100, Dave Chinner wrote:
> On Mon, Mar 12, 2018 at 01:35:53PM -0400, Brian Foster wrote:
> > On Mon, Mar 12, 2018 at 09:11:58AM -0400, Brian Foster wrote:
> > > On Fri, Mar 09, 2018 at 02:37:57PM -0800, Darrick J. Wong wrote:
> > > > On Fri, Mar 09, 2018 at 04:20:31PM -0500, Brian Foster wrote:
> > > > > On Fri, Mar 09, 2018 at 11:08:32AM -0800, Darrick J. Wong wrote:
> > > > > > On Fri, Mar 09, 2018 at 01:37:28PM -0500, Brian Foster wrote:
> > > > > > > On Fri, Mar 09, 2018 at 09:33:18AM -0800, Darrick J. Wong wrote:
> > ...
> > > > <nod> I've been mulling over rewriting your previous rfc patch that put
> > > > all the scanning/lifting in {get,put}_freelist but having it reset the
> > > > agfl instead of doing the swizzling stuff.
> > > > 
> > > 
> > > Something to be careful of... emptying the agfl obviously means the
> > > subsequent fixup is a btree block allocation. That may limit the context
> > > of where the fixup can be performed. IOW, deferring it to
> > > _get_freelist() might no longer be safe. Instead, I think we'd have to
> > > implement it such that the on-disk flcount is completely untrusted when
> > > the mismatch is detected.
> > > 
> > ...
> > 
> > Here's a variant of that patch that does a reset. It's definitely much
> > simpler. Thoughts?
> 
> I like it - it is so much simpler than the other proposals, and it's
> done on demand rather than by a brute-force mount/unmount scan.
> 
> > Brian
> > 
> > --- 8< ---
> > 
> > diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
> > index c02781a4c091..7d313bb4677d 100644
> > --- a/fs/xfs/libxfs/xfs_alloc.c
> > +++ b/fs/xfs/libxfs/xfs_alloc.c
> > @@ -2053,6 +2053,59 @@ xfs_alloc_space_available(
> >  	return true;
> >  }
> >  
> > +static bool
> > +xfs_agf_verify_flcount(
> > +	struct xfs_mount	*mp,
> > +	struct xfs_agf		*agf)
> 
> Needs a comment explaining what it's checking and the return value.
> Might actually read better as "xfs_agf_need_flcount_reset()"
> returning true if fixup is required, especially at the caller
> site...
> 

I changed it from something like that (see the original patch), but more
just because I expected the simpler logic would now cover more general
things than just the padding issue rather than any technical reason. The
_need_reset() name probably accomplishes the same thing so I'll go back
to the earlier logic.

I plan to add some actual comments and fix up the warning message into
something more usable.

> > +{
> > +	int			f = be32_to_cpu(agf->agf_flfirst);
> > +	int			l = be32_to_cpu(agf->agf_fllast);
> > +	int			c = be32_to_cpu(agf->agf_flcount);
> 
> These should probably be uint32_t - they are unsigned on disk.
> 

Ok.

> > +	int			active = c;
> > +	int			agfl_size = XFS_AGFL_SIZE(mp);
> > +
> > +	if (!xfs_sb_version_hascrc(&mp->m_sb))
> > +		return true;
> > +
> > +	if (c && l >= f)
> > +		active = l - f + 1;
> > +	else if (c)
> > +		active = agfl_size - f + l + 1;
> 
> 	else
> 		active = 0;
> 
> To move all the initialisation of active into the one logic
> statement and not make it dependent on the original value in the
> AGF?
> 

I think that should work.

> > +	if (active != c)
> > +		return false;
> > +	if (f >= agfl_size || l >= agfl_size)
> > +		return false;
> 
> Shouldn't we be range checking these first? Also should probably
> checking c the same way.
> 

I'll move the l/f checks to before the active calculation.

We can also add a check for c > agfl_size but note that technically
these are superfluous checks because they are checked in the read
verifier. The f/l range checks were there to cover the case of a packed
-> unpacked conversion. That would also require a downstream read
verifier tweak to handle a +1 sized valid agfl range. The verifier tweak
is not included here because going backwards as such is not something we
want to support upstream.

IOW, the read verifier dictates a subset of general agfl corruption that
this fixup is allowed to handle. All that said, the extra checks don't
hurt and it's probably smart to consistently check the fields we're
going to feed into calculations. I'll add a count check with a comment
for added context.

> > +
> > +	return true;
> > +}
> > +
...
> > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > index 945de08af7ba..678d602dc400 100644
> > --- a/fs/xfs/xfs_trace.h
> > +++ b/fs/xfs/xfs_trace.h
> > @@ -3339,6 +3339,24 @@ TRACE_EVENT(xfs_trans_resv_calc,
> >  		  __entry->logflags)
> >  );
> >  
> > +TRACE_EVENT(xfs_agfl_reset,
> > +	TP_PROTO(struct xfs_perag *pag),
> > +	TP_ARGS(pag),
> > +	TP_STRUCT__entry(
> > +		__field(dev_t, dev)
> > +		__field(xfs_agnumber_t, agno)
> > +		__field(int, flcount)
> > +	),
> > +	TP_fast_assign(
> > +		__entry->dev = pag->pag_mount->m_super->s_dev;
> > +		__entry->agno = pag->pag_agno;
> > +		__entry->flcount = pag->pagf_flcount;
> > +	),
> > +	TP_printk("dev %d:%d agno %u flcount %d",
> > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > +		  __entry->agno, __entry->flcount)
> > +);
> 
> Do we need a new tracepoint? Wouldn't it be better to just
> call trace_xfs_agf() which will dump all the information in the AGF
> just before we do the reset? We'll know it's a reset from the caller
> information that is dumped by that tracepoint....
> 

Yeah, I missed that tracepoint. I may see if I can turn that into an
event class so we can still use a unique name. trace_xfs_agf() looks
like it refers to logging the agf which is already triggered by this
path after we reset the agfl pointers. Otherwise I'll just add a
pre-reset call so we have the original field values.

Thanks for the feedback..

Brian

> > +
> >  #endif /* _TRACE_XFS_H */
> >  
> >  #undef TRACE_INCLUDE_PATH
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > 
> 
> -- 
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chiluk March 14, 2018, 3:07 a.m. UTC | #3
On Mon, Mar 12, 2018 at 12:35 PM, Brian Foster <bfoster@redhat.com> wrote:
>
> On Mon, Mar 12, 2018 at 09:11:58AM -0400, Brian Foster wrote:
> Here's a variant of that patch that does a reset. It's definitely much
> simpler. Thoughts?
>
> Brian
>
> --- 8< ---
>
> diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
> index c02781a4c091..7d313bb4677d 100644
> --- a/fs/xfs/libxfs/xfs_alloc.c
> +++ b/fs/xfs/libxfs/xfs_alloc.c
> @@ -2053,6 +2053,59 @@ xfs_alloc_space_available(
>         return true;
>  }
>
> +static bool
> +xfs_agf_verify_flcount(
> +       struct xfs_mount        *mp,
> +       struct xfs_agf          *agf)
> +{
> +       int                     f = be32_to_cpu(agf->agf_flfirst);
> +       int                     l = be32_to_cpu(agf->agf_fllast);
> +       int                     c = be32_to_cpu(agf->agf_flcount);
> +       int                     active = c;
> +       int                     agfl_size = XFS_AGFL_SIZE(mp);
> +
> +       if (!xfs_sb_version_hascrc(&mp->m_sb))
> +               return true;
> +
> +       if (c && l >= f)
> +               active = l - f + 1;
> +       else if (c)
> +               active = agfl_size - f + l + 1;
> +
> +       if (active != c)
> +               return false;
> +       if (f >= agfl_size || l >= agfl_size)
> +               return false;
> +
> +       return true;
> +}
> +
> +static void
> +xfs_agfl_reset(
> +       struct xfs_trans        *tp,
> +       struct xfs_buf          *agbp,
> +       struct xfs_perag        *pag)
> +{
> +       struct xfs_mount        *mp = tp->t_mountp;
> +       struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
> +
> +       if (!pag->pagf_needreset)
> +               return;
> +
> +       trace_xfs_agfl_reset(pag);
> +       xfs_warn(mp, "agfl reset agno %u flcount %d", pag->pag_agno,
> +                pag->pagf_flcount);
> +

Before completely leaking the entirety of the agfl couldn't we nicely
release and recover all blocks but the 119th first?  That way we'd
only be leaking the possibly problematic 119th item?  I understand we
would lose the benefit of being able to recover from otherwise corrupt
AGFLs.

If we are going to blindly leak blocks wouldn't an xfs_repair recover
these leaked blocks?  I think it would be perfectly fine to leak these
blocks if it means not crashing and then recover them at one's
convenience with an xfs_repair.

> +       agf->agf_flfirst = 0;
> +       agf->agf_fllast = cpu_to_be32(XFS_AGFL_SIZE(mp) - 1);
> +       agf->agf_flcount = 0;

Also I was under the impression that we should pre-allocate blocks in
the agfl for fast allocation of free b+tree nodes.  Wouldn't we want
to pre-allocate some blocks as would be done by xfs_repair (I have a
feeling someone is going to tell me where this happens elsewhere in
the codebase or can be handled at block run time with little ill
effect)?

If I'm correct in either case I'd appreciate a
Reviewed by: Dave Chiluk <chiluk+linuxxfs@indeed.com>

Thanks,
Dave


> +       xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
> +                                   XFS_AGF_FLCOUNT);
> +
> +       pag->pagf_flcount = 0;
> +       pag->pagf_needreset = false;
> +}
> +
>  /*
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Brian Foster March 14, 2018, 11:02 a.m. UTC | #4
On Tue, Mar 13, 2018 at 10:07:27PM -0500, Dave Chiluk wrote:
> On Mon, Mar 12, 2018 at 12:35 PM, Brian Foster <bfoster@redhat.com> wrote:
> >
> > On Mon, Mar 12, 2018 at 09:11:58AM -0400, Brian Foster wrote:
> > Here's a variant of that patch that does a reset. It's definitely much
> > simpler. Thoughts?
> >
> > Brian
> >
> > --- 8< ---
> >
> > diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
> > index c02781a4c091..7d313bb4677d 100644
> > --- a/fs/xfs/libxfs/xfs_alloc.c
> > +++ b/fs/xfs/libxfs/xfs_alloc.c
> > @@ -2053,6 +2053,59 @@ xfs_alloc_space_available(
> >         return true;
> >  }
> >
> > +static bool
> > +xfs_agf_verify_flcount(
> > +       struct xfs_mount        *mp,
> > +       struct xfs_agf          *agf)
> > +{
> > +       int                     f = be32_to_cpu(agf->agf_flfirst);
> > +       int                     l = be32_to_cpu(agf->agf_fllast);
> > +       int                     c = be32_to_cpu(agf->agf_flcount);
> > +       int                     active = c;
> > +       int                     agfl_size = XFS_AGFL_SIZE(mp);
> > +
> > +       if (!xfs_sb_version_hascrc(&mp->m_sb))
> > +               return true;
> > +
> > +       if (c && l >= f)
> > +               active = l - f + 1;
> > +       else if (c)
> > +               active = agfl_size - f + l + 1;
> > +
> > +       if (active != c)
> > +               return false;
> > +       if (f >= agfl_size || l >= agfl_size)
> > +               return false;
> > +
> > +       return true;
> > +}
> > +
> > +static void
> > +xfs_agfl_reset(
> > +       struct xfs_trans        *tp,
> > +       struct xfs_buf          *agbp,
> > +       struct xfs_perag        *pag)
> > +{
> > +       struct xfs_mount        *mp = tp->t_mountp;
> > +       struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
> > +
> > +       if (!pag->pagf_needreset)
> > +               return;
> > +
> > +       trace_xfs_agfl_reset(pag);
> > +       xfs_warn(mp, "agfl reset agno %u flcount %d", pag->pag_agno,
> > +                pag->pagf_flcount);
> > +
> 
> Before completely leaking the entirety of the agfl couldn't we nicely
> release and recover all blocks but the 119th first?  That way we'd
> only be leaking the possibly problematic 119th item?  I understand we
> would lose the benefit of being able to recover from otherwise corrupt
> AGFLs.
> 

This is mostly covered in the discussions over the previously explored
methods. The synopsis is that yes, we could try to do something like
that, but the point of this approach is that we don't have to trust the
agfl content at all. This simplifies and genericizes the logic because
every kernel already knows how to populate a sane agfl from an empty
one.

> If we are going to blindly leak blocks wouldn't an xfs_repair recover
> these leaked blocks?  I think it would be perfectly fine to leak these
> blocks if it means not crashing and then recover them at one's
> convenience with an xfs_repair.
> 

Yes, an xfs_repair is necessary. An xfs_repair was already necessary to
fix the padding mismatch or whatever else might have been wrong. This
changes the side effect of the problem from a crash into a free space
accounting inconsistency.

> > +       agf->agf_flfirst = 0;
> > +       agf->agf_fllast = cpu_to_be32(XFS_AGFL_SIZE(mp) - 1);
> > +       agf->agf_flcount = 0;
> 
> Also I was under the impression that we should pre-allocate blocks in
> the agfl for fast allocation of free b+tree nodes.  Wouldn't we want
> to pre-allocate some blocks as would be done by xfs_repair (I have a
> feeling someone is going to tell me where this happens elsewhere in
> the codebase or can be handled at block run time with little ill
> effect)?
> 

The function that calls the reset (xfs_alloc_fix_freelist()) will
repopulate it once it sees that it is empty.

> If I'm correct in either case I'd appreciate a
> Reviewed by: Dave Chiluk <chiluk+linuxxfs@indeed.com>
> 

I'm going to defer this until posting a legitimate patch because it has
changed a bit (though not fundamentally). This post was more of a first
pass to sanity check the idea. I'd appreciate another look once a
legitimate v1 is posted.. thanks!

Brian

> Thanks,
> Dave
> 
> 
> > +       xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
> > +                                   XFS_AGF_FLCOUNT);
> > +
> > +       pag->pagf_flcount = 0;
> > +       pag->pagf_needreset = false;
> > +}
> > +
> >  /*
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong March 14, 2018, 3:28 p.m. UTC | #5
On Wed, Mar 14, 2018 at 07:02:33AM -0400, Brian Foster wrote:
> On Tue, Mar 13, 2018 at 10:07:27PM -0500, Dave Chiluk wrote:
> > On Mon, Mar 12, 2018 at 12:35 PM, Brian Foster <bfoster@redhat.com> wrote:
> > >
> > > On Mon, Mar 12, 2018 at 09:11:58AM -0400, Brian Foster wrote:
> > > Here's a variant of that patch that does a reset. It's definitely much
> > > simpler. Thoughts?
> > >
> > > Brian
> > >
> > > --- 8< ---
> > >
> > > diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
> > > index c02781a4c091..7d313bb4677d 100644
> > > --- a/fs/xfs/libxfs/xfs_alloc.c
> > > +++ b/fs/xfs/libxfs/xfs_alloc.c
> > > @@ -2053,6 +2053,59 @@ xfs_alloc_space_available(
> > >         return true;
> > >  }
> > >
> > > +static bool
> > > +xfs_agf_verify_flcount(
> > > +       struct xfs_mount        *mp,
> > > +       struct xfs_agf          *agf)
> > > +{
> > > +       int                     f = be32_to_cpu(agf->agf_flfirst);
> > > +       int                     l = be32_to_cpu(agf->agf_fllast);
> > > +       int                     c = be32_to_cpu(agf->agf_flcount);
> > > +       int                     active = c;
> > > +       int                     agfl_size = XFS_AGFL_SIZE(mp);
> > > +
> > > +       if (!xfs_sb_version_hascrc(&mp->m_sb))
> > > +               return true;
> > > +
> > > +       if (c && l >= f)
> > > +               active = l - f + 1;
> > > +       else if (c)
> > > +               active = agfl_size - f + l + 1;
> > > +
> > > +       if (active != c)
> > > +               return false;
> > > +       if (f >= agfl_size || l >= agfl_size)
> > > +               return false;
> > > +
> > > +       return true;
> > > +}
> > > +
> > > +static void
> > > +xfs_agfl_reset(
> > > +       struct xfs_trans        *tp,
> > > +       struct xfs_buf          *agbp,
> > > +       struct xfs_perag        *pag)
> > > +{
> > > +       struct xfs_mount        *mp = tp->t_mountp;
> > > +       struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
> > > +
> > > +       if (!pag->pagf_needreset)
> > > +               return;
> > > +
> > > +       trace_xfs_agfl_reset(pag);
> > > +       xfs_warn(mp, "agfl reset agno %u flcount %d", pag->pag_agno,
> > > +                pag->pagf_flcount);
> > > +
> > 
> > Before completely leaking the entirety of the agfl couldn't we nicely
> > release and recover all blocks but the 119th first?  That way we'd
> > only be leaking the possibly problematic 119th item?  I understand we
> > would lose the benefit of being able to recover from otherwise corrupt
> > AGFLs.
> > 
> 
> This is mostly covered in the discussions over the previously explored
> methods. The synopsis is that yes, we could try to do something like
> that, but the point of this approach is that we don't have to trust the
> agfl content at all. This simplifies and genericizes the logic because
> every kernel already knows how to populate a sane agfl from an empty
> one.
> 
> > If we are going to blindly leak blocks wouldn't an xfs_repair recover
> > these leaked blocks?  I think it would be perfectly fine to leak these
> > blocks if it means not crashing and then recover them at one's
> > convenience with an xfs_repair.
> > 
> 
> Yes, an xfs_repair is necessary. An xfs_repair was already necessary to
> fix the padding mismatch or whatever else might have been wrong. This
> changes the side effect of the problem from a crash into a free space
> accounting inconsistency.
> 
> > > +       agf->agf_flfirst = 0;
> > > +       agf->agf_fllast = cpu_to_be32(XFS_AGFL_SIZE(mp) - 1);
> > > +       agf->agf_flcount = 0;
> > 
> > Also I was under the impression that we should pre-allocate blocks in
> > the agfl for fast allocation of free b+tree nodes.  Wouldn't we want
> > to pre-allocate some blocks as would be done by xfs_repair (I have a
> > feeling someone is going to tell me where this happens elsewhere in
> > the codebase or can be handled at block run time with little ill
> > effect)?
> > 
> 
> The function that calls the reset (xfs_alloc_fix_freelist()) will
> repopulate it once it sees that it is empty.
> 
> > If I'm correct in either case I'd appreciate a
> > Reviewed by: Dave Chiluk <chiluk+linuxxfs@indeed.com>
> > 
> 
> I'm going to defer this until posting a legitimate patch because it has
> changed a bit (though not fundamentally). This post was more of a first
> pass to sanity check the idea. I'd appreciate another look once a
> legitimate v1 is posted.. thanks!

While you're at it, I already applied the xfs_agfl_size decapitalization
to for-next, so please do that here too.

--D

> Brian
> 
> > Thanks,
> > Dave
> > 
> > 
> > > +       xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
> > > +                                   XFS_AGF_FLCOUNT);
> > > +
> > > +       pag->pagf_flcount = 0;
> > > +       pag->pagf_needreset = false;
> > > +}
> > > +
> > >  /*
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch
diff mbox

diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index c02781a4c091..7d313bb4677d 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2053,6 +2053,59 @@  xfs_alloc_space_available(
 	return true;
 }
 
+static bool
+xfs_agf_verify_flcount(
+	struct xfs_mount	*mp,
+	struct xfs_agf		*agf)
+{
+	int			f = be32_to_cpu(agf->agf_flfirst);
+	int			l = be32_to_cpu(agf->agf_fllast);
+	int			c = be32_to_cpu(agf->agf_flcount);
+	int			active = c;
+	int			agfl_size = XFS_AGFL_SIZE(mp);
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return true;
+
+	if (c && l >= f)
+		active = l - f + 1;
+	else if (c)
+		active = agfl_size - f + l + 1;
+
+	if (active != c)
+		return false;
+	if (f >= agfl_size || l >= agfl_size)
+		return false;
+
+	return true;
+}
+
+static void
+xfs_agfl_reset(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*agbp,
+	struct xfs_perag	*pag)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+
+	if (!pag->pagf_needreset)
+		return;
+
+	trace_xfs_agfl_reset(pag);
+	xfs_warn(mp, "agfl reset agno %u flcount %d", pag->pag_agno,
+		 pag->pagf_flcount);
+
+	agf->agf_flfirst = 0;
+	agf->agf_fllast = cpu_to_be32(XFS_AGFL_SIZE(mp) - 1);
+	agf->agf_flcount = 0;
+	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
+				    XFS_AGF_FLCOUNT);
+
+	pag->pagf_flcount = 0;
+	pag->pagf_needreset = false;
+}
+
 /*
  * Decide whether to use this allocation group for this allocation.
  * If so, fix up the btree freelist's size.
@@ -2119,6 +2172,9 @@  xfs_alloc_fix_freelist(
 	if (!xfs_alloc_space_available(args, need, flags))
 		goto out_agbp_relse;
 
+	if (pag->pagf_needreset)
+		xfs_agfl_reset(tp, agbp, pag);
+
 	/*
 	 * Make the freelist shorter if it's too long.
 	 *
@@ -2588,6 +2644,7 @@  xfs_alloc_read_agf(
 		pag->pagb_count = 0;
 		pag->pagb_tree = RB_ROOT;
 		pag->pagf_init = 1;
+		pag->pagf_needreset = !xfs_agf_verify_flcount(mp, agf);
 	}
 #ifdef DEBUG
 	else if (!XFS_FORCED_SHUTDOWN(mp)) {
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index e0792d036be2..5dd36920b7d6 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -353,6 +353,7 @@  typedef struct xfs_perag {
 	char		pagi_inodeok;	/* The agi is ok for inodes */
 	uint8_t		pagf_levels[XFS_BTNUM_AGF];
 					/* # of levels in bno & cnt btree */
+	bool		pagf_needreset;
 	uint32_t	pagf_flcount;	/* count of blocks in freelist */
 	xfs_extlen_t	pagf_freeblks;	/* total free blocks */
 	xfs_extlen_t	pagf_longest;	/* longest free space */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 945de08af7ba..678d602dc400 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3339,6 +3339,24 @@  TRACE_EVENT(xfs_trans_resv_calc,
 		  __entry->logflags)
 );
 
+TRACE_EVENT(xfs_agfl_reset,
+	TP_PROTO(struct xfs_perag *pag),
+	TP_ARGS(pag),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(int, flcount)
+	),
+	TP_fast_assign(
+		__entry->dev = pag->pag_mount->m_super->s_dev;
+		__entry->agno = pag->pag_agno;
+		__entry->flcount = pag->pagf_flcount;
+	),
+	TP_printk("dev %d:%d agno %u flcount %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno, __entry->flcount)
+);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH