diff mbox

[2/4] xfs: improve handling of busy extents in the low-level allocator

Message ID 1485715421-17182-3-git-send-email-hch@lst.de (mailing list archive)
State Superseded, archived
Headers show

Commit Message

Christoph Hellwig Jan. 29, 2017, 6:43 p.m. UTC
Currently we force the log and simply try again if we hit a busy extent,
but especially with online discard enabled it might take a while after
the log force for the busy extents to disappear, and we might have
already completed our second pass.

So instead we add a new waitqueue and a generation counter to the pag
structure so that we can do wakeups once we've removed busy extents,
and we replace the single retry with an unconditional one - after
all we hold the AGF buffer lock, so no other allocations or frees
can be racing with us in this AG.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_alloc.c | 93 +++++++++++++++++++++++---------------------
 fs/xfs/xfs_extent_busy.c  | 98 ++++++++++++++++++++++++++++++++++-------------
 fs/xfs/xfs_extent_busy.h  |  8 +++-
 fs/xfs/xfs_mount.c        |  1 +
 fs/xfs/xfs_mount.h        |  2 +
 5 files changed, 129 insertions(+), 73 deletions(-)

Comments

Brian Foster Feb. 3, 2017, 3:22 p.m. UTC | #1
On Sun, Jan 29, 2017 at 07:43:39PM +0100, Christoph Hellwig wrote:
> Currently we force the log and simply try again if we hit a busy extent,
> but especially with online discard enabled it might take a while after
> the log force for the busy extents to disappear, and we might have
> already completed our second pass.
> 
> So instead we add a new waitqueue and a generation counter to the pag
> structure so that we can do wakeups once we've removed busy extents,
> and we replace the single retry with an unconditional one - after
> all we hold the AGF buffer lock, so no other allocations or frees
> can be racing with us in this AG.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/libxfs/xfs_alloc.c | 93 +++++++++++++++++++++++---------------------
>  fs/xfs/xfs_extent_busy.c  | 98 ++++++++++++++++++++++++++++++++++-------------
>  fs/xfs/xfs_extent_busy.h  |  8 +++-
>  fs/xfs/xfs_mount.c        |  1 +
>  fs/xfs/xfs_mount.h        |  2 +
>  5 files changed, 129 insertions(+), 73 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
> index 9f06a21..fe98fbc 100644
> --- a/fs/xfs/libxfs/xfs_alloc.c
> +++ b/fs/xfs/libxfs/xfs_alloc.c
...
> @@ -1183,8 +1194,8 @@ xfs_alloc_ag_vextent_near(
>  			if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
>  				goto error0;
>  			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
> -			xfs_alloc_compute_aligned(args, ltbno, ltlen,
> -						  &ltbnoa, &ltlena);
> +			busy |= xfs_alloc_compute_aligned(args, ltbno, ltlen,
> +					&ltbnoa, &ltlena, &busy_gen);
>  			if (ltlena >= args->minlen && ltbnoa >= args->min_agbno)
>  				break;
>  			if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
> @@ -1199,8 +1210,8 @@ xfs_alloc_ag_vextent_near(
>  			if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
>  				goto error0;
>  			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
> -			xfs_alloc_compute_aligned(args, gtbno, gtlen,
> -						  &gtbnoa, &gtlena);
> +			busy |= xfs_alloc_compute_aligned(args, gtbno, gtlen,
> +					&gtbnoa, &gtlena, &busy_gen);

Not a big deal, but perhaps in the above two cases where we're
traversing the bnobt, just track the max busy gen and use that being set
non-zero to trigger (hopefully) fewer flushes rather than being subject
to whatever the last value was? Then we don't have to do the 'busy |=
..' thing either. That doesn't cover the overflow case, but that should
be rare and we still have the retry.

>  			if (gtlena >= args->minlen && gtbnoa <= args->max_agbno)
>  				break;
>  			if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
...
> diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
> index 29c2f99..8251359 100644
> --- a/fs/xfs/xfs_extent_busy.c
> +++ b/fs/xfs/xfs_extent_busy.c
> @@ -334,14 +334,18 @@ xfs_extent_busy_reuse(
>   * subset of the extent that is not busy.  If *rlen is smaller than
>   * args->minlen no suitable extent could be found, and the higher level
>   * code needs to force out the log and retry the allocation.
> + *
> + * Return the current discard generation for the AG if the file system
> + * has online discard enabled.  This value can be used to wait for
> + * the trimmed extent to become fully available if the AG is running out
> + * of space.
>   */
> -void
> +bool
>  xfs_extent_busy_trim(
>  	struct xfs_alloc_arg	*args,
> -	xfs_agblock_t		bno,
> -	xfs_extlen_t		len,
> -	xfs_agblock_t		*rbno,
> -	xfs_extlen_t		*rlen)
> +	xfs_agblock_t		*bno,
> +	xfs_extlen_t		*len,
> +	unsigned		*busy_gen)
>  {
>  	xfs_agblock_t		fbno;
>  	xfs_extlen_t		flen;
> @@ -351,8 +355,8 @@ xfs_extent_busy_trim(
>  
>  	spin_lock(&args->pag->pagb_lock);
>  restart:
> -	fbno = bno;
> -	flen = len;
> +	fbno = *bno;
> +	flen = *len;
>  	rbp = args->pag->pagb_tree.rb_node;
>  	while (rbp && flen >= args->minlen) {
>  		struct xfs_extent_busy *busyp =
> @@ -504,24 +508,25 @@ xfs_extent_busy_trim(
>  
>  		flen = fend - fbno;
>  	}
> +out:
>  	spin_unlock(&args->pag->pagb_lock);
>  
> -	if (fbno != bno || flen != len) {
> -		trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len,
> +	if (fbno != *bno || flen != *len) {
> +		trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len,
>  					  fbno, flen);
> +		*bno = fbno;
> +		*len = flen;
> +		*busy_gen = args->pag->pagb_gen;
> +		return true;

We've already dropped pagb_lock by the time we grab pagb_gen. What
prevents this from racing with a flush and pagb_gen bump and returning a
gen value that might not have any associated busy extents?

>  	}
> -	*rbno = fbno;
> -	*rlen = flen;
> -	return;
> +	return false;
>  fail:
>  	/*
>  	 * Return a zero extent length as failure indications.  All callers
>  	 * re-check if the trimmed extent satisfies the minlen requirement.
>  	 */
> -	spin_unlock(&args->pag->pagb_lock);
> -	trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
> -	*rbno = fbno;
> -	*rlen = 0;
> +	flen = 0;
> +	goto out;
>  }
>  
>  STATIC void
...
> @@ -554,29 +574,53 @@ xfs_extent_busy_clear(
...
> +/*
> + * Flush out all busy extents for this AG.
> + */
> +void
> +xfs_extent_busy_flush(
> +	struct xfs_mount	*mp,
> +	struct xfs_perag	*pag,
> +	unsigned		busy_gen)
> +{
> +	DEFINE_WAIT		(wait);
> +	int			log_flushed = 0, error;
> +
> +	trace_xfs_log_force(mp, 0, _THIS_IP_);
> +	error = _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed);
> +	if (error)
> +		return;
> +
> +	while (busy_gen == READ_ONCE(pag->pagb_gen)) {
> +		prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
> +		schedule();
>  	}
> +	finish_wait(&pag->pagb_wait, &wait);

This seems racy. Shouldn't this do something like:

	do {
		prepare_to_wait();
		if (busy_gen != pagb_gen)
			break;
		schedule();
		finish_wait();
	} while (1);
	finish_wait();

... to make sure we don't lose a wakeup between setting the task state
and actually scheduling out?

>  }
>  
>  /*
...
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 7f351f7..7363499 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -384,6 +384,8 @@ typedef struct xfs_perag {
>  	xfs_agino_t	pagl_rightrec;
>  	spinlock_t	pagb_lock;	/* lock for pagb_tree */
>  	struct rb_root	pagb_tree;	/* ordered tree of busy extents */
> +	unsigned int	pagb_gen;
> +	wait_queue_head_t pagb_wait;

Can we add some comments here similar to the other fields? Also, how
about slightly more informative names... pagb_discard_[gen|wait], or
pagb_busy_*?

Brian

>  
>  	atomic_t        pagf_fstrms;    /* # of filestreams active in this AG */
>  
> -- 
> 2.1.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Brian Foster Feb. 3, 2017, 4:22 p.m. UTC | #2
On Sun, Jan 29, 2017 at 07:43:39PM +0100, Christoph Hellwig wrote:
> Currently we force the log and simply try again if we hit a busy extent,
> but especially with online discard enabled it might take a while after
> the log force for the busy extents to disappear, and we might have
> already completed our second pass.
> 
> So instead we add a new waitqueue and a generation counter to the pag
> structure so that we can do wakeups once we've removed busy extents,
> and we replace the single retry with an unconditional one - after
> all we hold the AGF buffer lock, so no other allocations or frees
> can be racing with us in this AG.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/libxfs/xfs_alloc.c | 93 +++++++++++++++++++++++---------------------
>  fs/xfs/xfs_extent_busy.c  | 98 ++++++++++++++++++++++++++++++++++-------------
>  fs/xfs/xfs_extent_busy.h  |  8 +++-
>  fs/xfs/xfs_mount.c        |  1 +
>  fs/xfs/xfs_mount.h        |  2 +
>  5 files changed, 129 insertions(+), 73 deletions(-)
> 
...
> diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
> index 29c2f99..8251359 100644
> --- a/fs/xfs/xfs_extent_busy.c
> +++ b/fs/xfs/xfs_extent_busy.c
...
> @@ -554,29 +574,53 @@ xfs_extent_busy_clear(
>  	struct xfs_extent_busy	*busyp, *n;
>  	struct xfs_perag	*pag = NULL;
>  	xfs_agnumber_t		agno = NULLAGNUMBER;
> +	bool			wakeup = false;
>  
>  	list_for_each_entry_safe(busyp, n, list, list) {
>  		if (busyp->agno != agno) {
> -			if (pag) {
> -				spin_unlock(&pag->pagb_lock);
> -				xfs_perag_put(pag);
> -			}
> -			pag = xfs_perag_get(mp, busyp->agno);
> -			spin_lock(&pag->pagb_lock);
> +			if (pag)
> +				xfs_extent_busy_put_pag(pag, wakeup);
>  			agno = busyp->agno;
> +			pag = xfs_perag_get(mp, agno);
> +			spin_lock(&pag->pagb_lock);
> +			wakeup = false;
>  		}
>  
>  		if (do_discard && busyp->length &&
> -		    !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD))
> +		    !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) {
>  			busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
> -		else
> +		} else {
>  			xfs_extent_busy_clear_one(mp, pag, busyp);
> +			wakeup = true;
> +		}

I didn't catch this until looking through everything after the next
patch, but I think there's a problem with the wakeup here as well. If we
have a busy extent with XFS_EXTENT_BUSY_SKIP_DISCARD set, we immediately
issue a wake from the first xfs_extent_busy_clear() in the cil committed
handler, regardless of whether !SKIP_DISCARD extents exist as well
under the current gen value. I think that means we'd get a premature
wake any time a busy_list has at least one of each type..?

Brian

>  	}
>  
> -	if (pag) {
> -		spin_unlock(&pag->pagb_lock);
> -		xfs_perag_put(pag);
> +	if (pag)
> +		xfs_extent_busy_put_pag(pag, wakeup);
> +}
> +
> +/*
> + * Flush out all busy extents for this AG.
> + */
> +void
> +xfs_extent_busy_flush(
> +	struct xfs_mount	*mp,
> +	struct xfs_perag	*pag,
> +	unsigned		busy_gen)
> +{
> +	DEFINE_WAIT		(wait);
> +	int			log_flushed = 0, error;
> +
> +	trace_xfs_log_force(mp, 0, _THIS_IP_);
> +	error = _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed);
> +	if (error)
> +		return;
> +
> +	while (busy_gen == READ_ONCE(pag->pagb_gen)) {
> +		prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
> +		schedule();
>  	}
> +	finish_wait(&pag->pagb_wait, &wait);
>  }
>  
>  /*
> diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
> index bfff284..bcb99463 100644
> --- a/fs/xfs/xfs_extent_busy.h
> +++ b/fs/xfs/xfs_extent_busy.h
> @@ -58,9 +58,13 @@ void
>  xfs_extent_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno,
>  	xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata);
>  
> +bool
> +xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t *bno,
> +		xfs_extlen_t *len, unsigned *busy_gen);
> +
>  void
> -xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t bno,
> -	xfs_extlen_t len, xfs_agblock_t *rbno, xfs_extlen_t *rlen);
> +xfs_extent_busy_flush(struct xfs_mount *mp, struct xfs_perag *pag,
> +	unsigned discards);
>  
>  int
>  xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b);
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index 9b9540d..4e9feb1 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -215,6 +215,7 @@ xfs_initialize_perag(
>  		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
>  		if (xfs_buf_hash_init(pag))
>  			goto out_unwind;
> +		init_waitqueue_head(&pag->pagb_wait);
>  
>  		if (radix_tree_preload(GFP_NOFS))
>  			goto out_unwind;
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 7f351f7..7363499 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -384,6 +384,8 @@ typedef struct xfs_perag {
>  	xfs_agino_t	pagl_rightrec;
>  	spinlock_t	pagb_lock;	/* lock for pagb_tree */
>  	struct rb_root	pagb_tree;	/* ordered tree of busy extents */
> +	unsigned int	pagb_gen;
> +	wait_queue_head_t pagb_wait;
>  
>  	atomic_t        pagf_fstrms;    /* # of filestreams active in this AG */
>  
> -- 
> 2.1.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Feb. 4, 2017, 9:54 a.m. UTC | #3
On Fri, Feb 03, 2017 at 10:22:33AM -0500, Brian Foster wrote:
> Not a big deal, but perhaps in the above two cases where we're
> traversing the bnobt, just track the max busy gen and use that being set
> non-zero to trigger (hopefully) fewer flushes rather than being subject
> to whatever the last value was? Then we don't have to do the 'busy |=
> ..' thing either. That doesn't cover the overflow case, but that should
> be rare and we still have the retry.

It would hang for the overflow case, been there done that.  Note that
we only retry if we failed the allocation anyway, so it won't actually
trigger any fewer flushes either.

> > +out:
> >  	spin_unlock(&args->pag->pagb_lock);
> >  
> > -	if (fbno != bno || flen != len) {
> > -		trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len,
> > +	if (fbno != *bno || flen != *len) {
> > +		trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len,
> >  					  fbno, flen);
> > +		*bno = fbno;
> > +		*len = flen;
> > +		*busy_gen = args->pag->pagb_gen;
> > +		return true;
> 
> We've already dropped pagb_lock by the time we grab pagb_gen. What
> prevents this from racing with a flush and pagb_gen bump and returning a
> gen value that might not have any associated busy extents?

Good point.  I thought I had moved the lock around but obviously
didn't.  I'll fix it up for the next version.

> > +	while (busy_gen == READ_ONCE(pag->pagb_gen)) {
> > +		prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
> > +		schedule();
> >  	}
> > +	finish_wait(&pag->pagb_wait, &wait);
> 
> This seems racy. Shouldn't this do something like:
> 
> 	do {
> 		prepare_to_wait();
> 		if (busy_gen != pagb_gen)
> 			break;
> 		schedule();
> 		finish_wait();
> 	} while (1);
> 	finish_wait();
> 
> ... to make sure we don't lose a wakeup between setting the task state
> and actually scheduling out?

Yes, will fix.

> > +++ b/fs/xfs/xfs_mount.h
> > @@ -384,6 +384,8 @@ typedef struct xfs_perag {
> >  	xfs_agino_t	pagl_rightrec;
> >  	spinlock_t	pagb_lock;	/* lock for pagb_tree */
> >  	struct rb_root	pagb_tree;	/* ordered tree of busy extents */
> > +	unsigned int	pagb_gen;
> > +	wait_queue_head_t pagb_wait;
> 
> Can we add some comments here similar to the other fields?

Sure.

> Also, how
> about slightly more informative names... pagb_discard_[gen|wait], or
> pagb_busy_*?

That's what I had first - but:

 - pagb is the short name for the pag busy tree and I wanted to
   follow that convention.  And with the current series we also
   use the wakeup code for normal busy extents, even without discards.
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Feb. 4, 2017, 9:56 a.m. UTC | #4
On Fri, Feb 03, 2017 at 11:22:43AM -0500, Brian Foster wrote:
> I didn't catch this until looking through everything after the next
> patch, but I think there's a problem with the wakeup here as well. If we
> have a busy extent with XFS_EXTENT_BUSY_SKIP_DISCARD set, we immediately
> issue a wake from the first xfs_extent_busy_clear() in the cil committed
> handler, regardless of whether !SKIP_DISCARD extents exist as well
> under the current gen value. I think that means we'd get a premature
> wake any time a busy_list has at least one of each type..?

We'll need to wake as soon as a previously busy extent in the AG
becomes available, and that includes  XFS_EXTENT_BUSY_SKIP_DISCARD
ones.  Otherwise e.g. a transaction only containing
XFS_EXTENT_BUSY_SKIP_DISCARD will never wake at all.

--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Brian Foster Feb. 6, 2017, 4:47 p.m. UTC | #5
On Sat, Feb 04, 2017 at 10:56:06AM +0100, Christoph Hellwig wrote:
> On Fri, Feb 03, 2017 at 11:22:43AM -0500, Brian Foster wrote:
> > I didn't catch this until looking through everything after the next
> > patch, but I think there's a problem with the wakeup here as well. If we
> > have a busy extent with XFS_EXTENT_BUSY_SKIP_DISCARD set, we immediately
> > issue a wake from the first xfs_extent_busy_clear() in the cil committed
> > handler, regardless of whether !SKIP_DISCARD extents exist as well
> > under the current gen value. I think that means we'd get a premature
> > wake any time a busy_list has at least one of each type..?
> 
> We'll need to wake as soon as a previously busy extent in the AG
> becomes available, and that includes  XFS_EXTENT_BUSY_SKIP_DISCARD
> ones.  Otherwise e.g. a transaction only containing
> XFS_EXTENT_BUSY_SKIP_DISCARD will never wake at all.
> 

Hmm, Ok. I suppose that isn't a problem so long as we don't ever wait
for a particular extent based on a particular generation number. The
current code is just retrying allocations and whatnot, so is probably
safe.

That said, that limitation should be noted somewhere. Can we add a
comment in xfs_extent_busy_clear() right above the hunk where we do the
SKIP_DISCARD wake? E.g., something that points out the gen number for
any particular extent could be bumped in such a situation.. (or
something along those lines)?

Brian

> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Feb. 7, 2017, 9:43 a.m. UTC | #6
On Mon, Feb 06, 2017 at 11:47:50AM -0500, Brian Foster wrote:
> That said, that limitation should be noted somewhere. Can we add a
> comment in xfs_extent_busy_clear() right above the hunk where we do the
> SKIP_DISCARD wake? E.g., something that points out the gen number for
> any particular extent could be bumped in such a situation.. (or
> something along those lines)?

I could add a comment, but given that the tree tracks busy extents
and is not in any way specific to discard I think it's more confusing
than not having it.
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Brian Foster Feb. 7, 2017, 1:13 p.m. UTC | #7
On Tue, Feb 07, 2017 at 10:43:32AM +0100, Christoph Hellwig wrote:
> On Mon, Feb 06, 2017 at 11:47:50AM -0500, Brian Foster wrote:
> > That said, that limitation should be noted somewhere. Can we add a
> > comment in xfs_extent_busy_clear() right above the hunk where we do the
> > SKIP_DISCARD wake? E.g., something that points out the gen number for
> > any particular extent could be bumped in such a situation.. (or
> > something along those lines)?
> 
> I could add a comment, but given that the tree tracks busy extents
> and is not in any way specific to discard I think it's more confusing
> than not having it.

The tree is generic to busy extents, sure. The behavior of the
generation number definitely changes depending on whether discards are
enabled or not.

If discard is not enabled, the behavior is straightforward in that a
busy_gen returned for a particular extent is triggered when that extent
is made unbusy. A retry call to xfs_extent_busy_[trim|flush]() is never
necessary. If discard is enabled, the pagb_gen returned for a particular
extent is triggered when one of the busy extents in the list is made
unbusy. It might be the extent I passed to xfs_extent_busy_trim(), it
might not. I have to retry the call to know for sure.

I think it's pretty straightforward how easy that may be to misuse in
the future. My concern is that somebody down the road adds code to trim
a particular extent, waits on the pagb_gen, doesn't retry and thus
introduces a very subtle bug into the code.

I can pretty much guarantee after a month or so I'm going to forget all
about this constraint, never mind longer than that. Hence, I'm asking
for a brief comment to clarify how pagb_gen actually works. The more I
think about it, I think above xfs_extent_busy_trim() is the right place
since that is where we introduce/return the gen, we have a comment
update there already, and it apparently already mistakenly refers to
online discard. How about we update it to something like the following?

/*
 * ...
 *
 * Return the current busy generation for the AG if the extent is busy. This
 * value can be used to wait for at least one of the currently busy extents to
 * be cleared. Note that the busy list is not guaranteed to be empty after the
 * gen is woken. The state of a specific extent must always be confirmed with
 * another call to xfs_extent_busy_trim() before it can be used.
 */

Brian

> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Feb. 7, 2017, 3:45 p.m. UTC | #8
On Tue, Feb 07, 2017 at 08:13:54AM -0500, Brian Foster wrote:
> If discard is not enabled, the behavior is straightforward in that a
> busy_gen returned for a particular extent is triggered when that extent
> is made unbusy. A retry call to xfs_extent_busy_[trim|flush]() is never
> necessary. If discard is enabled, the pagb_gen returned for a particular
> extent is triggered when one of the busy extents in the list is made
> unbusy. It might be the extent I passed to xfs_extent_busy_trim(), it
> might not. I have to retry the call to know for sure.

pagb_gen is always incremented when _a_ busy extent is freed.
Without online discard this does indeed only happen once per
log commit / AG.  With discards it could happen twice, but this
difference in semantics shouldn't really matter.

> I think it's pretty straightforward how easy that may be to misuse in
> the future. My concern is that somebody down the road adds code to trim
> a particular extent, waits on the pagb_gen, doesn't retry and thus
> introduces a very subtle bug into the code.
> 
> I can pretty much guarantee after a month or so I'm going to forget all
> about this constraint, never mind longer than that. Hence, I'm asking
> for a brief comment to clarify how pagb_gen actually works. The more I
> think about it, I think above xfs_extent_busy_trim() is the right place
> since that is where we introduce/return the gen, we have a comment
> update there already, and it apparently already mistakenly refers to
> online discard. How about we update it to something like the following?

Sure, I'll add it.
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 9f06a21..fe98fbc 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -221,20 +221,22 @@  xfs_alloc_get_rec(
  * Compute aligned version of the found extent.
  * Takes alignment and min length into account.
  */
-STATIC void
+STATIC bool
 xfs_alloc_compute_aligned(
 	xfs_alloc_arg_t	*args,		/* allocation argument structure */
 	xfs_agblock_t	foundbno,	/* starting block in found extent */
 	xfs_extlen_t	foundlen,	/* length in found extent */
 	xfs_agblock_t	*resbno,	/* result block number */
-	xfs_extlen_t	*reslen)	/* result length */
+	xfs_extlen_t	*reslen,	/* result length */
+	unsigned	*busy_gen)
 {
-	xfs_agblock_t	bno;
-	xfs_extlen_t	len;
+	xfs_agblock_t	bno = foundbno;
+	xfs_extlen_t	len = foundlen;
 	xfs_extlen_t	diff;
+	bool		busy;
 
 	/* Trim busy sections out of found extent */
-	xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
+	busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);
 
 	/*
 	 * If we have a largish extent that happens to start before min_agbno,
@@ -259,6 +261,8 @@  xfs_alloc_compute_aligned(
 		*resbno = bno;
 		*reslen = len;
 	}
+
+	return busy;
 }
 
 /*
@@ -737,10 +741,11 @@  xfs_alloc_ag_vextent_exact(
 	int		error;
 	xfs_agblock_t	fbno;	/* start block of found extent */
 	xfs_extlen_t	flen;	/* length of found extent */
-	xfs_agblock_t	tbno;	/* start block of trimmed extent */
-	xfs_extlen_t	tlen;	/* length of trimmed extent */
-	xfs_agblock_t	tend;	/* end block of trimmed extent */
+	xfs_agblock_t	tbno;	/* start block of busy extent */
+	xfs_extlen_t	tlen;	/* length of busy extent */
+	xfs_agblock_t	tend;	/* end block of busy extent */
 	int		i;	/* success/failure of operation */
+	unsigned	busy_gen;
 
 	ASSERT(args->alignment == 1);
 
@@ -773,7 +778,9 @@  xfs_alloc_ag_vextent_exact(
 	/*
 	 * Check for overlapping busy extents.
 	 */
-	xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen);
+	tbno = fbno;
+	tlen = flen;
+	xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);
 
 	/*
 	 * Give up if the start of the extent is busy, or the freespace isn't
@@ -853,6 +860,7 @@  xfs_alloc_find_best_extent(
 	xfs_agblock_t		sdiff;
 	int			error;
 	int			i;
+	unsigned		busy_gen;
 
 	/* The good extent is perfect, no need to  search. */
 	if (!gdiff)
@@ -866,7 +874,8 @@  xfs_alloc_find_best_extent(
 		if (error)
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
-		xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
+		xfs_alloc_compute_aligned(args, *sbno, *slen,
+				sbnoa, slena, &busy_gen);
 
 		/*
 		 * The good extent is closer than this one.
@@ -955,7 +964,8 @@  xfs_alloc_ag_vextent_near(
 	xfs_extlen_t	ltlena;		/* aligned ... */
 	xfs_agblock_t	ltnew;		/* useful start bno of left side */
 	xfs_extlen_t	rlen;		/* length of returned extent */
-	int		forced = 0;
+	bool		busy;
+	unsigned	busy_gen;
 #ifdef DEBUG
 	/*
 	 * Randomly don't execute the first algorithm.
@@ -982,6 +992,7 @@  xfs_alloc_ag_vextent_near(
 	ltlen = 0;
 	gtlena = 0;
 	ltlena = 0;
+	busy = false;
 
 	/*
 	 * Get a cursor for the by-size btree.
@@ -1064,8 +1075,8 @@  xfs_alloc_ag_vextent_near(
 			if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
-			xfs_alloc_compute_aligned(args, ltbno, ltlen,
-						  &ltbnoa, &ltlena);
+			busy = xfs_alloc_compute_aligned(args, ltbno, ltlen,
+					&ltbnoa, &ltlena, &busy_gen);
 			if (ltlena < args->minlen)
 				continue;
 			if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno)
@@ -1183,8 +1194,8 @@  xfs_alloc_ag_vextent_near(
 			if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
-			xfs_alloc_compute_aligned(args, ltbno, ltlen,
-						  &ltbnoa, &ltlena);
+			busy |= xfs_alloc_compute_aligned(args, ltbno, ltlen,
+					&ltbnoa, &ltlena, &busy_gen);
 			if (ltlena >= args->minlen && ltbnoa >= args->min_agbno)
 				break;
 			if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
@@ -1199,8 +1210,8 @@  xfs_alloc_ag_vextent_near(
 			if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
-			xfs_alloc_compute_aligned(args, gtbno, gtlen,
-						  &gtbnoa, &gtlena);
+			busy |= xfs_alloc_compute_aligned(args, gtbno, gtlen,
+					&gtbnoa, &gtlena, &busy_gen);
 			if (gtlena >= args->minlen && gtbnoa <= args->max_agbno)
 				break;
 			if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
@@ -1261,9 +1272,9 @@  xfs_alloc_ag_vextent_near(
 	if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
 
-		if (!forced++) {
+		if (busy) {
 			trace_xfs_alloc_near_busy(args);
-			xfs_log_force(args->mp, XFS_LOG_SYNC);
+			xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
 			goto restart;
 		}
 		trace_xfs_alloc_size_neither(args);
@@ -1344,7 +1355,8 @@  xfs_alloc_ag_vextent_size(
 	int		i;		/* temp status variable */
 	xfs_agblock_t	rbno;		/* returned block number */
 	xfs_extlen_t	rlen;		/* length of returned extent */
-	int		forced = 0;
+	bool		busy;
+	unsigned	busy_gen;
 
 restart:
 	/*
@@ -1353,6 +1365,7 @@  xfs_alloc_ag_vextent_size(
 	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
 		args->agno, XFS_BTNUM_CNT);
 	bno_cur = NULL;
+	busy = false;
 
 	/*
 	 * Look for an entry >= maxlen+alignment-1 blocks.
@@ -1362,14 +1375,13 @@  xfs_alloc_ag_vextent_size(
 		goto error0;
 
 	/*
-	 * If none or we have busy extents that we cannot allocate from, then
-	 * we have to settle for a smaller extent. In the case that there are
-	 * no large extents, this will return the last entry in the tree unless
-	 * the tree is empty. In the case that there are only busy large
-	 * extents, this will return the largest small extent unless there
+	 * If none then we have to settle for a smaller extent. In the case that
+	 * there are no large extents, this will return the last entry in the
+	 * tree unless the tree is empty. In the case that there are only busy
+	 * large extents, this will return the largest small extent unless there
 	 * are no smaller extents available.
 	 */
-	if (!i || forced > 1) {
+	if (!i) {
 		error = xfs_alloc_ag_vextent_small(args, cnt_cur,
 						   &fbno, &flen, &i);
 		if (error)
@@ -1380,13 +1392,11 @@  xfs_alloc_ag_vextent_size(
 			return 0;
 		}
 		ASSERT(i == 1);
-		xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
+		busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
+				&rlen, &busy_gen);
 	} else {
 		/*
 		 * Search for a non-busy extent that is large enough.
-		 * If we are at low space, don't check, or if we fall of
-		 * the end of the btree, turn off the busy check and
-		 * restart.
 		 */
 		for (;;) {
 			error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
@@ -1394,8 +1404,8 @@  xfs_alloc_ag_vextent_size(
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
 
-			xfs_alloc_compute_aligned(args, fbno, flen,
-						  &rbno, &rlen);
+			busy = xfs_alloc_compute_aligned(args, fbno, flen,
+					&rbno, &rlen, &busy_gen);
 
 			if (rlen >= args->maxlen)
 				break;
@@ -1407,18 +1417,13 @@  xfs_alloc_ag_vextent_size(
 				/*
 				 * Our only valid extents must have been busy.
 				 * Make it unbusy by forcing the log out and
-				 * retrying. If we've been here before, forcing
-				 * the log isn't making the extents available,
-				 * which means they have probably been freed in
-				 * this transaction.  In that case, we have to
-				 * give up on them and we'll attempt a minlen
-				 * allocation the next time around.
+				 * retrying.
 				 */
 				xfs_btree_del_cursor(cnt_cur,
 						     XFS_BTREE_NOERROR);
 				trace_xfs_alloc_size_busy(args);
-				if (!forced++)
-					xfs_log_force(args->mp, XFS_LOG_SYNC);
+				xfs_extent_busy_flush(args->mp,
+							args->pag, busy_gen);
 				goto restart;
 			}
 		}
@@ -1454,8 +1459,8 @@  xfs_alloc_ag_vextent_size(
 			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
 			if (flen < bestrlen)
 				break;
-			xfs_alloc_compute_aligned(args, fbno, flen,
-						  &rbno, &rlen);
+			busy = xfs_alloc_compute_aligned(args, fbno, flen,
+					&rbno, &rlen, &busy_gen);
 			rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
 			XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 ||
 				(rlen <= flen && rbno + rlen <= fbno + flen),
@@ -1484,10 +1489,10 @@  xfs_alloc_ag_vextent_size(
 	 */
 	args->len = rlen;
 	if (rlen < args->minlen) {
-		if (!forced++) {
+		if (busy) {
 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
 			trace_xfs_alloc_size_busy(args);
-			xfs_log_force(args->mp, XFS_LOG_SYNC);
+			xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
 			goto restart;
 		}
 		goto out_nominleft;
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 29c2f99..8251359 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -334,14 +334,18 @@  xfs_extent_busy_reuse(
  * subset of the extent that is not busy.  If *rlen is smaller than
  * args->minlen no suitable extent could be found, and the higher level
  * code needs to force out the log and retry the allocation.
+ *
+ * Returns true if the extent had to be trimmed.  In that case the current
+ * busy generation of the AG is stored in *busy_gen; this value can be used
+ * to wait for the trimmed extent to become fully available if the AG is
+ * running out of space.
  */
-void
+bool
 xfs_extent_busy_trim(
 	struct xfs_alloc_arg	*args,
-	xfs_agblock_t		bno,
-	xfs_extlen_t		len,
-	xfs_agblock_t		*rbno,
-	xfs_extlen_t		*rlen)
+	xfs_agblock_t		*bno,
+	xfs_extlen_t		*len,
+	unsigned		*busy_gen)
 {
 	xfs_agblock_t		fbno;
 	xfs_extlen_t		flen;
@@ -351,8 +355,8 @@  xfs_extent_busy_trim(
 
 	spin_lock(&args->pag->pagb_lock);
 restart:
-	fbno = bno;
-	flen = len;
+	fbno = *bno;
+	flen = *len;
 	rbp = args->pag->pagb_tree.rb_node;
 	while (rbp && flen >= args->minlen) {
 		struct xfs_extent_busy *busyp =
@@ -504,24 +508,29 @@  xfs_extent_busy_trim(
 
 		flen = fend - fbno;
 	}
-	spin_unlock(&args->pag->pagb_lock);
-
-	if (fbno != bno || flen != len) {
-		trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len,
+out:
+	if (fbno != *bno || flen != *len) {
+		trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len,
 					  fbno, flen);
+		*bno = fbno;
+		*len = flen;
+		/*
+		 * Sample the generation while still holding pagb_lock so it
+		 * matches the busy tree state this trim was computed from.
+		 */
+		*busy_gen = args->pag->pagb_gen;
+		spin_unlock(&args->pag->pagb_lock);
+		return true;
 	}
-	*rbno = fbno;
-	*rlen = flen;
-	return;
+	spin_unlock(&args->pag->pagb_lock);
+	return false;
 fail:
 	/*
 	 * Return a zero extent length as failure indications.  All callers
 	 * re-check if the trimmed extent satisfies the minlen requirement.
 	 */
-	spin_unlock(&args->pag->pagb_lock);
-	trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
-	*rbno = fbno;
-	*rlen = 0;
+	flen = 0;
+	goto out;
 }
 
 STATIC void
@@ -540,6 +545,21 @@  xfs_extent_busy_clear_one(
 	kmem_free(busyp);
 }
 
+static void
+xfs_extent_busy_put_pag(
+	struct xfs_perag	*pag,
+	bool			wakeup)	/* advance pagb_gen and wake waiters */
+		__releases(pag->pagb_lock)
+{
+	if (wakeup) {
+		pag->pagb_gen++;	/* bumped before pagb_lock is dropped */
+		wake_up_all(&pag->pagb_wait);
+	}
+	/* Drop pagb_lock and the perag reference taken in xfs_extent_busy_clear. */
+	spin_unlock(&pag->pagb_lock);
+	xfs_perag_put(pag);
+}
+
 /*
  * Remove all extents on the passed in list from the busy extents tree.
  * If do_discard is set skip extents that need to be discarded, and mark
@@ -554,29 +574,64 @@  xfs_extent_busy_clear(
 	struct xfs_extent_busy	*busyp, *n;
 	struct xfs_perag	*pag = NULL;
 	xfs_agnumber_t		agno = NULLAGNUMBER;
+	bool			wakeup = false;
 
 	list_for_each_entry_safe(busyp, n, list, list) {
 		if (busyp->agno != agno) {
-			if (pag) {
-				spin_unlock(&pag->pagb_lock);
-				xfs_perag_put(pag);
-			}
-			pag = xfs_perag_get(mp, busyp->agno);
-			spin_lock(&pag->pagb_lock);
+			if (pag)
+				xfs_extent_busy_put_pag(pag, wakeup);
 			agno = busyp->agno;
+			pag = xfs_perag_get(mp, agno);
+			spin_lock(&pag->pagb_lock);
+			wakeup = false;
 		}
 
 		if (do_discard && busyp->length &&
-		    !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD))
+		    !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) {
 			busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
-		else
+		} else {
 			xfs_extent_busy_clear_one(mp, pag, busyp);
+			wakeup = true;
+		}
 	}
 
-	if (pag) {
-		spin_unlock(&pag->pagb_lock);
-		xfs_perag_put(pag);
+	if (pag)
+		xfs_extent_busy_put_pag(pag, wakeup);
+}
+
+/*
+ * Flush out all busy extents for this AG.
+ *
+ * Force the log synchronously, then sleep until pag->pagb_gen has moved on
+ * from busy_gen, i.e. until xfs_extent_busy_clear() has removed at least one
+ * busy extent from this AG since the caller sampled the generation.  Returns
+ * without waiting if the log force fails.
+ */
+void
+xfs_extent_busy_flush(
+	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
+	unsigned		busy_gen)
+{
+	DEFINE_WAIT		(wait);
+	int			log_flushed = 0, error;
+
+	trace_xfs_log_force(mp, 0, _THIS_IP_);
+	error = _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed);
+	if (error)
+		return;
+
+	for (;;) {
+		/*
+		 * Queue ourselves before testing the generation so that a
+		 * wakeup issued between the test and schedule() is not lost.
+		 */
+		prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
+		if (busy_gen != READ_ONCE(pag->pagb_gen))
+			break;
+		schedule();
 	}
+	finish_wait(&pag->pagb_wait, &wait);
 }
 
 /*
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
index bfff284..bcb99463 100644
--- a/fs/xfs/xfs_extent_busy.h
+++ b/fs/xfs/xfs_extent_busy.h
@@ -58,9 +58,13 @@  void
 xfs_extent_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno,
 	xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata);
 
+bool
+xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t *bno,
+		xfs_extlen_t *len, unsigned *busy_gen);
+
 void
-xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t bno,
-	xfs_extlen_t len, xfs_agblock_t *rbno, xfs_extlen_t *rlen);
+xfs_extent_busy_flush(struct xfs_mount *mp, struct xfs_perag *pag,
+	unsigned busy_gen);
 
 int
 xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 9b9540d..4e9feb1 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -215,6 +215,7 @@  xfs_initialize_perag(
 		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
 		if (xfs_buf_hash_init(pag))
 			goto out_unwind;
+		init_waitqueue_head(&pag->pagb_wait);
 
 		if (radix_tree_preload(GFP_NOFS))
 			goto out_unwind;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7f351f7..7363499 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -384,6 +384,8 @@  typedef struct xfs_perag {
 	xfs_agino_t	pagl_rightrec;
 	spinlock_t	pagb_lock;	/* lock for pagb_tree */
 	struct rb_root	pagb_tree;	/* ordered tree of busy extents */
+	unsigned int	pagb_gen;	/* generation count for pagb_tree */
+	wait_queue_head_t pagb_wait;	/* woken when pagb_gen changes */
 
 	atomic_t        pagf_fstrms;    /* # of filestreams active in this AG */