Message ID | 1485715421-17182-3-git-send-email-hch@lst.de (mailing list archive) |
---|---|
State | Superseded, archived |
Headers | show |
On Sun, Jan 29, 2017 at 07:43:39PM +0100, Christoph Hellwig wrote: > Currently we force the log and simply try again if we hit a busy extent, > but especially with online discard enabled it might take a while after > the log force for the busy extents to disappear, and we might have > already completed our second pass. > > So instead we add a new waitqueue and a generation counter to the pag > structure so that we can do wakeups once we've removed busy extents, > and we replace the single retry with an unconditional one - after > all we hold the AGF buffer lock, so no other allocations or frees > can be racing with us in this AG. > > Signed-off-by: Christoph Hellwig <hch@lst.de> > --- > fs/xfs/libxfs/xfs_alloc.c | 93 +++++++++++++++++++++++--------------------- > fs/xfs/xfs_extent_busy.c | 98 ++++++++++++++++++++++++++++++++++------------- > fs/xfs/xfs_extent_busy.h | 8 +++- > fs/xfs/xfs_mount.c | 1 + > fs/xfs/xfs_mount.h | 2 + > 5 files changed, 129 insertions(+), 73 deletions(-) > > diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c > index 9f06a21..fe98fbc 100644 > --- a/fs/xfs/libxfs/xfs_alloc.c > +++ b/fs/xfs/libxfs/xfs_alloc.c ... > @@ -1183,8 +1194,8 @@ xfs_alloc_ag_vextent_near( > if ((error = xfs_alloc_get_rec(bno_cur_lt, <bno, <len, &i))) > goto error0; > XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); > - xfs_alloc_compute_aligned(args, ltbno, ltlen, > - <bnoa, <lena); > + busy |= xfs_alloc_compute_aligned(args, ltbno, ltlen, > + <bnoa, <lena, &busy_gen); > if (ltlena >= args->minlen && ltbnoa >= args->min_agbno) > break; > if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) > @@ -1199,8 +1210,8 @@ xfs_alloc_ag_vextent_near( > if ((error = xfs_alloc_get_rec(bno_cur_gt, >bno, >len, &i))) > goto error0; > XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); > - xfs_alloc_compute_aligned(args, gtbno, gtlen, > - >bnoa, >lena); > + busy |= xfs_alloc_compute_aligned(args, gtbno, gtlen, > + >bnoa, >lena, &busy_gen); Not a big deal, but perhaps in the above two cases where we're traversing the bnobt, just track the max busy gen and use that being set non-zero to trigger (hopefully) fewer flushes rather than being subject to whatever the last value was? Then we don't have to do the 'busy |= ..' thing either. That doesn't cover the overflow case, but that should be rare and we still have the retry. > if (gtlena >= args->minlen && gtbnoa <= args->max_agbno) > break; > if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) ... > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c > index 29c2f99..8251359 100644 > --- a/fs/xfs/xfs_extent_busy.c > +++ b/fs/xfs/xfs_extent_busy.c > @@ -334,14 +334,18 @@ xfs_extent_busy_reuse( > * subset of the extent that is not busy. If *rlen is smaller than > * args->minlen no suitable extent could be found, and the higher level > * code needs to force out the log and retry the allocation. > + * > + * Return the current discard generation for the AG if the file system > + * has online discard enabled. This value can be used to wait for > + * the trimmed extent to become fully available if the AG is running out > + * of space. > */ > -void > +bool > xfs_extent_busy_trim( > struct xfs_alloc_arg *args, > - xfs_agblock_t bno, > - xfs_extlen_t len, > - xfs_agblock_t *rbno, > - xfs_extlen_t *rlen) > + xfs_agblock_t *bno, > + xfs_extlen_t *len, > + unsigned *busy_gen) > { > xfs_agblock_t fbno; > xfs_extlen_t flen; > @@ -351,8 +355,8 @@ xfs_extent_busy_trim( > > spin_lock(&args->pag->pagb_lock); > restart: > - fbno = bno; > - flen = len; > + fbno = *bno; > + flen = *len; > rbp = args->pag->pagb_tree.rb_node; > while (rbp && flen >= args->minlen) { > struct xfs_extent_busy *busyp = > @@ -504,24 +508,25 @@ xfs_extent_busy_trim( > > flen = fend - fbno; > } > +out: > spin_unlock(&args->pag->pagb_lock); > > - if (fbno != bno || flen != len) { > - trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, > + if (fbno != *bno || flen != *len) { > + trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len, > fbno, flen); > + *bno = fbno; > + *len = flen; > + *busy_gen = args->pag->pagb_gen; > + return true; We've already dropped pagb_lock by the time we grab pagb_gen. What prevents this from racing with a flush and pagb_gen bump and returning a gen value that might not have any associated busy extents? > } > - *rbno = fbno; > - *rlen = flen; > - return; > + return false; > fail: > /* > * Return a zero extent length as failure indications. All callers > * re-check if the trimmed extent satisfies the minlen requirement. > */ > - spin_unlock(&args->pag->pagb_lock); > - trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0); > - *rbno = fbno; > - *rlen = 0; > + flen = 0; > + goto out; > } > > STATIC void ... > @@ -554,29 +574,53 @@ xfs_extent_busy_clear( ... > +/* > + * Flush out all busy extents for this AG. > + */ > +void > +xfs_extent_busy_flush( > + struct xfs_mount *mp, > + struct xfs_perag *pag, > + unsigned busy_gen) > +{ > + DEFINE_WAIT (wait); > + int log_flushed = 0, error; > + > + trace_xfs_log_force(mp, 0, _THIS_IP_); > + error = _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed); > + if (error) > + return; > + > + while (busy_gen == READ_ONCE(pag->pagb_gen)) { > + prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE); > + schedule(); > } > + finish_wait(&pag->pagb_wait, &wait); This seems racy. Shouldn't this do something like: do { prepare_to_wait(); if (busy_gen != pagb_gen) break; schedule(); finish_wait(); } while (1); finish_wait(); ... to make sure we don't lose a wakeup between setting the task state and actually scheduling out? > } > > /* ... > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h > index 7f351f7..7363499 100644 > --- a/fs/xfs/xfs_mount.h > +++ b/fs/xfs/xfs_mount.h > @@ -384,6 +384,8 @@ typedef struct xfs_perag { > xfs_agino_t pagl_rightrec; > spinlock_t pagb_lock; /* lock for pagb_tree */ > struct rb_root pagb_tree; /* ordered tree of busy extents */ > + unsigned int pagb_gen; > + wait_queue_head_t pagb_wait; Can we add some comments here similar to the other fields? Also, how about slightly more informative names... pagb_discard_[gen|wait], or pagb_busy_*? Brian > > atomic_t pagf_fstrms; /* # of filestreams active in this AG */ > > -- > 2.1.4 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, Jan 29, 2017 at 07:43:39PM +0100, Christoph Hellwig wrote: > Currently we force the log and simply try again if we hit a busy extent, > but especially with online discard enabled it might take a while after > the log force for the busy extents to disappear, and we might have > already completed our second pass. > > So instead we add a new waitqueue and a generation counter to the pag > structure so that we can do wakeups once we've removed busy extents, > and we replace the single retry with an unconditional one - after > all we hold the AGF buffer lock, so no other allocations or frees > can be racing with us in this AG. > > Signed-off-by: Christoph Hellwig <hch@lst.de> > --- > fs/xfs/libxfs/xfs_alloc.c | 93 +++++++++++++++++++++++--------------------- > fs/xfs/xfs_extent_busy.c | 98 ++++++++++++++++++++++++++++++++++------------- > fs/xfs/xfs_extent_busy.h | 8 +++- > fs/xfs/xfs_mount.c | 1 + > fs/xfs/xfs_mount.h | 2 + > 5 files changed, 129 insertions(+), 73 deletions(-) > ... > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c > index 29c2f99..8251359 100644 > --- a/fs/xfs/xfs_extent_busy.c > +++ b/fs/xfs/xfs_extent_busy.c ... > @@ -554,29 +574,53 @@ xfs_extent_busy_clear( > struct xfs_extent_busy *busyp, *n; > struct xfs_perag *pag = NULL; > xfs_agnumber_t agno = NULLAGNUMBER; > + bool wakeup = false; > > list_for_each_entry_safe(busyp, n, list, list) { > if (busyp->agno != agno) { > - if (pag) { > - spin_unlock(&pag->pagb_lock); > - xfs_perag_put(pag); > - } > - pag = xfs_perag_get(mp, busyp->agno); > - spin_lock(&pag->pagb_lock); > + if (pag) > + xfs_extent_busy_put_pag(pag, wakeup); > agno = busyp->agno; > + pag = xfs_perag_get(mp, agno); > + spin_lock(&pag->pagb_lock); > + wakeup = false; > } > > if (do_discard && busyp->length && > - !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) > + !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) { > busyp->flags = XFS_EXTENT_BUSY_DISCARDED; > - else > + } else { > xfs_extent_busy_clear_one(mp, pag, busyp); > + wakeup = true; > + } I didn't catch this until looking through everything after the next patch, but I think there's a problem with the wakeup here as well. If we have a busy extent with XFS_EXTENT_BUSY_SKIP_DISCARD set, we immediately issue a wake from the first xfs_extent_busy_clear() in the cil committed handler, regardless of whether !SKIP_DISCARD extents exist as well under the current gen value. I think that means we'd get a premature wake any time a busy_list has at least one of each type..? Brian > } > > - if (pag) { > - spin_unlock(&pag->pagb_lock); > - xfs_perag_put(pag); > + if (pag) > + xfs_extent_busy_put_pag(pag, wakeup); > +} > + > +/* > + * Flush out all busy extents for this AG. > + */ > +void > +xfs_extent_busy_flush( > + struct xfs_mount *mp, > + struct xfs_perag *pag, > + unsigned busy_gen) > +{ > + DEFINE_WAIT (wait); > + int log_flushed = 0, error; > + > + trace_xfs_log_force(mp, 0, _THIS_IP_); > + error = _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed); > + if (error) > + return; > + > + while (busy_gen == READ_ONCE(pag->pagb_gen)) { > + prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE); > + schedule(); > } > + finish_wait(&pag->pagb_wait, &wait); > } > > /* > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h > index bfff284..bcb99463 100644 > --- a/fs/xfs/xfs_extent_busy.h > +++ b/fs/xfs/xfs_extent_busy.h > @@ -58,9 +58,13 @@ void > xfs_extent_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno, > xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata); > > +bool > +xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t *bno, > + xfs_extlen_t *len, unsigned *busy_gen); > + > void > -xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t bno, > - xfs_extlen_t len, xfs_agblock_t *rbno, xfs_extlen_t *rlen); > +xfs_extent_busy_flush(struct xfs_mount *mp, struct xfs_perag *pag, > + unsigned discards); > > int > xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b); > diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c > index 9b9540d..4e9feb1 100644 > --- a/fs/xfs/xfs_mount.c > +++ b/fs/xfs/xfs_mount.c > @@ -215,6 +215,7 @@ xfs_initialize_perag( > INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); > if (xfs_buf_hash_init(pag)) > goto out_unwind; > + init_waitqueue_head(&pag->pagb_wait); > > if (radix_tree_preload(GFP_NOFS)) > goto out_unwind; > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h > index 7f351f7..7363499 100644 > --- a/fs/xfs/xfs_mount.h > +++ b/fs/xfs/xfs_mount.h > @@ -384,6 +384,8 @@ typedef struct xfs_perag { > xfs_agino_t pagl_rightrec; > spinlock_t pagb_lock; /* lock for pagb_tree */ > struct rb_root pagb_tree; /* ordered tree of busy extents */ > + unsigned int pagb_gen; > + wait_queue_head_t pagb_wait; > > atomic_t pagf_fstrms; /* # of filestreams active in this AG */ > > -- > 2.1.4 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Feb 03, 2017 at 10:22:33AM -0500, Brian Foster wrote: > Not a big deal, but perhaps in the above two cases where we're > traversing the bnobt, just track the max busy gen and use that being set > non-zero to trigger (hopefully) fewer flushes rather than being subject > to whatever the last value was? Then we don't have to do the 'busy |= > ..' thing either. That doesn't cover the overflow case, but that should > be rare and we still have the retry. It would hang for the overflow case, been there done that. Note that we only rety if we failed the allocation anyway, so it won't actually trigger any less flushes either. > > +out: > > spin_unlock(&args->pag->pagb_lock); > > > > - if (fbno != bno || flen != len) { > > - trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, > > + if (fbno != *bno || flen != *len) { > > + trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len, > > fbno, flen); > > + *bno = fbno; > > + *len = flen; > > + *busy_gen = args->pag->pagb_gen; > > + return true; > > We've already dropped pagb_lock by the time we grab pagb_gen. What > prevents this from racing with a flush and pagb_gen bump and returning a > gen value that might not have any associated busy extents? Good point. I though I had moved the lock around but obviously didn't. I'll fix it up for the next version. > > + while (busy_gen == READ_ONCE(pag->pagb_gen)) { > > + prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE); > > + schedule(); > > } > > + finish_wait(&pag->pagb_wait, &wait); > > This seems racy. Shouldn't this do something like: > > do { > prepare_to_wait(); > if (busy_gen != pagb_gen) > break; > schedule(); > finish_wait(); > } while (1); > finish_wait(); > > ... to make sure we don't lose a wakeup between setting the task state > and actually scheduling out? Yes, will fix. > > +++ b/fs/xfs/xfs_mount.h > > @@ -384,6 +384,8 @@ typedef struct xfs_perag { > > xfs_agino_t pagl_rightrec; > > spinlock_t pagb_lock; /* lock for pagb_tree */ > > struct rb_root pagb_tree; /* ordered tree of busy extents */ > > + unsigned int pagb_gen; > > + wait_queue_head_t pagb_wait; > > Can we add some comments here similar to the other fields? Sure. > Also, how > about slightly more informative names... pagb_discard_[gen|wait], or > pagb_busy_*? That's what I had first - but: - pagb is the short name for the pag busy tree and I wanted to follow that convention. And with the current series we also use the wakeup code for normal busy extents, even without discards. -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Feb 03, 2017 at 11:22:43AM -0500, Brian Foster wrote: > I didn't catch this until looking through everything after the next > patch, but I think there's a problem with the wakeup here as well. If we > have a busy extent with XFS_EXTENT_BUSY_SKIP_DISCARD set, we immediately > issue a wake from the first xfs_extent_busy_clear() in the cil committed > handler, regardless of whether !SKIP_DISCARD extents exist as well > under the current gen value. I think that means we'd get a premature > wake any time a busy_list has at least one of each type..? We'll need to wake as soon as a previously busy extent in the AG becomes available, and that includes XFS_EXTENT_BUSY_SKIP_DISCARD ones. Otherwise e.g. a transaction only containing XFS_EXTENT_BUSY_SKIP_DISCARD will never wake at all. -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sat, Feb 04, 2017 at 10:56:06AM +0100, Christoph Hellwig wrote: > On Fri, Feb 03, 2017 at 11:22:43AM -0500, Brian Foster wrote: > > I didn't catch this until looking through everything after the next > > patch, but I think there's a problem with the wakeup here as well. If we > > have a busy extent with XFS_EXTENT_BUSY_SKIP_DISCARD set, we immediately > > issue a wake from the first xfs_extent_busy_clear() in the cil committed > > handler, regardless of whether !SKIP_DISCARD extents exist as well > > under the current gen value. I think that means we'd get a premature > > wake any time a busy_list has at least one of each type..? > > We'll need to wake as soon as a previously busy extent in the AG > becomes available, and that includes XFS_EXTENT_BUSY_SKIP_DISCARD > ones. Otherwise e.g. a transaction only containing > XFS_EXTENT_BUSY_SKIP_DISCARD will never wake at all. > Hmm, Ok. I suppose that isn't a problem so long as we don't ever wait for a particular extent based on a particular generation number. The current code is just retrying allocations and whatnot, so is probably safe. That said, that limitation should be noted somewhere. Can we add a comment in xfs_extent_busy_clear() right above the hunk where we do the SKIP_DISCARD wake? E.g., something that points out the gen number for any particular extent could be bumped in such a situation.. (or something along those lines)? Brian > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Feb 06, 2017 at 11:47:50AM -0500, Brian Foster wrote: > That said, that limitation should be noted somewhere. Can we add a > comment in xfs_extent_busy_clear() right above the hunk where we do the > SKIP_DISCARD wake? E.g., something that points out the gen number for > any particular extent could be bumped in such a situation.. (or > something along those lines)? I could add a comment, but given that the tree tracks busy extents and is not in any way specific to discard I think it's more confusing than not having it. -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Feb 07, 2017 at 10:43:32AM +0100, Christoph Hellwig wrote: > On Mon, Feb 06, 2017 at 11:47:50AM -0500, Brian Foster wrote: > > That said, that limitation should be noted somewhere. Can we add a > > comment in xfs_extent_busy_clear() right above the hunk where we do the > > SKIP_DISCARD wake? E.g., something that points out the gen number for > > any particular extent could be bumped in such a situation.. (or > > something along those lines)? > > I could add a comment, but given that the tree tracks busy extents > and is not in any way specific to discard I think it's more confusing > than not having it. The tree is generic to busy extents, sure. The behavior of the generation number definitely changes depending on whether discards are enabled or not. If discard is not enabled, the behavior is straightforward in that a busy_gen returned for a particular extent is triggered when that extent is made unbusy. A retry call to xfs_extent_busy_[trim|flush]() is never necessary. If discard is enabled, the pagb_gen returned for a particular extent is triggered when one of the busy extents in the list is made unbusy. It might be the extent I passed to xfs_extent_busy_trim(), it might not. I have to retry the call to know for sure. I think it's pretty straightforward how easy that may be to misuse in the future. My concern is that somebody down the road adds code to trim a particular extent, waits on the pagb_gen, doesn't retry and thus introduces a very subtle bug into the code. I can pretty much guarantee after a month or so I'm going to forget all about this constraint, never mind longer than that. Hence, I'm asking for a brief comment to clarify how pagb_gen actually works. The more I think about it, I think above xfs_busy_extent_trim() is the right place since that is where we introduce/return the gen, we have a comment update there already, and it apparently already mistakenly refers to online discard. How about we update it to something like the following? /* * ... * * Return the current busy generation for the AG if the extent is busy. This * value can be used to wait for at least one of the currently busy extents to * be cleared. Note that the busy list is not guaranteed to be empty after the * gen is woken. The state of a specific extent must always be confirmed with * another call to xfs_extent_busy_trim() before it can be used. */ Brian > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Feb 07, 2017 at 08:13:54AM -0500, Brian Foster wrote: > If discard is not enabled, the behavior is straightforward in that a > busy_gen returned for a particular extent is triggered when that extent > is made unbusy. A retry call to xfs_extent_busy_[trim|flush]() is never > necessary. If discard is enabled, the pagb_gen returned for a particular > extent is triggered when one of the busy extents in the list is made > unbusy. It might be the extent I passed to xfs_extent_busy_trim(), it > might not. I have to retry the call to know for sure. pagb_gen is always incremented when _a_ busy extent is freed. Without online discard this does indeed only happen once per log commit / AG. With discards it could happen twice, but this difference in semantics shouldn't really matter. > I think it's pretty straightforward how easy that may be to misuse in > the future. My concern is that somebody down the road adds code to trim > a particular extent, waits on the pagb_gen, doesn't retry and thus > introduces a very subtle bug into the code. > > I can pretty much guarantee after a month or so I'm going to forget all > about this constraint, never mind longer than that. Hence, I'm asking > for a brief comment to clarify how pagb_gen actually works. The more I > think about it, I think above xfs_busy_extent_trim() is the right place > > since that is where we introduce/return the gen, we have a comment > update there already, and it apparently already mistakenly refers to > online discard. How about we update it to something like the following? Sure, I'll add it. -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 9f06a21..fe98fbc 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -221,20 +221,22 @@ xfs_alloc_get_rec( * Compute aligned version of the found extent. * Takes alignment and min length into account. */ -STATIC void +STATIC bool xfs_alloc_compute_aligned( xfs_alloc_arg_t *args, /* allocation argument structure */ xfs_agblock_t foundbno, /* starting block in found extent */ xfs_extlen_t foundlen, /* length in found extent */ xfs_agblock_t *resbno, /* result block number */ - xfs_extlen_t *reslen) /* result length */ + xfs_extlen_t *reslen, /* result length */ + unsigned *busy_gen) { - xfs_agblock_t bno; - xfs_extlen_t len; + xfs_agblock_t bno = foundbno; + xfs_extlen_t len = foundlen; xfs_extlen_t diff; + bool busy; /* Trim busy sections out of found extent */ - xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len); + busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen); /* * If we have a largish extent that happens to start before min_agbno, @@ -259,6 +261,8 @@ xfs_alloc_compute_aligned( *resbno = bno; *reslen = len; } + + return busy; } /* @@ -737,10 +741,11 @@ xfs_alloc_ag_vextent_exact( int error; xfs_agblock_t fbno; /* start block of found extent */ xfs_extlen_t flen; /* length of found extent */ - xfs_agblock_t tbno; /* start block of trimmed extent */ - xfs_extlen_t tlen; /* length of trimmed extent */ - xfs_agblock_t tend; /* end block of trimmed extent */ + xfs_agblock_t tbno; /* start block of busy extent */ + xfs_extlen_t tlen; /* length of busy extent */ + xfs_agblock_t tend; /* end block of busy extent */ int i; /* success/failure of operation */ + unsigned busy_gen; ASSERT(args->alignment == 1); @@ -773,7 +778,9 @@ xfs_alloc_ag_vextent_exact( /* * Check for overlapping busy extents. */ - xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen); + tbno = fbno; + tlen = flen; + xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen); /* * Give up if the start of the extent is busy, or the freespace isn't @@ -853,6 +860,7 @@ xfs_alloc_find_best_extent( xfs_agblock_t sdiff; int error; int i; + unsigned busy_gen; /* The good extent is perfect, no need to search. */ if (!gdiff) @@ -866,7 +874,8 @@ xfs_alloc_find_best_extent( if (error) goto error0; XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena); + xfs_alloc_compute_aligned(args, *sbno, *slen, + sbnoa, slena, &busy_gen); /* * The good extent is closer than this one. @@ -955,7 +964,8 @@ xfs_alloc_ag_vextent_near( xfs_extlen_t ltlena; /* aligned ... */ xfs_agblock_t ltnew; /* useful start bno of left side */ xfs_extlen_t rlen; /* length of returned extent */ - int forced = 0; + bool busy; + unsigned busy_gen; #ifdef DEBUG /* * Randomly don't execute the first algorithm. @@ -982,6 +992,7 @@ xfs_alloc_ag_vextent_near( ltlen = 0; gtlena = 0; ltlena = 0; + busy = false; /* * Get a cursor for the by-size btree. @@ -1064,8 +1075,8 @@ xfs_alloc_ag_vextent_near( if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - xfs_alloc_compute_aligned(args, ltbno, ltlen, - <bnoa, <lena); + busy = xfs_alloc_compute_aligned(args, ltbno, ltlen, + <bnoa, <lena, &busy_gen); if (ltlena < args->minlen) continue; if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno) @@ -1183,8 +1194,8 @@ xfs_alloc_ag_vextent_near( if ((error = xfs_alloc_get_rec(bno_cur_lt, <bno, <len, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - xfs_alloc_compute_aligned(args, ltbno, ltlen, - <bnoa, <lena); + busy |= xfs_alloc_compute_aligned(args, ltbno, ltlen, + <bnoa, <lena, &busy_gen); if (ltlena >= args->minlen && ltbnoa >= args->min_agbno) break; if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) @@ -1199,8 +1210,8 @@ xfs_alloc_ag_vextent_near( if ((error = xfs_alloc_get_rec(bno_cur_gt, >bno, >len, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - xfs_alloc_compute_aligned(args, gtbno, gtlen, - >bnoa, >lena); + busy |= xfs_alloc_compute_aligned(args, gtbno, gtlen, + >bnoa, >lena, &busy_gen); if (gtlena >= args->minlen && gtbnoa <= args->max_agbno) break; if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) @@ -1261,9 +1272,9 @@ xfs_alloc_ag_vextent_near( if (bno_cur_lt == NULL && bno_cur_gt == NULL) { xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - if (!forced++) { + if (busy) { trace_xfs_alloc_near_busy(args); - xfs_log_force(args->mp, XFS_LOG_SYNC); + xfs_extent_busy_flush(args->mp, args->pag, busy_gen); goto restart; } trace_xfs_alloc_size_neither(args); @@ -1344,7 +1355,8 @@ xfs_alloc_ag_vextent_size( int i; /* temp status variable */ xfs_agblock_t rbno; /* returned block number */ xfs_extlen_t rlen; /* length of returned extent */ - int forced = 0; + bool busy; + unsigned busy_gen; restart: /* @@ -1353,6 +1365,7 @@ xfs_alloc_ag_vextent_size( cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, args->agno, XFS_BTNUM_CNT); bno_cur = NULL; + busy = false; /* * Look for an entry >= maxlen+alignment-1 blocks. @@ -1362,14 +1375,13 @@ xfs_alloc_ag_vextent_size( goto error0; /* - * If none or we have busy extents that we cannot allocate from, then - * we have to settle for a smaller extent. In the case that there are - * no large extents, this will return the last entry in the tree unless - * the tree is empty. In the case that there are only busy large - * extents, this will return the largest small extent unless there + * If none then we have to settle for a smaller extent. In the case that + * there are no large extents, this will return the last entry in the + * tree unless the tree is empty. In the case that there are only busy + * large extents, this will return the largest small extent unless there * are no smaller extents available. */ - if (!i || forced > 1) { + if (!i) { error = xfs_alloc_ag_vextent_small(args, cnt_cur, &fbno, &flen, &i); if (error) @@ -1380,13 +1392,11 @@ xfs_alloc_ag_vextent_size( return 0; } ASSERT(i == 1); - xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen); + busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno, + &rlen, &busy_gen); } else { /* * Search for a non-busy extent that is large enough. - * If we are at low space, don't check, or if we fall of - * the end of the btree, turn off the busy check and - * restart. */ for (;;) { error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i); @@ -1394,8 +1404,8 @@ xfs_alloc_ag_vextent_size( goto error0; XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - xfs_alloc_compute_aligned(args, fbno, flen, - &rbno, &rlen); + busy = xfs_alloc_compute_aligned(args, fbno, flen, + &rbno, &rlen, &busy_gen); if (rlen >= args->maxlen) break; @@ -1407,18 +1417,13 @@ xfs_alloc_ag_vextent_size( /* * Our only valid extents must have been busy. * Make it unbusy by forcing the log out and - * retrying. If we've been here before, forcing - * the log isn't making the extents available, - * which means they have probably been freed in - * this transaction. In that case, we have to - * give up on them and we'll attempt a minlen - * allocation the next time around. + * retrying. */ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); trace_xfs_alloc_size_busy(args); - if (!forced++) - xfs_log_force(args->mp, XFS_LOG_SYNC); + xfs_extent_busy_flush(args->mp, + args->pag, busy_gen); goto restart; } } @@ -1454,8 +1459,8 @@ xfs_alloc_ag_vextent_size( XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); if (flen < bestrlen) break; - xfs_alloc_compute_aligned(args, fbno, flen, - &rbno, &rlen); + busy = xfs_alloc_compute_aligned(args, fbno, flen, + &rbno, &rlen, &busy_gen); rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 || (rlen <= flen && rbno + rlen <= fbno + flen), @@ -1484,10 +1489,10 @@ xfs_alloc_ag_vextent_size( */ args->len = rlen; if (rlen < args->minlen) { - if (!forced++) { + if (busy) { xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); trace_xfs_alloc_size_busy(args); - xfs_log_force(args->mp, XFS_LOG_SYNC); + xfs_extent_busy_flush(args->mp, args->pag, busy_gen); goto restart; } goto out_nominleft; diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 29c2f99..8251359 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -334,14 +334,18 @@ xfs_extent_busy_reuse( * subset of the extent that is not busy. If *rlen is smaller than * args->minlen no suitable extent could be found, and the higher level * code needs to force out the log and retry the allocation. + * + * Return the current discard generation for the AG if the file system + * has online discard enabled. This value can be used to wait for + * the trimmed extent to become fully available if the AG is running out + * of space. */ -void +bool xfs_extent_busy_trim( struct xfs_alloc_arg *args, - xfs_agblock_t bno, - xfs_extlen_t len, - xfs_agblock_t *rbno, - xfs_extlen_t *rlen) + xfs_agblock_t *bno, + xfs_extlen_t *len, + unsigned *busy_gen) { xfs_agblock_t fbno; xfs_extlen_t flen; @@ -351,8 +355,8 @@ xfs_extent_busy_trim( spin_lock(&args->pag->pagb_lock); restart: - fbno = bno; - flen = len; + fbno = *bno; + flen = *len; rbp = args->pag->pagb_tree.rb_node; while (rbp && flen >= args->minlen) { struct xfs_extent_busy *busyp = @@ -504,24 +508,25 @@ xfs_extent_busy_trim( flen = fend - fbno; } +out: spin_unlock(&args->pag->pagb_lock); - if (fbno != bno || flen != len) { - trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, + if (fbno != *bno || flen != *len) { + trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len, fbno, flen); + *bno = fbno; + *len = flen; + *busy_gen = args->pag->pagb_gen; + return true; } - *rbno = fbno; - *rlen = flen; - return; + return false; fail: /* * Return a zero extent length as failure indications. All callers * re-check if the trimmed extent satisfies the minlen requirement. */ - spin_unlock(&args->pag->pagb_lock); - trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0); - *rbno = fbno; - *rlen = 0; + flen = 0; + goto out; } STATIC void @@ -540,6 +545,21 @@ xfs_extent_busy_clear_one( kmem_free(busyp); } +static void +xfs_extent_busy_put_pag( + struct xfs_perag *pag, + bool wakeup) + __releases(pag->pagb_lock) +{ + if (wakeup) { + pag->pagb_gen++; + wake_up_all(&pag->pagb_wait); + } + + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); +} + /* * Remove all extents on the passed in list from the busy extents tree. * If do_discard is set skip extents that need to be discarded, and mark @@ -554,29 +574,53 @@ xfs_extent_busy_clear( struct xfs_extent_busy *busyp, *n; struct xfs_perag *pag = NULL; xfs_agnumber_t agno = NULLAGNUMBER; + bool wakeup = false; list_for_each_entry_safe(busyp, n, list, list) { if (busyp->agno != agno) { - if (pag) { - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); - } - pag = xfs_perag_get(mp, busyp->agno); - spin_lock(&pag->pagb_lock); + if (pag) + xfs_extent_busy_put_pag(pag, wakeup); agno = busyp->agno; + pag = xfs_perag_get(mp, agno); + spin_lock(&pag->pagb_lock); + wakeup = false; } if (do_discard && busyp->length && - !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) + !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) { busyp->flags = XFS_EXTENT_BUSY_DISCARDED; - else + } else { xfs_extent_busy_clear_one(mp, pag, busyp); + wakeup = true; + } } - if (pag) { - spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); + if (pag) + xfs_extent_busy_put_pag(pag, wakeup); +} + +/* + * Flush out all busy extents for this AG. + */ +void +xfs_extent_busy_flush( + struct xfs_mount *mp, + struct xfs_perag *pag, + unsigned busy_gen) +{ + DEFINE_WAIT (wait); + int log_flushed = 0, error; + + trace_xfs_log_force(mp, 0, _THIS_IP_); + error = _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed); + if (error) + return; + + while (busy_gen == READ_ONCE(pag->pagb_gen)) { + prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE); + schedule(); } + finish_wait(&pag->pagb_wait, &wait); } /* diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index bfff284..bcb99463 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -58,9 +58,13 @@ void xfs_extent_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata); +bool +xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t *bno, + xfs_extlen_t *len, unsigned *busy_gen); + void -xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t bno, - xfs_extlen_t len, xfs_agblock_t *rbno, xfs_extlen_t *rlen); +xfs_extent_busy_flush(struct xfs_mount *mp, struct xfs_perag *pag, + unsigned discards); int xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 9b9540d..4e9feb1 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -215,6 +215,7 @@ xfs_initialize_perag( INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); if (xfs_buf_hash_init(pag)) goto out_unwind; + init_waitqueue_head(&pag->pagb_wait); if (radix_tree_preload(GFP_NOFS)) goto out_unwind; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 7f351f7..7363499 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -384,6 +384,8 @@ typedef struct xfs_perag { xfs_agino_t pagl_rightrec; spinlock_t pagb_lock; /* lock for pagb_tree */ struct rb_root pagb_tree; /* ordered tree of busy extents */ + unsigned int pagb_gen; + wait_queue_head_t pagb_wait; atomic_t pagf_fstrms; /* # of filestreams active in this AG */
Currently we force the log and simply try again if we hit a busy extent, but especially with online discard enabled it might take a while after the log force for the busy extents to disappear, and we might have already completed our second pass. So instead we add a new waitqueue and a generation counter to the pag structure so that we can do wakeups once we've removed busy extents, and we replace the single retry with an unconditional one - after all we hold the AGF buffer lock, so no other allocations or frees can be racing with us in this AG. Signed-off-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/libxfs/xfs_alloc.c | 93 +++++++++++++++++++++++--------------------- fs/xfs/xfs_extent_busy.c | 98 ++++++++++++++++++++++++++++++++++------------- fs/xfs/xfs_extent_busy.h | 8 +++- fs/xfs/xfs_mount.c | 1 + fs/xfs/xfs_mount.h | 2 + 5 files changed, 129 insertions(+), 73 deletions(-)