Message ID | 1525102372-8430-3-git-send-email-axboe@kernel.dk (mailing list archive) |
---|---|
State | Superseded, archived |
On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote: > XFS recently added support for async discards. While this can be > a win for some workloads and devices, there are also cases where > async bursty discard will severly harm the latencies of reads > and writes. > > Add a 'discard_sync' mount flag to revert to using sync discard, > issuing them one at the time and waiting for each one. This fixes > a big performance regression we had moving to kernels that include > the XFS async discard support. > > Signed-off-by: Jens Axboe <axboe@kernel.dk> > --- Hm, I figured the async discard stuff would have been a pretty clear win all around, but then again I'm not terribly familiar with what happens with discards beneath the fs. I do know that the previous behavior would cause fs level latencies due to holding up log I/O completion while discards completed one at a time. My understanding is that this lead to online discard being pretty much universally "not recommended" in favor of fstrim. Do you have any more data around the workload where the old sync discard behavior actually provides an overall win over the newer behavior? Is it purely a matter of workload, or is it a workload+device thing with how discards may be handled by certain devices? I'm ultimately not against doing this if it's useful for somebody and is otherwise buried under a mount option, but it would be nice to see if there's opportunity to improve the async mechanism before resorting to that. Is the issue here too large bio chains, too many chains at once, or just simply too many discards (regardless of chaining) at the same time? I'm wondering if xfs could separate discard submission from log I/O completion entirely and then perhaps limit/throttle submission somehow or another (Christoph, thoughts?) via a background task. Perhaps doing something like that may even eliminate the need for some discards on busy filesystems with a lot of block free -> reallocation activity, but I'm just handwaving atm. 
Brian > fs/xfs/xfs_log_cil.c | 18 ++++++++++++++---- > fs/xfs/xfs_mount.h | 1 + > fs/xfs/xfs_super.c | 7 ++++++- > 3 files changed, 21 insertions(+), 5 deletions(-) > > diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c > index 4668403b1741..4eced3eceea5 100644 > --- a/fs/xfs/xfs_log_cil.c > +++ b/fs/xfs/xfs_log_cil.c > @@ -552,10 +552,16 @@ xlog_discard_busy_extents( > struct bio *bio = NULL; > struct blk_plug plug; > int error = 0; > + int discard_flags = 0; > + bool sync_discard = mp->m_flags & XFS_MOUNT_DISCARD_SYNC; > > ASSERT(mp->m_flags & XFS_MOUNT_DISCARD); > > - blk_start_plug(&plug); > + if (!sync_discard) > + blk_start_plug(&plug); > + else > + discard_flags = BLKDEV_DISCARD_SYNC; > + > list_for_each_entry(busyp, list, list) { > trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, > busyp->length); > @@ -563,7 +569,7 @@ xlog_discard_busy_extents( > error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, > XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), > XFS_FSB_TO_BB(mp, busyp->length), > - GFP_NOFS, 0, &bio); > + GFP_NOFS, discard_flags, &bio); > if (error && error != -EOPNOTSUPP) { > xfs_info(mp, > "discard failed for extent [0x%llx,%u], error %d", > @@ -574,14 +580,18 @@ xlog_discard_busy_extents( > } > } > > - if (bio) { > + if (sync_discard) { > + xfs_extent_busy_clear(mp, &ctx->busy_extents, false); > + kmem_free(ctx); > + } else if (bio) { > bio->bi_private = ctx; > bio->bi_end_io = xlog_discard_endio; > submit_bio(bio); > + blk_finish_plug(&plug); > } else { > xlog_discard_endio_work(&ctx->discard_endio_work); > + blk_finish_plug(&plug); > } > - blk_finish_plug(&plug); > } > > /* > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h > index 10b90bbc5162..3f0b7b9106c7 100644 > --- a/fs/xfs/xfs_mount.h > +++ b/fs/xfs/xfs_mount.h > @@ -219,6 +219,7 @@ typedef struct xfs_mount { > operations, typically for > disk errors in metadata */ > #define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */ > +#define XFS_MOUNT_DISCARD_SYNC (1ULL << 6) /* discard unused blocks sync */ > #define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment > allocations */ > #define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */ > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c > index d71424052917..6d960bb4725f 100644 > --- a/fs/xfs/xfs_super.c > +++ b/fs/xfs/xfs_super.c > @@ -83,7 +83,7 @@ enum { > Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, > Opt_uquota, Opt_gquota, Opt_pquota, > Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, > - Opt_discard, Opt_nodiscard, Opt_dax, Opt_err, > + Opt_discard, Opt_nodiscard, Opt_dax, Opt_err, Opt_discard_sync, > }; > > static const match_table_t tokens = { > @@ -129,6 +129,7 @@ static const match_table_t tokens = { > {Opt_pqnoenforce,"pqnoenforce"},/* project quota limit enforcement */ > {Opt_qnoenforce, "qnoenforce"}, /* same as uqnoenforce */ > {Opt_discard, "discard"}, /* Discard unused blocks */ > + {Opt_discard_sync,"discard_sync"},/* Discard unused blocks sync */ > {Opt_nodiscard, "nodiscard"}, /* Do not discard unused blocks */ > > {Opt_dax, "dax"}, /* Enable direct access to bdev pages */ > @@ -363,11 +364,15 @@ xfs_parseargs( > mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); > mp->m_qflags &= ~XFS_GQUOTA_ENFD; > break; > + case Opt_discard_sync: > + mp->m_flags |= XFS_MOUNT_DISCARD_SYNC; > + /* fall through and set XFS_MOUNT_DISCARD too */ > case Opt_discard: > mp->m_flags |= XFS_MOUNT_DISCARD; > break; > case Opt_nodiscard: > mp->m_flags &= ~XFS_MOUNT_DISCARD; > + 
mp->m_flags &= ~XFS_MOUNT_DISCARD_SYNC; > break; > #ifdef CONFIG_FS_DAX > case Opt_dax: > -- > 2.7.4 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 4/30/18 11:19 AM, Brian Foster wrote: > On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote: >> XFS recently added support for async discards. While this can be >> a win for some workloads and devices, there are also cases where >> async bursty discard will severly harm the latencies of reads >> and writes. >> >> Add a 'discard_sync' mount flag to revert to using sync discard, >> issuing them one at the time and waiting for each one. This fixes >> a big performance regression we had moving to kernels that include >> the XFS async discard support. >> >> Signed-off-by: Jens Axboe <axboe@kernel.dk> >> --- > > Hm, I figured the async discard stuff would have been a pretty clear win > all around, but then again I'm not terribly familiar with what happens > with discards beneath the fs. I do know that the previous behavior would > cause fs level latencies due to holding up log I/O completion while > discards completed one at a time. My understanding is that this lead to > online discard being pretty much universally "not recommended" in favor > of fstrim. It's not a secret that most devices suck at discard. While the async discard is nifty and I bet works well for some cases, it can also cause a flood of discards on the device side which does not work well for other cases. > Do you have any more data around the workload where the old sync discard > behavior actually provides an overall win over the newer behavior? Is it > purely a matter of workload, or is it a workload+device thing with how > discards may be handled by certain devices? The worse read latencies were observed on more than one device type, making it sync again was universally a win. We've had many issues with discard, one trick that is often used is to chop up file deletion into smaller chunks. Say you want to kill 10GB of data, you do it incrementally, since 10G of discard usually doesn't behave very nicely. If you make that async, then you're back to square one. > I'm ultimately not against doing this if it's useful for somebody and is > otherwise buried under a mount option, but it would be nice to see if > there's opportunity to improve the async mechanism before resorting to > that. Is the issue here too large bio chains, too many chains at once, > or just simply too many discards (regardless of chaining) at the same > time? Well, ultimately you'd need better scheduling of the discards, but for most devices what really works the best is a simple "just do one at the time". The size constraint was added to further limit the impact. Honestly, I think the only real improvement would be on the device side. Most folks want discard as an advisory hint, and it should not impact current workloads at all. In reality, many implementations are much more strict and even include substantial flash writes. For the cases where we can't just turn it off (I'd love to), we at least need to make it less intrusive. > I'm wondering if xfs could separate discard submission from log I/O > completion entirely and then perhaps limit/throttle submission somehow > or another (Christoph, thoughts?) via a background task. Perhaps doing > something like that may even eliminate the need for some discards on > busy filesystems with a lot of block free -> reallocation activity, but > I'm just handwaving atm. Just having the sync vs async option is the best split imho. The async could potentially be scheduled. I don't think more involved logic belongs in the fs.
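For illustration only, here is a minimal userspace sketch of the "chop up file deletion into smaller chunks" trick Jens describes above: truncate a large file down in bounded steps before unlinking it, so each step frees (and may discard) only a limited amount of space. The 1 GB step size and the file path are arbitrary values invented for this example, not anything from the thread.

/*
 * Sketch of incremental deletion: shrink the file in bounded steps so
 * the filesystem never frees (and possibly discards) more than
 * TRUNC_STEP bytes at once. Step size and path are illustrative.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

#define TRUNC_STEP	(1ULL << 30)	/* free at most 1 GB per step */

static int incremental_unlink(const char *path)
{
	struct stat st;
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return -1;
	}
	if (fstat(fd, &st) < 0) {
		perror("fstat");
		close(fd);
		return -1;
	}

	for (off_t size = st.st_size; size > 0; ) {
		size = size > (off_t)TRUNC_STEP ? size - TRUNC_STEP : 0;
		if (ftruncate(fd, size) < 0) {	/* frees one bounded chunk */
			perror("ftruncate");
			break;
		}
		fsync(fd);	/* let each chunk complete before the next */
	}

	close(fd);
	return unlink(path);
}

int main(void)
{
	return incremental_unlink("/data/big-file") ? 1 : 0;
}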
On Mon, Apr 30, 2018 at 12:07:31PM -0600, Jens Axboe wrote: > On 4/30/18 11:19 AM, Brian Foster wrote: > > On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote: > >> XFS recently added support for async discards. While this can be > >> a win for some workloads and devices, there are also cases where > >> async bursty discard will severly harm the latencies of reads > >> and writes. > >> > >> Add a 'discard_sync' mount flag to revert to using sync discard, > >> issuing them one at the time and waiting for each one. This fixes > >> a big performance regression we had moving to kernels that include > >> the XFS async discard support. > >> > >> Signed-off-by: Jens Axboe <axboe@kernel.dk> > >> --- > > > > Hm, I figured the async discard stuff would have been a pretty clear win > > all around, but then again I'm not terribly familiar with what happens > > with discards beneath the fs. I do know that the previous behavior would > > cause fs level latencies due to holding up log I/O completion while > > discards completed one at a time. My understanding is that this lead to > > online discard being pretty much universally "not recommended" in favor > > of fstrim. > > It's not a secret that most devices suck at discard. How can we know if a device sucks at discard? > While the async > discard is nifty and I bet works well for some cases, it can also cause > a flood of discards on the device side which does not work well for > other cases. Shouldn't async then be only enabled if the device used supports it well? Or should a blacklist per device be more suitable? Which is more popular? Luis -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 4/30/18 12:25 PM, Luis R. Rodriguez wrote: > On Mon, Apr 30, 2018 at 12:07:31PM -0600, Jens Axboe wrote: >> On 4/30/18 11:19 AM, Brian Foster wrote: >>> On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote: >>>> XFS recently added support for async discards. While this can be >>>> a win for some workloads and devices, there are also cases where >>>> async bursty discard will severly harm the latencies of reads >>>> and writes. >>>> >>>> Add a 'discard_sync' mount flag to revert to using sync discard, >>>> issuing them one at the time and waiting for each one. This fixes >>>> a big performance regression we had moving to kernels that include >>>> the XFS async discard support. >>>> >>>> Signed-off-by: Jens Axboe <axboe@kernel.dk> >>>> --- >>> >>> Hm, I figured the async discard stuff would have been a pretty clear win >>> all around, but then again I'm not terribly familiar with what happens >>> with discards beneath the fs. I do know that the previous behavior would >>> cause fs level latencies due to holding up log I/O completion while >>> discards completed one at a time. My understanding is that this lead to >>> online discard being pretty much universally "not recommended" in favor >>> of fstrim. >> >> It's not a secret that most devices suck at discard. > > How can we know if a device sucks at discard? This test usually works well - does it support discard? Then it probably sucks :-) But seriously, synthetic test case with reads/writes (the actual workload) and then mix in trims. If the performance suffers, then discards suck. Just consider this series - it was a 25% latency win. >> While the async >> discard is nifty and I bet works well for some cases, it can also cause >> a flood of discards on the device side which does not work well for >> other cases. > > Shouldn't async then be only enabled if the device used supports it well? > Or should a blacklist per device be more suitable? Which is more popular? You'd be left with nothing... My general recommendation is to not use discards at all, unless there's a proven case that it makes a write amplification difference for you - from at all, to "big enough that I'd suffer the latency consequences".
On Mon, Apr 30, 2018 at 12:07:31PM -0600, Jens Axboe wrote: > On 4/30/18 11:19 AM, Brian Foster wrote: > > On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote: > >> XFS recently added support for async discards. While this can be > >> a win for some workloads and devices, there are also cases where > >> async bursty discard will severly harm the latencies of reads > >> and writes. > >> > >> Add a 'discard_sync' mount flag to revert to using sync discard, > >> issuing them one at the time and waiting for each one. This fixes > >> a big performance regression we had moving to kernels that include > >> the XFS async discard support. > >> > >> Signed-off-by: Jens Axboe <axboe@kernel.dk> > >> --- > > > > Hm, I figured the async discard stuff would have been a pretty clear win > > all around, but then again I'm not terribly familiar with what happens > > with discards beneath the fs. I do know that the previous behavior would > > cause fs level latencies due to holding up log I/O completion while > > discards completed one at a time. My understanding is that this lead to > > online discard being pretty much universally "not recommended" in favor > > of fstrim. > > It's not a secret that most devices suck at discard. While the async > discard is nifty and I bet works well for some cases, it can also cause > a flood of discards on the device side which does not work well for > other cases. > Heh, Ok. > > Do you have any more data around the workload where the old sync discard > > behavior actually provides an overall win over the newer behavior? Is it > > purely a matter of workload, or is it a workload+device thing with how > > discards may be handled by certain devices? > > The worse read latencies were observed on more than one device type, > making it sync again was universally a win. We've had many issues > with discard, one trick that is often used is to chop up file deletion > into smaller chunks. Say you want to kill 10GB of data, you do it > incrementally, since 10G of discard usually doesn't behave very nicely. > If you make that async, then you're back to square one. > Makes sense, so there's not much win in chopping up huge discard ranges into smaller, async requests that cover the same overall size/range.. > > I'm ultimately not against doing this if it's useful for somebody and is > > otherwise buried under a mount option, but it would be nice to see if > > there's opportunity to improve the async mechanism before resorting to > > that. Is the issue here too large bio chains, too many chains at once, > > or just simply too many discards (regardless of chaining) at the same > > time? > > Well, ultimately you'd need better scheduling of the discards, but for > most devices what really works the best is a simple "just do one at > the time". The size constraint was added to further limit the impact. > ... but presumably there is some value in submitting some number of requests together provided they adhere to some size constraint..? Is there a typical size constraint for the average ssd, or is this value all over the place? (Is there a field somewhere in the bdev that the fs can query?) (I guess I'll defer to Christoph's input on this, I assume he measured some kind of improvement in the previous async work..) > Honestly, I think the only real improvement would be on the device > side. Most folks want discard as an advisory hint, and it should not > impact current workloads at all. In reality, many implementations > are much more strict and even include substantial flash writes. 
For > the cases where we can't just turn it off (I'd love to), we at least > need to make it less intrusive. > > > I'm wondering if xfs could separate discard submission from log I/O > > completion entirely and then perhaps limit/throttle submission somehow > > or another (Christoph, thoughts?) via a background task. Perhaps doing > > something like that may even eliminate the need for some discards on > > busy filesystems with a lot of block free -> reallocation activity, but > > I'm just handwaving atm. > > Just having the sync vs async option is the best split imho. The async > could potentially be scheduled. I don't think more involved logic > belongs in the fs. > The more interesting part to me is whether we can safely separate discard from log I/O completion in XFS. Then we can release the log buffer locks and whatnot and let the fs proceed without waiting on any number of discards to complete. In theory, I think the background task could issue discards one at a time (or N at a time, or N blocks at a time, whatever..) without putting us back in a place where discards hold up the log and subsequently lock up the rest of the fs. If that's possible, then the whole sync/async thing is more of an implementation detail and we have no need for separate mount options for users to try and grok. Brian > -- > Jens Axboe > > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
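To make Brian's suggestion concrete, here is a hand-wavy sketch (not actual XFS code) of what separating discard submission from log I/O completion might look like: completion only queues the extents and kicks a worker, and the worker issues discards synchronously, one at a time, without holding up the log. The struct, its fields, and the function names are all invented for illustration.

#include <linux/blkdev.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

struct discard_item {
	struct list_head	di_list;
	struct block_device	*di_bdev;
	sector_t		di_sector;
	sector_t		di_nr_sects;
};

static LIST_HEAD(discard_list);
static DEFINE_SPINLOCK(discard_lock);

static void discard_worker(struct work_struct *work)
{
	struct discard_item *item, *next;
	LIST_HEAD(batch);

	spin_lock(&discard_lock);
	list_splice_init(&discard_list, &batch);
	spin_unlock(&discard_lock);

	/* one discard in flight at a time; errors are advisory only */
	list_for_each_entry_safe(item, next, &batch, di_list) {
		blkdev_issue_discard(item->di_bdev, item->di_sector,
				     item->di_nr_sects, GFP_NOFS, 0);
		list_del(&item->di_list);
		kfree(item);
	}
}
static DECLARE_WORK(discard_work, discard_worker);

/* called from log I/O completion instead of issuing discards inline */
static void queue_discard(struct discard_item *item)
{
	spin_lock(&discard_lock);
	list_add_tail(&item->di_list, &discard_list);
	spin_unlock(&discard_lock);
	schedule_work(&discard_work);
}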
On 4/30/18 1:25 PM, Luis R. Rodriguez wrote: > On Mon, Apr 30, 2018 at 12:07:31PM -0600, Jens Axboe wrote: >> On 4/30/18 11:19 AM, Brian Foster wrote: >>> On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote: >>>> XFS recently added support for async discards. While this can be >>>> a win for some workloads and devices, there are also cases where >>>> async bursty discard will severly harm the latencies of reads >>>> and writes. >>>> >>>> Add a 'discard_sync' mount flag to revert to using sync discard, >>>> issuing them one at the time and waiting for each one. This fixes >>>> a big performance regression we had moving to kernels that include >>>> the XFS async discard support. >>>> >>>> Signed-off-by: Jens Axboe <axboe@kernel.dk> >>>> --- >>> >>> Hm, I figured the async discard stuff would have been a pretty clear win >>> all around, but then again I'm not terribly familiar with what happens >>> with discards beneath the fs. I do know that the previous behavior would >>> cause fs level latencies due to holding up log I/O completion while >>> discards completed one at a time. My understanding is that this lead to >>> online discard being pretty much universally "not recommended" in favor >>> of fstrim. >> >> It's not a secret that most devices suck at discard. > > How can we know if a device sucks at discard? I was going to ask the same thing. ;) "Meh, punt to the admin!" I'm having deja vu but can't remember why. Seems like this has come up before and we thought it should be a block device tunable, not pushed down from the filesystem. Is that possible? -Eric -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 4/30/18 1:19 PM, Eric Sandeen wrote: > > > On 4/30/18 1:25 PM, Luis R. Rodriguez wrote: >> On Mon, Apr 30, 2018 at 12:07:31PM -0600, Jens Axboe wrote: >>> On 4/30/18 11:19 AM, Brian Foster wrote: >>>> On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote: >>>>> XFS recently added support for async discards. While this can be >>>>> a win for some workloads and devices, there are also cases where >>>>> async bursty discard will severly harm the latencies of reads >>>>> and writes. >>>>> >>>>> Add a 'discard_sync' mount flag to revert to using sync discard, >>>>> issuing them one at the time and waiting for each one. This fixes >>>>> a big performance regression we had moving to kernels that include >>>>> the XFS async discard support. >>>>> >>>>> Signed-off-by: Jens Axboe <axboe@kernel.dk> >>>>> --- >>>> >>>> Hm, I figured the async discard stuff would have been a pretty clear win >>>> all around, but then again I'm not terribly familiar with what happens >>>> with discards beneath the fs. I do know that the previous behavior would >>>> cause fs level latencies due to holding up log I/O completion while >>>> discards completed one at a time. My understanding is that this lead to >>>> online discard being pretty much universally "not recommended" in favor >>>> of fstrim. >>> >>> It's not a secret that most devices suck at discard. >> >> How can we know if a device sucks at discard? > > I was going to ask the same thing. ;) "Meh, punt to the admin!" > > I'm having deja vu but can't remember why. Seems like this has come up > before and we thought it should be a block device tunable, not pushed down > from the filesystem. Is that possible? The problem is that it'll depend on the workload as well. The device in may laptop is fine with discard for my workload, which is very light. But if you are running RocksDB on it, and doing heavy compactions and deletes, it probably would not be.
On 4/30/18 2:21 PM, Jens Axboe wrote: > On 4/30/18 1:19 PM, Eric Sandeen wrote: >> >> >> On 4/30/18 1:25 PM, Luis R. Rodriguez wrote: >>> On Mon, Apr 30, 2018 at 12:07:31PM -0600, Jens Axboe wrote: >>>> On 4/30/18 11:19 AM, Brian Foster wrote: >>>>> On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote: >>>>>> XFS recently added support for async discards. While this can be >>>>>> a win for some workloads and devices, there are also cases where >>>>>> async bursty discard will severly harm the latencies of reads >>>>>> and writes. >>>>>> >>>>>> Add a 'discard_sync' mount flag to revert to using sync discard, >>>>>> issuing them one at the time and waiting for each one. This fixes >>>>>> a big performance regression we had moving to kernels that include >>>>>> the XFS async discard support. >>>>>> >>>>>> Signed-off-by: Jens Axboe <axboe@kernel.dk> >>>>>> --- >>>>> >>>>> Hm, I figured the async discard stuff would have been a pretty clear win >>>>> all around, but then again I'm not terribly familiar with what happens >>>>> with discards beneath the fs. I do know that the previous behavior would >>>>> cause fs level latencies due to holding up log I/O completion while >>>>> discards completed one at a time. My understanding is that this lead to >>>>> online discard being pretty much universally "not recommended" in favor >>>>> of fstrim. >>>> >>>> It's not a secret that most devices suck at discard. >>> >>> How can we know if a device sucks at discard? >> >> I was going to ask the same thing. ;) "Meh, punt to the admin!" >> >> I'm having deja vu but can't remember why. Seems like this has come up >> before and we thought it should be a block device tunable, not pushed down >> from the filesystem. Is that possible? > > The problem is that it'll depend on the workload as well. The device in > may laptop is fine with discard for my workload, which is very light. > But if you are running RocksDB on it, and doing heavy compactions and > deletes, it probably would not be. Ok, but I'm not sure how that precludes a block device tunable? You'd tune it for your workload, right? Or is the concern that it could only be for the entire block device, and perhaps different partitions have different workloads? Sorry, caveman filesystem guy doesn't completely understand block devices. -Eric -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 4/30/18 1:57 PM, Eric Sandeen wrote: > > > On 4/30/18 2:21 PM, Jens Axboe wrote: >> On 4/30/18 1:19 PM, Eric Sandeen wrote: >>> >>> >>> On 4/30/18 1:25 PM, Luis R. Rodriguez wrote: >>>> On Mon, Apr 30, 2018 at 12:07:31PM -0600, Jens Axboe wrote: >>>>> On 4/30/18 11:19 AM, Brian Foster wrote: >>>>>> On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote: >>>>>>> XFS recently added support for async discards. While this can be >>>>>>> a win for some workloads and devices, there are also cases where >>>>>>> async bursty discard will severly harm the latencies of reads >>>>>>> and writes. >>>>>>> >>>>>>> Add a 'discard_sync' mount flag to revert to using sync discard, >>>>>>> issuing them one at the time and waiting for each one. This fixes >>>>>>> a big performance regression we had moving to kernels that include >>>>>>> the XFS async discard support. >>>>>>> >>>>>>> Signed-off-by: Jens Axboe <axboe@kernel.dk> >>>>>>> --- >>>>>> >>>>>> Hm, I figured the async discard stuff would have been a pretty clear win >>>>>> all around, but then again I'm not terribly familiar with what happens >>>>>> with discards beneath the fs. I do know that the previous behavior would >>>>>> cause fs level latencies due to holding up log I/O completion while >>>>>> discards completed one at a time. My understanding is that this lead to >>>>>> online discard being pretty much universally "not recommended" in favor >>>>>> of fstrim. >>>>> >>>>> It's not a secret that most devices suck at discard. >>>> >>>> How can we know if a device sucks at discard? >>> >>> I was going to ask the same thing. ;) "Meh, punt to the admin!" >>> >>> I'm having deja vu but can't remember why. Seems like this has come up >>> before and we thought it should be a block device tunable, not pushed down >>> from the filesystem. Is that possible? >> >> The problem is that it'll depend on the workload as well. The device in >> may laptop is fine with discard for my workload, which is very light. >> But if you are running RocksDB on it, and doing heavy compactions and >> deletes, it probably would not be. > > Ok, but I'm not sure how that precludes a block device tunable? You'd tune > it for your workload, right? What kind of tunable are you thinking of? Right now we have one tunable, which is the max discard size. Patch #1 actually helps make this do what it should for sync discards, instead of just building a bio chain and submitting that all at once. > Or is the concern that it could only be for the entire block device, and > perhaps different partitions have different workloads? > > Sorry, caveman filesystem guy doesn't completely understand block devices. I just don't know what you are trying to tune :-)
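As an illustration of the one tunable Jens mentions (the per-queue maximum discard size, exposed via discard_max_bytes and backed by the queue's max_discard_sectors limit), here is a sketch of how a caller could issue a large range as a series of bounded, synchronous discards. The helper itself is invented; it is not an existing kernel function.

#include <linux/blkdev.h>
#include <linux/errno.h>

static int issue_discard_chunked(struct block_device *bdev,
				 sector_t sector, sector_t nr_sects)
{
	struct request_queue *q = bdev_get_queue(bdev);
	unsigned int max = q->limits.max_discard_sectors;
	int ret = 0;

	if (!max)
		return -EOPNOTSUPP;	/* device does not support discard */

	while (nr_sects && !ret) {
		sector_t len = min_t(sector_t, nr_sects, max);

		/* synchronous: returns only when this chunk has completed */
		ret = blkdev_issue_discard(bdev, sector, len, GFP_NOFS, 0);
		sector += len;
		nr_sects -= len;
	}
	return ret;
}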
On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
> XFS recently added support for async discards. While this can be
> a win for some workloads and devices, there are also cases where
> async bursty discard will severly harm the latencies of reads
> and writes.

FWIW, convention is to document the performance regression in the commit message, not leave the reader to guess at what it was....

Did anyone analyse the pattern of discards being issued to work out what pattern was worse for async vs sync discard? Is it lots of little discards, large extents being discarded, perhaps a problem with the request queue starving other IOs because we queue so many async discards in such a short time (which is the difference in behaviour vs the old code), or something else?

> Add a 'discard_sync' mount flag to revert to using sync discard,
> issuing them one at the time and waiting for each one. This fixes
> a big performance regression we had moving to kernels that include
> the XFS async discard support.

I'm not a fan of adding a mount option to work around bad, unpredictable performance due to a mount option we recommend you don't use because it results in bad, unpredictable performance.

Without any details of the discard pattern that results in problems I don't think we should be changing anything - adding an opaque, user-unfriendly mount option does nothing to address the underlying problem - it's just a hack to work around the symptoms being seen...

More details of the regression and the root cause analysis are needed, please.

Cheers,

Dave.
On 4/30/18 3:31 PM, Dave Chinner wrote:
> On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
>> XFS recently added support for async discards. While this can be
>> a win for some workloads and devices, there are also cases where
>> async bursty discard will severly harm the latencies of reads
>> and writes.
>
> FWIW, convention is it document the performance regression in the
> commit message, not leave the reader to guess at what it was....

Yeah I'll give you that, I can improve the commit message for sure.

> Did anyone analyse the pattern of discards being issued to work out
> what pattern was worse for async vs sync discard? is it lots of
> little discards, large extents being discarded, perhaps a problem
> with the request request queue starving other IOs because we queue
> so many async discards in such a short time (which is the difference
> in behaviour vs the old code), or something else?

What was observed was a big discard which would previously have gone down as smaller discards now going down as either one or many discards. Looking at the blktrace data, it's the difference between

discard 1 queue
discard 1 complete
discard 2 queue
discard 2 complete
[...]
discard n queue
discard n complete

which is now

discard 1 queue
discard 2 queue
[...]
discard n queue
[...]
discard 1 complete
discard 2 complete
[...]
discard n complete

Note that we set a max discard size of 64MB for most devices, since it's been shown to have less impact on latencies for the IO that jobs actually care about.

>> Add a 'discard_sync' mount flag to revert to using sync discard,
>> issuing them one at the time and waiting for each one. This fixes
>> a big performance regression we had moving to kernels that include
>> the XFS async discard support.
>
> I'm not a fan of adding a mount option to work around bad,
> unpredictable performance due to a mount option we recommend you
> don't use because it results in bad, unpredictable performance.

Oh I hear you, as I wrote in other replies, I don't generally recommend discard except for cases where it's been proven to be useful in terms of write amplification improvements. If we can avoid using it, we do. It's a tradeoff, and for some situations, the right decision is to use discards.

> Without any details of the discard pattern that results in problems
> I don't think we should be changing anything - adding an opaque,
> user-unfriendly mount option does nothing to address the underlying
> problem - it's just a hack to work around the symptoms being seen...
>
> More details of the regression and the root cause analysis is
> needed, please.

It brings back the same behavior as we had before, which performs better for us. It's preventing users of XFS+discard from upgrading, which is sad.
On Mon, Apr 30, 2018 at 03:42:11PM -0600, Jens Axboe wrote: > On 4/30/18 3:31 PM, Dave Chinner wrote: > > On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote: > >> XFS recently added support for async discards. While this can be > >> a win for some workloads and devices, there are also cases where > >> async bursty discard will severly harm the latencies of reads > >> and writes. > > > > FWIW, convention is it document the performance regression in the > > commit message, not leave the reader to guess at what it was.... > > Yeah I'll give you that, I can improve the commit message for sure. > > > Did anyone analyse the pattern of discards being issued to work out > > what pattern was worse for async vs sync discard? is it lots of > > little discards, large extents being discarded, perhaps a problem > > with the request request queue starving other IOs because we queue > > so many async discards in such a short time (which is the difference > > in behaviour vs the old code), or something else? > > What was observed was a big discard which would previously have > gone down as smaller discards now going down as either one or many > discards. Looking at the blktrace data, it's the difference between > > discard 1 queue > discard 1 complete > discatd 2 queue > discard 2 complete > [...] > discard n queue > discard n complete > > which is now > > discard 1 queue > discard 2 queue > [...] > discard n queue > [...] > discard 1 complete > discard 2 complete > [...] > discard n complete > > Note that we set a max discard size of 64MB for most devices, > since it's been shown to have less impact on latencies for > the IO that jobs actually care about. IOWs, this has nothing to do with XFS behaviour, and everything to do with suboptimal scheduling control for concurrent queued discards in the block layer.... XFS can issue discard requests of up to 8GB (on 4k block size filesystems), and how they are optimised is completely up to the block layer. blkdev_issue_discard() happens to be synchronous (for historical reasons) and that does not match our asynchronous log IO model. it's always been a cause of total filesystem stall problems for XFS because we can't free log space until all the pending discards have been issued. Hence we can see global filesystems stalls of *minutes* with synchronous discards on bad devices and can cause OOM and all sorts of other nasty cascading failures. Async discard dispatch solves this problem for XFS - discards no longer block forward journal progress, and we really don't want to go back to having that ticking timebomb in XFS. Handling concurrent discard requests in an optimal manner is not a filesystem problem - it's an IO scheduling problem. Essentially, we've exposed yet another limitation of the block/IO layer handling of discard requests in the linux storage stack - it does not have a configurable discard queue depth. I'd much prefer these problems get handled at the IO scheduling layer where there is visibulity of device capabilities and request queue state. i.e. we should be throttling async discards just like we can throttle read or write IO, especially given there are many devices that serialise discards at the device level (i.e. not queued device commands). This solves the problem for everyone and makes things much better for the future where hardware implementations are likely to get better and support more and more concurrency in discard operations. 
IMO, the way the high level code dispatches discard requests is irrelevant here - this is a problem to do with queue depth and IO scheduling/throttling. That's why I don't think adding permanent ABI changes to filesystems to work around this problem is the right solution.... > > I don't think we should be changing anything - adding an opaque, > > user-unfriendly mount option does nothing to address the underlying > > problem - it's just a hack to work around the symptoms being seen... > > > > More details of the regression and the root cause analysis is > > needed, please. > > It brings back the same behavior as we had before, which performs > better for us. It's preventing users of XFS+discard from upgrading, > which is sad. Yes, it does, but so would having the block layer to throttle device discard requests in flight to a queue depth of 1. And then we don't have to change XFS at all. Cheers, Dave.
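As a rough sketch of the "discard queue depth" throttle Dave is arguing for: cap the number of async discards in flight (here, one) by taking a semaphore before submission and releasing it from the bio completion handler. This is not an existing block-layer knob or interface, just an illustration of the idea under that assumption; the function names are invented.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/semaphore.h>

static DEFINE_SEMAPHORE(discard_depth);	/* effective queue depth of 1 */

static void throttled_discard_end_io(struct bio *bio)
{
	up(&discard_depth);		/* allow the next discard in */
	bio_put(bio);
}

static int submit_throttled_discard(struct block_device *bdev,
				    sector_t sector, sector_t nr_sects)
{
	struct bio *bio = NULL;
	int ret;

	down(&discard_depth);		/* wait for the previous discard */

	ret = __blkdev_issue_discard(bdev, sector, nr_sects, GFP_NOFS, 0, &bio);
	if (ret || !bio) {
		up(&discard_depth);
		return ret;
	}

	bio->bi_end_io = throttled_discard_end_io;
	submit_bio(bio);		/* completion releases the semaphore */
	return 0;
}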
On 4/30/18 4:28 PM, Dave Chinner wrote: > On Mon, Apr 30, 2018 at 03:42:11PM -0600, Jens Axboe wrote: >> On 4/30/18 3:31 PM, Dave Chinner wrote: >>> On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote: >>>> XFS recently added support for async discards. While this can be >>>> a win for some workloads and devices, there are also cases where >>>> async bursty discard will severly harm the latencies of reads >>>> and writes. >>> >>> FWIW, convention is it document the performance regression in the >>> commit message, not leave the reader to guess at what it was.... >> >> Yeah I'll give you that, I can improve the commit message for sure. >> >>> Did anyone analyse the pattern of discards being issued to work out >>> what pattern was worse for async vs sync discard? is it lots of >>> little discards, large extents being discarded, perhaps a problem >>> with the request request queue starving other IOs because we queue >>> so many async discards in such a short time (which is the difference >>> in behaviour vs the old code), or something else? >> >> What was observed was a big discard which would previously have >> gone down as smaller discards now going down as either one or many >> discards. Looking at the blktrace data, it's the difference between >> >> discard 1 queue >> discard 1 complete >> discatd 2 queue >> discard 2 complete >> [...] >> discard n queue >> discard n complete >> >> which is now >> >> discard 1 queue >> discard 2 queue >> [...] >> discard n queue >> [...] >> discard 1 complete >> discard 2 complete >> [...] >> discard n complete >> >> Note that we set a max discard size of 64MB for most devices, >> since it's been shown to have less impact on latencies for >> the IO that jobs actually care about. > > IOWs, this has nothing to do with XFS behaviour, and everything to > do with suboptimal scheduling control for concurrent queued discards > in the block layer.... You could argue that, and it's fallout from XFS being the first user of async discard. Prior to that, we've never had that use case. I'm quite fine making all discards throttle to depth 1. > XFS can issue discard requests of up to 8GB (on 4k block size > filesystems), and how they are optimised is completely up to the > block layer. blkdev_issue_discard() happens to be synchronous (for > historical reasons) and that does not match our asynchronous log IO > model. it's always been a cause of total filesystem stall problems > for XFS because we can't free log space until all the pending > discards have been issued. Hence we can see global filesystems > stalls of *minutes* with synchronous discards on bad devices and can > cause OOM and all sorts of other nasty cascading failures. > > Async discard dispatch solves this problem for XFS - discards no > longer block forward journal progress, and we really don't want to > go back to having that ticking timebomb in XFS. Handling concurrent > discard requests in an optimal manner is not a filesystem problem - > it's an IO scheduling problem. > > > Essentially, we've exposed yet another limitation of the block/IO > layer handling of discard requests in the linux storage stack - it > does not have a configurable discard queue depth. > > I'd much prefer these problems get handled at the IO scheduling > layer where there is visibulity of device capabilities and request > queue state. i.e. we should be throttling async discards just like > we can throttle read or write IO, especially given there are many > devices that serialise discards at the device level (i.e. 
not queued > device commands). This solves the problem for everyone and makes > things much better for the future where hardware implementations are > likely to get better and support more and more concurrency in > discard operations. > > IMO, the way the high level code dispatches discard requests is > irrelevant here - this is a problem to do with queue depth and IO > scheduling/throttling. That's why I don't think adding permanent ABI > changes to filesystems to work around this problem is the right > solution.... If you get off your high horse for a bit, this is essentially a performance regression. I can either migrate folks off of XFS, or I can come up with something that works for them. It's pretty easy to claim this is "yet another limitation of the IO stack", but it's really not fair to make crazy claims like that when it's an entirely new use case. Let's try to state things objectively and fairly. This work should probably have been done before making XFS discard async. This isn't the first fallout we've had from that code. >>> I don't think we should be changing anything - adding an opaque, >>> user-unfriendly mount option does nothing to address the underlying >>> problem - it's just a hack to work around the symptoms being seen... >>> >>> More details of the regression and the root cause analysis is >>> needed, please. >> >> It brings back the same behavior as we had before, which performs >> better for us. It's preventing users of XFS+discard from upgrading, >> which is sad. > > Yes, it does, but so would having the block layer to throttle device > discard requests in flight to a queue depth of 1. And then we don't > have to change XFS at all. I'm perfectly fine with making that change by default, and much easier for me since I don't have to patch file systems.
On 4/30/18 2:58 PM, Jens Axboe wrote: > On 4/30/18 1:57 PM, Eric Sandeen wrote: >> On 4/30/18 2:21 PM, Jens Axboe wrote: ... >> Ok, but I'm not sure how that precludes a block device tunable? You'd tune >> it for your workload, right? > > What kind of tunable are you thinking of? Right now we have one tunable, > which is the max discard size. Patch #1 actually helps make this do what > it should for sync discards, instead of just building a bio chain and > submitting that all at once. > >> Or is the concern that it could only be for the entire block device, and >> perhaps different partitions have different workloads? >> >> Sorry, caveman filesystem guy doesn't completely understand block devices. > > I just don't know what you are trying to tune :-) I'm wondering if "make discards synchronous" couldn't just be a block device tunable, rather than a filesystem mount option tunable. -Eric -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Apr 30, 2018 at 04:40:04PM -0600, Jens Axboe wrote: > On 4/30/18 4:28 PM, Dave Chinner wrote: > > On Mon, Apr 30, 2018 at 03:42:11PM -0600, Jens Axboe wrote: > >> On 4/30/18 3:31 PM, Dave Chinner wrote: > >>> On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote: > >>>> XFS recently added support for async discards. While this can be > >>>> a win for some workloads and devices, there are also cases where > >>>> async bursty discard will severly harm the latencies of reads > >>>> and writes. > >>> > >>> FWIW, convention is it document the performance regression in the > >>> commit message, not leave the reader to guess at what it was.... > >> > >> Yeah I'll give you that, I can improve the commit message for sure. > >> > >>> Did anyone analyse the pattern of discards being issued to work out > >>> what pattern was worse for async vs sync discard? is it lots of > >>> little discards, large extents being discarded, perhaps a problem > >>> with the request request queue starving other IOs because we queue > >>> so many async discards in such a short time (which is the difference > >>> in behaviour vs the old code), or something else? > >> > >> What was observed was a big discard which would previously have > >> gone down as smaller discards now going down as either one or many > >> discards. Looking at the blktrace data, it's the difference between > >> > >> discard 1 queue > >> discard 1 complete > >> discatd 2 queue > >> discard 2 complete > >> [...] > >> discard n queue > >> discard n complete > >> > >> which is now > >> > >> discard 1 queue > >> discard 2 queue > >> [...] > >> discard n queue > >> [...] > >> discard 1 complete > >> discard 2 complete > >> [...] > >> discard n complete > >> > >> Note that we set a max discard size of 64MB for most devices, > >> since it's been shown to have less impact on latencies for > >> the IO that jobs actually care about. Ok, so as I see it the problem here is that discards are not some instantaneous command, but instead have some cost that is (probably) less than (say) a full write of the same quantity of zeroes. Previously we'd feed the block layer one discard io at a time, but now we batch them up and send multiple large discard requests at the same time. The discards then tie up the device for long periods of time. We'd get the same result if we sent gigabytes of write commands simultaneously too, right? Wouldn't the same problem would happen if say 200 threads were each issuing discards one by one because there's just too many bytes in flight at a time? So what I'm asking here is, can we throttle discard IOs so that no single issuer can monopolize a device? As a stupid example, if program A is sending 200MB of reads, program B is sending 200MB of writes, and xfs is sending 200MB of discards at the same time, can we throttle all three so that they each get roughly a third of the queue at a time? This way XFS can still send huge batches of discard IOs asynchronously, and it's fine enough if some of those have to wait longer in order to avoid starving other things. (Yes, I imagine you could serialize discards and run them one at a time, but that seems a little suboptimal...) > > > > IOWs, this has nothing to do with XFS behaviour, and everything to > > do with suboptimal scheduling control for concurrent queued discards > > in the block layer.... > > You could argue that, and it's fallout from XFS being the first > user of async discard. Prior to that, we've never had that use > case. 
I'm quite fine making all discards throttle to depth > 1. > > > XFS can issue discard requests of up to 8GB (on 4k block size > > filesystems), and how they are optimised is completely up to the > > block layer. blkdev_issue_discard() happens to be synchronous (for > > historical reasons) and that does not match our asynchronous log IO > > model. it's always been a cause of total filesystem stall problems > > for XFS because we can't free log space until all the pending > > discards have been issued. Hence we can see global filesystems > > stalls of *minutes* with synchronous discards on bad devices and can > > cause OOM and all sorts of other nasty cascading failures. > > > > Async discard dispatch solves this problem for XFS - discards no > > longer block forward journal progress, and we really don't want to > > go back to having that ticking timebomb in XFS. Handling concurrent > > discard requests in an optimal manner is not a filesystem problem - > > it's an IO scheduling problem. > > > > > > Essentially, we've exposed yet another limitation of the block/IO > > layer handling of discard requests in the linux storage stack - it > > does not have a configurable discard queue depth. > > > > I'd much prefer these problems get handled at the IO scheduling > > layer where there is visibulity of device capabilities and request > > queue state. i.e. we should be throttling async discards just like > > we can throttle read or write IO, especially given there are many > > devices that serialise discards at the device level (i.e. not queued > > device commands). This solves the problem for everyone and makes > > things much better for the future where hardware implementations are > > likely to get better and support more and more concurrency in > > discard operations. > > > > IMO, the way the high level code dispatches discard requests is > > irrelevant here - this is a problem to do with queue depth and IO > > scheduling/throttling. That's why I don't think adding permanent ABI > > changes to filesystems to work around this problem is the right > > solution.... > > If you get off your high horse for a bit, this is essentially > a performance regression. I can either migrate folks off of XFS, or > I can come up with something that works for them. It's pretty > easy to claim this is "yet another limitation of the IO stack", > but it's really not fair to make crazy claims like that when it's > an entirely new use case. Let's try to state things objectively > and fairly. > > This work should probably have been done before making XFS > discard async. This isn't the first fallout we've had from that > code. > > >>> I don't think we should be changing anything - adding an opaque, > >>> user-unfriendly mount option does nothing to address the underlying > >>> problem - it's just a hack to work around the symptoms being seen... > >>> > >>> More details of the regression and the root cause analysis is > >>> needed, please. > >> > >> It brings back the same behavior as we had before, which performs > >> better for us. It's preventing users of XFS+discard from upgrading, > >> which is sad. > > > > Yes, it does, but so would having the block layer to throttle device > > discard requests in flight to a queue depth of 1. And then we don't > > have to change XFS at all. > > I'm perfectly fine with making that change by default, and much easier > for me since I don't have to patch file systems. ...and I prefer not to add mount options, fwiw. 
--D > -- > Jens Axboe > > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 4/30/18 4:59 PM, Eric Sandeen wrote: > On 4/30/18 2:58 PM, Jens Axboe wrote: >> On 4/30/18 1:57 PM, Eric Sandeen wrote: >>> On 4/30/18 2:21 PM, Jens Axboe wrote: > > ... > >>> Ok, but I'm not sure how that precludes a block device tunable? You'd tune >>> it for your workload, right? >> >> What kind of tunable are you thinking of? Right now we have one tunable, >> which is the max discard size. Patch #1 actually helps make this do what >> it should for sync discards, instead of just building a bio chain and >> submitting that all at once. >> >>> Or is the concern that it could only be for the entire block device, and >>> perhaps different partitions have different workloads? >>> >>> Sorry, caveman filesystem guy doesn't completely understand block devices. >> >> I just don't know what you are trying to tune :-) > > I'm wondering if "make discards synchronous" couldn't just be a block device > tunable, rather than a filesystem mount option tunable. See e-mail I just sent out. With that, you get that tunable bundled with the discard_max_bytes tunable. If you leave it at the default, which is generally very high, then you get huge discards. If you tune it down, they become basically sync. I don't think we need a separate tunable for this. It's still per-caller, which is good enough for us. I don't want to make it more complicated until people show up with a valid use case that shows we need it strictly throttled per-device.
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 4668403b1741..4eced3eceea5 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -552,10 +552,16 @@ xlog_discard_busy_extents(
 	struct bio		*bio = NULL;
 	struct blk_plug		plug;
 	int			error = 0;
+	int			discard_flags = 0;
+	bool			sync_discard = mp->m_flags & XFS_MOUNT_DISCARD_SYNC;
 
 	ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
 
-	blk_start_plug(&plug);
+	if (!sync_discard)
+		blk_start_plug(&plug);
+	else
+		discard_flags = BLKDEV_DISCARD_SYNC;
+
 	list_for_each_entry(busyp, list, list) {
 		trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
 					 busyp->length);
@@ -563,7 +569,7 @@ xlog_discard_busy_extents(
 		error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
 				XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
 				XFS_FSB_TO_BB(mp, busyp->length),
-				GFP_NOFS, 0, &bio);
+				GFP_NOFS, discard_flags, &bio);
 		if (error && error != -EOPNOTSUPP) {
 			xfs_info(mp,
 	 "discard failed for extent [0x%llx,%u], error %d",
@@ -574,14 +580,18 @@ xlog_discard_busy_extents(
 		}
 	}
 
-	if (bio) {
+	if (sync_discard) {
+		xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
+		kmem_free(ctx);
+	} else if (bio) {
 		bio->bi_private = ctx;
 		bio->bi_end_io = xlog_discard_endio;
 		submit_bio(bio);
+		blk_finish_plug(&plug);
 	} else {
 		xlog_discard_endio_work(&ctx->discard_endio_work);
+		blk_finish_plug(&plug);
 	}
-	blk_finish_plug(&plug);
 }
 
 /*
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 10b90bbc5162..3f0b7b9106c7 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -219,6 +219,7 @@ typedef struct xfs_mount {
 						   operations, typically for
 						   disk errors in metadata */
 #define XFS_MOUNT_DISCARD	(1ULL << 5)	/* discard unused blocks */
+#define XFS_MOUNT_DISCARD_SYNC	(1ULL << 6)	/* discard unused blocks sync */
 #define XFS_MOUNT_NOALIGN	(1ULL << 7)	/* turn off stripe alignment
 						   allocations */
 #define XFS_MOUNT_ATTR2		(1ULL << 8)	/* allow use of attr2 format */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index d71424052917..6d960bb4725f 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -83,7 +83,7 @@ enum {
 	Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota,
 	Opt_uquota, Opt_gquota, Opt_pquota,
 	Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
-	Opt_discard, Opt_nodiscard, Opt_dax, Opt_err,
+	Opt_discard, Opt_nodiscard, Opt_dax, Opt_err, Opt_discard_sync,
 };
 
 static const match_table_t tokens = {
@@ -129,6 +129,7 @@ static const match_table_t tokens = {
 	{Opt_pqnoenforce,"pqnoenforce"},/* project quota limit enforcement */
 	{Opt_qnoenforce, "qnoenforce"},	/* same as uqnoenforce */
 	{Opt_discard,	"discard"},	/* Discard unused blocks */
+	{Opt_discard_sync,"discard_sync"},/* Discard unused blocks sync */
 	{Opt_nodiscard,	"nodiscard"},	/* Do not discard unused blocks */
 
 	{Opt_dax,	"dax"},		/* Enable direct access to bdev pages */
@@ -363,11 +364,15 @@ xfs_parseargs(
 			mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
 			mp->m_qflags &= ~XFS_GQUOTA_ENFD;
 			break;
+		case Opt_discard_sync:
+			mp->m_flags |= XFS_MOUNT_DISCARD_SYNC;
+			/* fall through and set XFS_MOUNT_DISCARD too */
 		case Opt_discard:
 			mp->m_flags |= XFS_MOUNT_DISCARD;
 			break;
 		case Opt_nodiscard:
 			mp->m_flags &= ~XFS_MOUNT_DISCARD;
+			mp->m_flags &= ~XFS_MOUNT_DISCARD_SYNC;
 			break;
 #ifdef CONFIG_FS_DAX
 		case Opt_dax:
XFS recently added support for async discards. While this can be a win
for some workloads and devices, there are also cases where async bursty
discard will severely harm the latencies of reads and writes.

Add a 'discard_sync' mount flag to revert to using sync discard,
issuing them one at a time and waiting for each one. This fixes a big
performance regression we had moving to kernels that include the XFS
async discard support.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/xfs/xfs_log_cil.c | 18 ++++++++++++++----
 fs/xfs/xfs_mount.h   |  1 +
 fs/xfs/xfs_super.c   |  7 ++++++-
 3 files changed, 21 insertions(+), 5 deletions(-)