
[2/2] xfs: add 'discard_sync' mount flag

Message ID 1525102372-8430-3-git-send-email-axboe@kernel.dk (mailing list archive)
State Superseded, archived

Commit Message

Jens Axboe April 30, 2018, 3:32 p.m. UTC
XFS recently added support for async discards. While this can be
a win for some workloads and devices, there are also cases where
bursty async discards will severely harm the latencies of reads
and writes.

Add a 'discard_sync' mount flag to revert to using sync discard,
issuing them one at a time and waiting for each one. This fixes
a big performance regression we had when moving to kernels that include
the XFS async discard support.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/xfs/xfs_log_cil.c | 18 ++++++++++++++----
 fs/xfs/xfs_mount.h   |  1 +
 fs/xfs/xfs_super.c   |  7 ++++++-
 3 files changed, 21 insertions(+), 5 deletions(-)

Comments

Brian Foster April 30, 2018, 5:19 p.m. UTC | #1
On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
> XFS recently added support for async discards. While this can be
> a win for some workloads and devices, there are also cases where
> async bursty discard will severly harm the latencies of reads
> and writes.
> 
> Add a 'discard_sync' mount flag to revert to using sync discard,
> issuing them one at the time and waiting for each one. This fixes
> a big performance regression we had moving to kernels that include
> the XFS async discard support.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---

Hm, I figured the async discard stuff would have been a pretty clear win
all around, but then again I'm not terribly familiar with what happens
with discards beneath the fs. I do know that the previous behavior would
cause fs level latencies due to holding up log I/O completion while
discards completed one at a time. My understanding is that this led to
online discard being pretty much universally "not recommended" in favor
of fstrim.

Do you have any more data around the workload where the old sync discard
behavior actually provides an overall win over the newer behavior? Is it
purely a matter of workload, or is it a workload+device thing with how
discards may be handled by certain devices?

I'm ultimately not against doing this if it's useful for somebody and is
otherwise buried under a mount option, but it would be nice to see if
there's opportunity to improve the async mechanism before resorting to
that. Is the issue here too large bio chains, too many chains at once,
or just simply too many discards (regardless of chaining) at the same
time?

I'm wondering if xfs could separate discard submission from log I/O
completion entirely and then perhaps limit/throttle submission somehow
or another (Christoph, thoughts?) via a background task. Perhaps doing
something like that may even eliminate the need for some discards on
busy filesystems with a lot of block free -> reallocation activity, but
I'm just handwaving atm.
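
As a purely illustrative sketch of that kind of background task (nothing
here is existing code - the structures, names, and queueing are
hypothetical, and busy-extent/log handling is waved away):

#include <linux/workqueue.h>
#include <linux/blkdev.h>
#include <linux/list.h>
#include <linux/slab.h>

/* Hypothetical container for a batch of extents handed off at log I/O
 * completion; a real version would reuse the existing busy extent lists. */
struct bg_discard_work {
	struct work_struct	work;
	struct block_device	*bdev;
	struct list_head	extents;	/* of struct bg_discard_extent */
};

struct bg_discard_extent {
	struct list_head	list;
	sector_t		sector;
	sector_t		nr_sects;
};

static void bg_discard_worker(struct work_struct *work)
{
	struct bg_discard_work *bw =
		container_of(work, struct bg_discard_work, work);
	struct bg_discard_extent *ext, *n;

	list_for_each_entry_safe(ext, n, &bw->extents, list) {
		/* issue and wait for one discard at a time, so the
		 * device never sees a burst of queued discards */
		blkdev_issue_discard(bw->bdev, ext->sector, ext->nr_sects,
				     GFP_NOFS, 0);
		list_del(&ext->list);
		kfree(ext);
	}
	/* busy extent clearing / log space release would go here */
	kfree(bw);
}

/* Log I/O completion would then just do
 * INIT_WORK(&bw->work, bg_discard_worker);
 * queue_work(system_unbound_wq, &bw->work); and move on. */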

Brian

>  fs/xfs/xfs_log_cil.c | 18 ++++++++++++++----
>  fs/xfs/xfs_mount.h   |  1 +
>  fs/xfs/xfs_super.c   |  7 ++++++-
>  3 files changed, 21 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
> index 4668403b1741..4eced3eceea5 100644
> --- a/fs/xfs/xfs_log_cil.c
> +++ b/fs/xfs/xfs_log_cil.c
> @@ -552,10 +552,16 @@ xlog_discard_busy_extents(
>  	struct bio		*bio = NULL;
>  	struct blk_plug		plug;
>  	int			error = 0;
> +	int			discard_flags = 0;
> +	bool			sync_discard = mp->m_flags & XFS_MOUNT_DISCARD_SYNC;
>  
>  	ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
>  
> -	blk_start_plug(&plug);
> +	if (!sync_discard)
> +		blk_start_plug(&plug);
> +	else
> +		discard_flags = BLKDEV_DISCARD_SYNC;
> +
>  	list_for_each_entry(busyp, list, list) {
>  		trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
>  					 busyp->length);
> @@ -563,7 +569,7 @@ xlog_discard_busy_extents(
>  		error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
>  				XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
>  				XFS_FSB_TO_BB(mp, busyp->length),
> -				GFP_NOFS, 0, &bio);
> +				GFP_NOFS, discard_flags, &bio);
>  		if (error && error != -EOPNOTSUPP) {
>  			xfs_info(mp,
>  	 "discard failed for extent [0x%llx,%u], error %d",
> @@ -574,14 +580,18 @@ xlog_discard_busy_extents(
>  		}
>  	}
>  
> -	if (bio) {
> +	if (sync_discard) {
> +		xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
> +		kmem_free(ctx);
> +	} else if (bio) {
>  		bio->bi_private = ctx;
>  		bio->bi_end_io = xlog_discard_endio;
>  		submit_bio(bio);
> +		blk_finish_plug(&plug);
>  	} else {
>  		xlog_discard_endio_work(&ctx->discard_endio_work);
> +		blk_finish_plug(&plug);
>  	}
> -	blk_finish_plug(&plug);
>  }
>  
>  /*
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 10b90bbc5162..3f0b7b9106c7 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -219,6 +219,7 @@ typedef struct xfs_mount {
>  						   operations, typically for
>  						   disk errors in metadata */
>  #define XFS_MOUNT_DISCARD	(1ULL << 5)	/* discard unused blocks */
> +#define XFS_MOUNT_DISCARD_SYNC	(1ULL << 6)	/* discard unused blocks sync */
>  #define XFS_MOUNT_NOALIGN	(1ULL << 7)	/* turn off stripe alignment
>  						   allocations */
>  #define XFS_MOUNT_ATTR2		(1ULL << 8)	/* allow use of attr2 format */
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index d71424052917..6d960bb4725f 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -83,7 +83,7 @@ enum {
>  	Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota,
>  	Opt_uquota, Opt_gquota, Opt_pquota,
>  	Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
> -	Opt_discard, Opt_nodiscard, Opt_dax, Opt_err,
> +	Opt_discard, Opt_nodiscard, Opt_dax, Opt_err, Opt_discard_sync,
>  };
>  
>  static const match_table_t tokens = {
> @@ -129,6 +129,7 @@ static const match_table_t tokens = {
>  	{Opt_pqnoenforce,"pqnoenforce"},/* project quota limit enforcement */
>  	{Opt_qnoenforce, "qnoenforce"},	/* same as uqnoenforce */
>  	{Opt_discard,	"discard"},	/* Discard unused blocks */
> +	{Opt_discard_sync,"discard_sync"},/* Discard unused blocks sync */
>  	{Opt_nodiscard,	"nodiscard"},	/* Do not discard unused blocks */
>  
>  	{Opt_dax,	"dax"},		/* Enable direct access to bdev pages */
> @@ -363,11 +364,15 @@ xfs_parseargs(
>  			mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
>  			mp->m_qflags &= ~XFS_GQUOTA_ENFD;
>  			break;
> +		case Opt_discard_sync:
> +			mp->m_flags |= XFS_MOUNT_DISCARD_SYNC;
> +			/* fall through and set XFS_MOUNT_DISCARD too */
>  		case Opt_discard:
>  			mp->m_flags |= XFS_MOUNT_DISCARD;
>  			break;
>  		case Opt_nodiscard:
>  			mp->m_flags &= ~XFS_MOUNT_DISCARD;
> +			mp->m_flags &= ~XFS_MOUNT_DISCARD_SYNC;
>  			break;
>  #ifdef CONFIG_FS_DAX
>  		case Opt_dax:
> -- 
> 2.7.4
> 
Jens Axboe April 30, 2018, 6:07 p.m. UTC | #2
On 4/30/18 11:19 AM, Brian Foster wrote:
> On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
>> XFS recently added support for async discards. While this can be
>> a win for some workloads and devices, there are also cases where
>> async bursty discard will severly harm the latencies of reads
>> and writes.
>>
>> Add a 'discard_sync' mount flag to revert to using sync discard,
>> issuing them one at the time and waiting for each one. This fixes
>> a big performance regression we had moving to kernels that include
>> the XFS async discard support.
>>
>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>> ---
> 
> Hm, I figured the async discard stuff would have been a pretty clear win
> all around, but then again I'm not terribly familiar with what happens
> with discards beneath the fs. I do know that the previous behavior would
> cause fs level latencies due to holding up log I/O completion while
> discards completed one at a time. My understanding is that this lead to
> online discard being pretty much universally "not recommended" in favor
> of fstrim.

It's not a secret that most devices suck at discard. While the async
discard is nifty and I bet works well for some cases, it can also cause
a flood of discards on the device side which does not work well for
other cases.

> Do you have any more data around the workload where the old sync discard
> behavior actually provides an overall win over the newer behavior? Is it
> purely a matter of workload, or is it a workload+device thing with how
> discards may be handled by certain devices?

The worse read latencies were observed on more than one device type;
making it sync again was universally a win. We've had many issues
with discard. One trick that is often used is to chop up file deletion
into smaller chunks: say you want to kill 10GB of data, you do it
incrementally, since 10GB of discards usually doesn't behave very nicely.
If you make that async, then you're back to square one.
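
For illustration, the "chop it up" trick above might look something like
the following userspace sketch; the chunk size, the fdatasync pacing, and
the chunked_delete() helper are all invented for the example:

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <linux/falloc.h>

#define DEL_CHUNK	(64ULL << 20)	/* 64MB per step, arbitrary */

/* Hypothetical helper: free a big file's blocks one chunk at a time
 * (each punch lets the fs free/discard a bounded range), then unlink. */
static int chunked_delete(const char *path)
{
	struct stat st;
	off_t off;
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (fstat(fd, &st) < 0) {
		close(fd);
		return -1;
	}
	for (off = 0; off < st.st_size; off += DEL_CHUNK) {
		if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			      off, DEL_CHUNK) < 0)
			break;
		fdatasync(fd);	/* pace the frees instead of one big burst */
	}
	close(fd);
	return unlink(path);
}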

> I'm ultimately not against doing this if it's useful for somebody and is
> otherwise buried under a mount option, but it would be nice to see if
> there's opportunity to improve the async mechanism before resorting to
> that. Is the issue here too large bio chains, too many chains at once,
> or just simply too many discards (regardless of chaining) at the same
> time?

Well, ultimately you'd need better scheduling of the discards, but for
most devices what really works best is a simple "just do one at
a time". The size constraint was added to further limit the impact.

Honestly, I think the only real improvement would be on the device
side. Most folks want discard as an advisory hint, and it should not
impact current workloads at all. In reality, many implementations
are much more strict and even include substantial flash writes. For
the cases where we can't just turn it off (I'd love to), we at least
need to make it less intrusive.

> I'm wondering if xfs could separate discard submission from log I/O
> completion entirely and then perhaps limit/throttle submission somehow
> or another (Christoph, thoughts?) via a background task. Perhaps doing
> something like that may even eliminate the need for some discards on
> busy filesystems with a lot of block free -> reallocation activity, but
> I'm just handwaving atm.

Just having the sync vs async option is the best split imho. The async
could potentially be scheduled. I don't think more involved logic
belongs in the fs.
Luis Chamberlain April 30, 2018, 6:25 p.m. UTC | #3
On Mon, Apr 30, 2018 at 12:07:31PM -0600, Jens Axboe wrote:
> On 4/30/18 11:19 AM, Brian Foster wrote:
> > On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
> >> XFS recently added support for async discards. While this can be
> >> a win for some workloads and devices, there are also cases where
> >> async bursty discard will severly harm the latencies of reads
> >> and writes.
> >>
> >> Add a 'discard_sync' mount flag to revert to using sync discard,
> >> issuing them one at the time and waiting for each one. This fixes
> >> a big performance regression we had moving to kernels that include
> >> the XFS async discard support.
> >>
> >> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> >> ---
> > 
> > Hm, I figured the async discard stuff would have been a pretty clear win
> > all around, but then again I'm not terribly familiar with what happens
> > with discards beneath the fs. I do know that the previous behavior would
> > cause fs level latencies due to holding up log I/O completion while
> > discards completed one at a time. My understanding is that this lead to
> > online discard being pretty much universally "not recommended" in favor
> > of fstrim.
> 
> It's not a secret that most devices suck at discard.

How can we know if a device sucks at discard?

> While the async
> discard is nifty and I bet works well for some cases, it can also cause
> a flood of discards on the device side which does not work well for
> other cases.

Shouldn't async then be only enabled if the device used supports it well?
Or should a blacklist per device be more suitable? Which is more popular?

  Luis
Jens Axboe April 30, 2018, 6:31 p.m. UTC | #4
On 4/30/18 12:25 PM, Luis R. Rodriguez wrote:
> On Mon, Apr 30, 2018 at 12:07:31PM -0600, Jens Axboe wrote:
>> On 4/30/18 11:19 AM, Brian Foster wrote:
>>> On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
>>>> XFS recently added support for async discards. While this can be
>>>> a win for some workloads and devices, there are also cases where
>>>> async bursty discard will severly harm the latencies of reads
>>>> and writes.
>>>>
>>>> Add a 'discard_sync' mount flag to revert to using sync discard,
>>>> issuing them one at the time and waiting for each one. This fixes
>>>> a big performance regression we had moving to kernels that include
>>>> the XFS async discard support.
>>>>
>>>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>>> ---
>>>
>>> Hm, I figured the async discard stuff would have been a pretty clear win
>>> all around, but then again I'm not terribly familiar with what happens
>>> with discards beneath the fs. I do know that the previous behavior would
>>> cause fs level latencies due to holding up log I/O completion while
>>> discards completed one at a time. My understanding is that this lead to
>>> online discard being pretty much universally "not recommended" in favor
>>> of fstrim.
>>
>> It's not a secret that most devices suck at discard.
> 
> How can we know if a device sucks at discard?

This test usually works well - does it support discard? Then it probably
sucks :-)

But seriously: run a synthetic test case with reads/writes (the actual workload)
and then mix in trims. If the performance suffers, then discards suck.
Just consider this series - it was a 25% latency win.

>> While the async
>> discard is nifty and I bet works well for some cases, it can also cause
>> a flood of discards on the device side which does not work well for
>> other cases.
> 
> Shouldn't async then be only enabled if the device used supports it well?
> Or should a blacklist per device be more suitable? Which is more popular?

You'd be left with nothing... My general recommendation is to not use
discards at all, unless there's a proven case that it makes a write
amplification difference for you - anywhere from "at all" to "big enough
that I'd suffer the latency consequences".
Brian Foster April 30, 2018, 7:18 p.m. UTC | #5
On Mon, Apr 30, 2018 at 12:07:31PM -0600, Jens Axboe wrote:
> On 4/30/18 11:19 AM, Brian Foster wrote:
> > On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
> >> XFS recently added support for async discards. While this can be
> >> a win for some workloads and devices, there are also cases where
> >> async bursty discard will severly harm the latencies of reads
> >> and writes.
> >>
> >> Add a 'discard_sync' mount flag to revert to using sync discard,
> >> issuing them one at the time and waiting for each one. This fixes
> >> a big performance regression we had moving to kernels that include
> >> the XFS async discard support.
> >>
> >> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> >> ---
> > 
> > Hm, I figured the async discard stuff would have been a pretty clear win
> > all around, but then again I'm not terribly familiar with what happens
> > with discards beneath the fs. I do know that the previous behavior would
> > cause fs level latencies due to holding up log I/O completion while
> > discards completed one at a time. My understanding is that this lead to
> > online discard being pretty much universally "not recommended" in favor
> > of fstrim.
> 
> It's not a secret that most devices suck at discard. While the async
> discard is nifty and I bet works well for some cases, it can also cause
> a flood of discards on the device side which does not work well for
> other cases.
> 

Heh, Ok.

> > Do you have any more data around the workload where the old sync discard
> > behavior actually provides an overall win over the newer behavior? Is it
> > purely a matter of workload, or is it a workload+device thing with how
> > discards may be handled by certain devices?
> 
> The worse read latencies were observed on more than one device type,
> making it sync again was universally a win. We've had many issues
> with discard, one trick that is often used is to chop up file deletion
> into smaller chunks. Say you want to kill 10GB of data, you do it
> incrementally, since 10G of discard usually doesn't behave very nicely.
> If you make that async, then you're back to square one.
> 

Makes sense, so there's not much win in chopping up huge discard ranges
into smaller, async requests that cover the same overall size/range..

> > I'm ultimately not against doing this if it's useful for somebody and is
> > otherwise buried under a mount option, but it would be nice to see if
> > there's opportunity to improve the async mechanism before resorting to
> > that. Is the issue here too large bio chains, too many chains at once,
> > or just simply too many discards (regardless of chaining) at the same
> > time?
> 
> Well, ultimately you'd need better scheduling of the discards, but for
> most devices what really works the best is a simple "just do one at
> the time". The size constraint was added to further limit the impact.
> 

... but presumably there is some value in submitting some number of
requests together provided they adhere to some size constraint..? Is
there a typical size constraint for the average ssd, or is this value
all over the place? (Is there a field somewhere in the bdev that the fs
can query?)

(I guess I'll defer to Christoph's input on this, I assume he measured
some kind of improvement in the previous async work..)

> Honestly, I think the only real improvement would be on the device
> side. Most folks want discard as an advisory hint, and it should not
> impact current workloads at all. In reality, many implementations
> are much more strict and even include substantial flash writes. For
> the cases where we can't just turn it off (I'd love to), we at least
> need to make it less intrusive.
> 
> > I'm wondering if xfs could separate discard submission from log I/O
> > completion entirely and then perhaps limit/throttle submission somehow
> > or another (Christoph, thoughts?) via a background task. Perhaps doing
> > something like that may even eliminate the need for some discards on
> > busy filesystems with a lot of block free -> reallocation activity, but
> > I'm just handwaving atm.
> 
> Just having the sync vs async option is the best split imho. The async
> could potentially be scheduled. I don't think more involved logic
> belongs in the fs.
> 

The more interesting part to me is whether we can safely separate
discard from log I/O completion in XFS. Then we can release the log
buffer locks and whatnot and let the fs proceed without waiting on any
number of discards to complete. In theory, I think the background task
could issue discards one at a time (or N at a time, or N blocks at a
time, whatever..) without putting us back in a place where discards hold
up the log and subsequently lock up the rest of the fs.

If that's possible, then the whole sync/async thing is more of an
implementation detail and we have no need for separate mount options for
users to try and grok.

Brian

> -- 
> Jens Axboe
> 
Eric Sandeen April 30, 2018, 7:19 p.m. UTC | #6
On 4/30/18 1:25 PM, Luis R. Rodriguez wrote:
> On Mon, Apr 30, 2018 at 12:07:31PM -0600, Jens Axboe wrote:
>> On 4/30/18 11:19 AM, Brian Foster wrote:
>>> On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
>>>> XFS recently added support for async discards. While this can be
>>>> a win for some workloads and devices, there are also cases where
>>>> async bursty discard will severly harm the latencies of reads
>>>> and writes.
>>>>
>>>> Add a 'discard_sync' mount flag to revert to using sync discard,
>>>> issuing them one at the time and waiting for each one. This fixes
>>>> a big performance regression we had moving to kernels that include
>>>> the XFS async discard support.
>>>>
>>>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>>> ---
>>>
>>> Hm, I figured the async discard stuff would have been a pretty clear win
>>> all around, but then again I'm not terribly familiar with what happens
>>> with discards beneath the fs. I do know that the previous behavior would
>>> cause fs level latencies due to holding up log I/O completion while
>>> discards completed one at a time. My understanding is that this lead to
>>> online discard being pretty much universally "not recommended" in favor
>>> of fstrim.
>>
>> It's not a secret that most devices suck at discard.
> 
> How can we know if a device sucks at discard?

I was going to ask the same thing.  ;)  "Meh, punt to the admin!"

I'm having deja vu but can't remember why.  Seems like this has come up
before and we thought it should be a block device tunable, not pushed down
from the filesystem.  Is that possible?

-Eric
Jens Axboe April 30, 2018, 7:21 p.m. UTC | #7
On 4/30/18 1:19 PM, Eric Sandeen wrote:
> 
> 
> On 4/30/18 1:25 PM, Luis R. Rodriguez wrote:
>> On Mon, Apr 30, 2018 at 12:07:31PM -0600, Jens Axboe wrote:
>>> On 4/30/18 11:19 AM, Brian Foster wrote:
>>>> On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
>>>>> XFS recently added support for async discards. While this can be
>>>>> a win for some workloads and devices, there are also cases where
>>>>> async bursty discard will severly harm the latencies of reads
>>>>> and writes.
>>>>>
>>>>> Add a 'discard_sync' mount flag to revert to using sync discard,
>>>>> issuing them one at the time and waiting for each one. This fixes
>>>>> a big performance regression we had moving to kernels that include
>>>>> the XFS async discard support.
>>>>>
>>>>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>>>> ---
>>>>
>>>> Hm, I figured the async discard stuff would have been a pretty clear win
>>>> all around, but then again I'm not terribly familiar with what happens
>>>> with discards beneath the fs. I do know that the previous behavior would
>>>> cause fs level latencies due to holding up log I/O completion while
>>>> discards completed one at a time. My understanding is that this lead to
>>>> online discard being pretty much universally "not recommended" in favor
>>>> of fstrim.
>>>
>>> It's not a secret that most devices suck at discard.
>>
>> How can we know if a device sucks at discard?
> 
> I was going to ask the same thing.  ;)  "Meh, punt to the admin!"
> 
> I'm having deja vu but can't remember why.  Seems like this has come up
> before and we thought it should be a block device tunable, not pushed down
> from the filesystem.  Is that possible?

The problem is that it'll depend on the workload as well. The device in
my laptop is fine with discard for my workload, which is very light.
But if you are running RocksDB on it, and doing heavy compactions and
deletes, it probably would not be.
Eric Sandeen April 30, 2018, 7:57 p.m. UTC | #8
On 4/30/18 2:21 PM, Jens Axboe wrote:
> On 4/30/18 1:19 PM, Eric Sandeen wrote:
>>
>>
>> On 4/30/18 1:25 PM, Luis R. Rodriguez wrote:
>>> On Mon, Apr 30, 2018 at 12:07:31PM -0600, Jens Axboe wrote:
>>>> On 4/30/18 11:19 AM, Brian Foster wrote:
>>>>> On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
>>>>>> XFS recently added support for async discards. While this can be
>>>>>> a win for some workloads and devices, there are also cases where
>>>>>> async bursty discard will severly harm the latencies of reads
>>>>>> and writes.
>>>>>>
>>>>>> Add a 'discard_sync' mount flag to revert to using sync discard,
>>>>>> issuing them one at the time and waiting for each one. This fixes
>>>>>> a big performance regression we had moving to kernels that include
>>>>>> the XFS async discard support.
>>>>>>
>>>>>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>>>>> ---
>>>>>
>>>>> Hm, I figured the async discard stuff would have been a pretty clear win
>>>>> all around, but then again I'm not terribly familiar with what happens
>>>>> with discards beneath the fs. I do know that the previous behavior would
>>>>> cause fs level latencies due to holding up log I/O completion while
>>>>> discards completed one at a time. My understanding is that this lead to
>>>>> online discard being pretty much universally "not recommended" in favor
>>>>> of fstrim.
>>>>
>>>> It's not a secret that most devices suck at discard.
>>>
>>> How can we know if a device sucks at discard?
>>
>> I was going to ask the same thing.  ;)  "Meh, punt to the admin!"
>>
>> I'm having deja vu but can't remember why.  Seems like this has come up
>> before and we thought it should be a block device tunable, not pushed down
>> from the filesystem.  Is that possible?
> 
> The problem is that it'll depend on the workload as well. The device in
> may laptop is fine with discard for my workload, which is very light.
> But if you are running RocksDB on it, and doing heavy compactions and
> deletes, it probably would not be.

Ok, but I'm not sure how that precludes a block device tunable?  You'd tune
it for your workload, right?

Or is the concern that it could only be for the entire block device, and
perhaps different partitions have different workloads?

Sorry, caveman filesystem guy doesn't completely understand block devices.

-Eric
Jens Axboe April 30, 2018, 7:58 p.m. UTC | #9
On 4/30/18 1:57 PM, Eric Sandeen wrote:
> 
> 
> On 4/30/18 2:21 PM, Jens Axboe wrote:
>> On 4/30/18 1:19 PM, Eric Sandeen wrote:
>>>
>>>
>>> On 4/30/18 1:25 PM, Luis R. Rodriguez wrote:
>>>> On Mon, Apr 30, 2018 at 12:07:31PM -0600, Jens Axboe wrote:
>>>>> On 4/30/18 11:19 AM, Brian Foster wrote:
>>>>>> On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
>>>>>>> XFS recently added support for async discards. While this can be
>>>>>>> a win for some workloads and devices, there are also cases where
>>>>>>> async bursty discard will severly harm the latencies of reads
>>>>>>> and writes.
>>>>>>>
>>>>>>> Add a 'discard_sync' mount flag to revert to using sync discard,
>>>>>>> issuing them one at the time and waiting for each one. This fixes
>>>>>>> a big performance regression we had moving to kernels that include
>>>>>>> the XFS async discard support.
>>>>>>>
>>>>>>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>>>>>> ---
>>>>>>
>>>>>> Hm, I figured the async discard stuff would have been a pretty clear win
>>>>>> all around, but then again I'm not terribly familiar with what happens
>>>>>> with discards beneath the fs. I do know that the previous behavior would
>>>>>> cause fs level latencies due to holding up log I/O completion while
>>>>>> discards completed one at a time. My understanding is that this lead to
>>>>>> online discard being pretty much universally "not recommended" in favor
>>>>>> of fstrim.
>>>>>
>>>>> It's not a secret that most devices suck at discard.
>>>>
>>>> How can we know if a device sucks at discard?
>>>
>>> I was going to ask the same thing.  ;)  "Meh, punt to the admin!"
>>>
>>> I'm having deja vu but can't remember why.  Seems like this has come up
>>> before and we thought it should be a block device tunable, not pushed down
>>> from the filesystem.  Is that possible?
>>
>> The problem is that it'll depend on the workload as well. The device in
>> may laptop is fine with discard for my workload, which is very light.
>> But if you are running RocksDB on it, and doing heavy compactions and
>> deletes, it probably would not be.
> 
> Ok, but I'm not sure how that precludes a block device tunable?  You'd tune
> it for your workload, right?

What kind of tunable are you thinking of? Right now we have one tunable,
which is the max discard size. Patch #1 actually helps make this do what
it should for sync discards, instead of just building a bio chain and
submitting that all at once.

> Or is the concern that it could only be for the entire block device, and
> perhaps different partitions have different workloads?
> 
> Sorry, caveman filesystem guy doesn't completely understand block devices.

I just don't know what you are trying to tune :-)
Dave Chinner April 30, 2018, 9:31 p.m. UTC | #10
On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
> XFS recently added support for async discards. While this can be
> a win for some workloads and devices, there are also cases where
> async bursty discard will severly harm the latencies of reads
> and writes.

FWIW, convention is to document the performance regression in the
commit message, not leave the reader to guess at what it was....

Did anyone analyse the pattern of discards being issued to work out
what pattern was worse for async vs sync discard? Is it lots of
little discards, large extents being discarded, perhaps a problem
with the request queue starving other IOs because we queue
so many async discards in such a short time (which is the difference
in behaviour vs the old code), or something else?

> Add a 'discard_sync' mount flag to revert to using sync discard,
> issuing them one at the time and waiting for each one. This fixes
> a big performance regression we had moving to kernels that include
> the XFS async discard support.

I'm not a fan of adding a mount option to work around bad,
unpredictable performance due to a mount option we recommend you
don't use because it results in bad, unpredictable performance.

Without any details of the discard pattern that results in problems
I don't think we should be changing anything - adding an opaque,
user-unfriendly mount option does nothing to address the underlying
problem - it's just a hack to work around the symptoms being seen...

More details of the regression and the root cause analysis is
needed, please.

Cheers,

Dave.
Jens Axboe April 30, 2018, 9:42 p.m. UTC | #11
On 4/30/18 3:31 PM, Dave Chinner wrote:
> On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
>> XFS recently added support for async discards. While this can be
>> a win for some workloads and devices, there are also cases where
>> async bursty discard will severly harm the latencies of reads
>> and writes.
> 
> FWIW, convention is it document the performance regression in the
> commit message, not leave the reader to guess at what it was....

Yeah I'll give you that, I can improve the commit message for sure.

> Did anyone analyse the pattern of discards being issued to work out
> what pattern was worse for async vs sync discard? is it lots of
> little discards, large extents being discarded, perhaps a problem
> with the request request queue starving other IOs because we queue
> so many async discards in such a short time (which is the difference
> in behaviour vs the old code), or something else?

What was observed was a big discard, which would previously have
gone down as smaller discards issued one at a time, now going down
as one or many discards queued at once. Looking at the blktrace
data, it's the difference between

discard 1 queue
discard 1 complete
discard 2 queue
discard 2 complete
[...]
discard n queue
discard n complete

which is now

discard 1 queue
discard 2 queue
[...]
discard n queue
[...]
discard 1 complete
discard 2 complete
[...]
discard n complete

Note that we set a max discard size of 64MB for most devices,
since it's been shown to have less impact on latencies for
the IO that jobs actually care about.

>> Add a 'discard_sync' mount flag to revert to using sync discard,
>> issuing them one at the time and waiting for each one. This fixes
>> a big performance regression we had moving to kernels that include
>> the XFS async discard support.
> 
> I'm not a fan of adding a mount option to work around bad,
> unpredictable performance due to a mount option we recommend you
> don't use because it results in bad, unpredictable performance.

Oh I hear you, as I wrote in other replies, I don't generally
recommend discard except for cases where it's been proven to be
useful in terms of write amplification improvements. If we can
avoid using it, we do. It's a tradeoff, and for some situations,
the right decision is to use discards.

> Without any details of the discard pattern that results in problems
> I don't think we should be changing anything - adding an opaque,
> user-unfriendly mount option does nothing to address the underlying
> problem - it's just a hack to work around the symptoms being seen...
> 
> More details of the regression and the root cause analysis is
> needed, please.

It brings back the same behavior as we had before, which performs
better for us. It's preventing users of XFS+discard from upgrading,
which is sad.
Dave Chinner April 30, 2018, 10:28 p.m. UTC | #12
On Mon, Apr 30, 2018 at 03:42:11PM -0600, Jens Axboe wrote:
> On 4/30/18 3:31 PM, Dave Chinner wrote:
> > On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
> >> XFS recently added support for async discards. While this can be
> >> a win for some workloads and devices, there are also cases where
> >> async bursty discard will severly harm the latencies of reads
> >> and writes.
> > 
> > FWIW, convention is it document the performance regression in the
> > commit message, not leave the reader to guess at what it was....
> 
> Yeah I'll give you that, I can improve the commit message for sure.
> 
> > Did anyone analyse the pattern of discards being issued to work out
> > what pattern was worse for async vs sync discard? is it lots of
> > little discards, large extents being discarded, perhaps a problem
> > with the request request queue starving other IOs because we queue
> > so many async discards in such a short time (which is the difference
> > in behaviour vs the old code), or something else?
> 
> What was observed was a big discard which would previously have
> gone down as smaller discards now going down as either one or many
> discards. Looking at the blktrace data, it's the difference between
> 
> discard 1 queue
> discard 1 complete
> discatd 2 queue
> discard 2 complete
> [...]
> discard n queue
> discard n complete
> 
> which is now
> 
> discard 1 queue
> discard 2 queue
> [...]
> discard n queue
> [...]
> discard 1 complete
> discard 2 complete
> [...]
> discard n complete
> 
> Note that we set a max discard size of 64MB for most devices,
> since it's been shown to have less impact on latencies for
> the IO that jobs actually care about.

IOWs, this has nothing to do with XFS behaviour, and everything to
do with suboptimal scheduling control for concurrent queued discards
in the block layer....

XFS can issue discard requests of up to 8GB (on 4k block size
filesystems), and how they are optimised is completely up to the
block layer. blkdev_issue_discard() happens to be synchronous (for
historical reasons) and that does not match our asynchronous log IO
model. It's always been a cause of total filesystem stall problems
for XFS because we can't free log space until all the pending
discards have been issued. Hence we can see global filesystem
stalls of *minutes* with synchronous discards on bad devices, which
can cause OOM and all sorts of other nasty cascading failures.

Async discard dispatch solves this problem for XFS - discards no
longer block forward journal progress, and we really don't want to
go back to having that ticking timebomb in XFS. Handling concurrent
discard requests in an optimal manner is not a filesystem problem -
it's an IO scheduling problem.


Essentially, we've exposed yet another limitation of the block/IO
layer handling of discard requests in the linux storage stack - it
does not have a configurable discard queue depth.

I'd much prefer these problems get handled at the IO scheduling
layer where there is visibility of device capabilities and request
queue state. i.e. we should be throttling async discards just like
we can throttle read or write IO, especially given there are many
devices that serialise discards at the device level (i.e. not queued
device commands). This solves the problem for everyone and makes
things much better for the future where hardware implementations are
likely to get better and support more and more concurrency in
discard operations.
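
A rough sketch of what a configurable discard queue depth could look like
at submission time; the discard_throttle structure and helpers below are
hypothetical, not existing block layer code:

#include <linux/semaphore.h>
#include <linux/bio.h>

/* Hypothetical throttle: a counting semaphore sized to the allowed
 * number of discards in flight; depth 1 behaves like sync discard. */
struct discard_throttle {
	struct semaphore	sem;
};

static void discard_throttle_init(struct discard_throttle *dt, int depth)
{
	sema_init(&dt->sem, depth);
}

/* Submission side: may sleep, so only usable from process context. */
static void discard_submit_throttled(struct discard_throttle *dt,
				     struct bio *bio)
{
	down(&dt->sem);		/* wait for a free discard slot */
	submit_bio(bio);	/* bio's end_io must release the slot */
}

/* Completion side (called from the discard bio's ->bi_end_io). */
static void discard_throttle_done(struct discard_throttle *dt)
{
	up(&dt->sem);
}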

IMO, the way the high level code dispatches discard requests is
irrelevant here - this is a problem to do with queue depth and IO
scheduling/throttling. That's why I don't think adding permanent ABI
changes to filesystems to work around this problem is the right
solution....

> > I don't think we should be changing anything - adding an opaque,
> > user-unfriendly mount option does nothing to address the underlying
> > problem - it's just a hack to work around the symptoms being seen...
> > 
> > More details of the regression and the root cause analysis is
> > needed, please.
> 
> It brings back the same behavior as we had before, which performs
> better for us. It's preventing users of XFS+discard from upgrading,
> which is sad.

Yes, it does, but so would having the block layer throttle device
discard requests in flight to a queue depth of 1. And then we don't
have to change XFS at all.

Cheers,

Dave.
Jens Axboe April 30, 2018, 10:40 p.m. UTC | #13
On 4/30/18 4:28 PM, Dave Chinner wrote:
> On Mon, Apr 30, 2018 at 03:42:11PM -0600, Jens Axboe wrote:
>> On 4/30/18 3:31 PM, Dave Chinner wrote:
>>> On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
>>>> XFS recently added support for async discards. While this can be
>>>> a win for some workloads and devices, there are also cases where
>>>> async bursty discard will severly harm the latencies of reads
>>>> and writes.
>>>
>>> FWIW, convention is it document the performance regression in the
>>> commit message, not leave the reader to guess at what it was....
>>
>> Yeah I'll give you that, I can improve the commit message for sure.
>>
>>> Did anyone analyse the pattern of discards being issued to work out
>>> what pattern was worse for async vs sync discard? is it lots of
>>> little discards, large extents being discarded, perhaps a problem
>>> with the request request queue starving other IOs because we queue
>>> so many async discards in such a short time (which is the difference
>>> in behaviour vs the old code), or something else?
>>
>> What was observed was a big discard which would previously have
>> gone down as smaller discards now going down as either one or many
>> discards. Looking at the blktrace data, it's the difference between
>>
>> discard 1 queue
>> discard 1 complete
>> discatd 2 queue
>> discard 2 complete
>> [...]
>> discard n queue
>> discard n complete
>>
>> which is now
>>
>> discard 1 queue
>> discard 2 queue
>> [...]
>> discard n queue
>> [...]
>> discard 1 complete
>> discard 2 complete
>> [...]
>> discard n complete
>>
>> Note that we set a max discard size of 64MB for most devices,
>> since it's been shown to have less impact on latencies for
>> the IO that jobs actually care about.
> 
> IOWs, this has nothing to do with XFS behaviour, and everything to
> do with suboptimal scheduling control for concurrent queued discards
> in the block layer....

You could argue that, and it's fallout from XFS being the first
user of async discard. Prior to that, we never had that use
case. I'm quite fine making all discards throttle to depth
1.

> XFS can issue discard requests of up to 8GB (on 4k block size
> filesystems), and how they are optimised is completely up to the
> block layer. blkdev_issue_discard() happens to be synchronous (for
> historical reasons) and that does not match our asynchronous log IO
> model. it's always been a cause of total filesystem stall problems
> for XFS because we can't free log space until all the pending
> discards have been issued. Hence we can see global filesystems
> stalls of *minutes* with synchronous discards on bad devices and can
> cause OOM and all sorts of other nasty cascading failures.
> 
> Async discard dispatch solves this problem for XFS - discards no
> longer block forward journal progress, and we really don't want to
> go back to having that ticking timebomb in XFS. Handling concurrent
> discard requests in an optimal manner is not a filesystem problem -
> it's an IO scheduling problem.
> 
> 
> Essentially, we've exposed yet another limitation of the block/IO
> layer handling of discard requests in the linux storage stack - it
> does not have a configurable discard queue depth.
> 
> I'd much prefer these problems get handled at the IO scheduling
> layer where there is visibulity of device capabilities and request
> queue state. i.e. we should be throttling async discards just like
> we can throttle read or write IO, especially given there are many
> devices that serialise discards at the device level (i.e. not queued
> device commands). This solves the problem for everyone and makes
> things much better for the future where hardware implementations are
> likely to get better and support more and more concurrency in
> discard operations.
> 
> IMO, the way the high level code dispatches discard requests is
> irrelevant here - this is a problem to do with queue depth and IO
> scheduling/throttling. That's why I don't think adding permanent ABI
> changes to filesystems to work around this problem is the right
> solution....

If you get off your high horse for a bit, this is essentially
a performance regression. I can either migrate folks off of XFS, or
I can come up with something that works for them. It's pretty
easy to claim this is "yet another limitation of the IO stack",
but it's really not fair to make crazy claims like that when it's
an entirely new use case. Let's try to state things objectively
and fairly.

This work should probably have been done before making XFS
discard async. This isn't the first fallout we've had from that
code.

>>> I don't think we should be changing anything - adding an opaque,
>>> user-unfriendly mount option does nothing to address the underlying
>>> problem - it's just a hack to work around the symptoms being seen...
>>>
>>> More details of the regression and the root cause analysis is
>>> needed, please.
>>
>> It brings back the same behavior as we had before, which performs
>> better for us. It's preventing users of XFS+discard from upgrading,
>> which is sad.
> 
> Yes, it does, but so would having the block layer to throttle device
> discard requests in flight to a queue depth of 1. And then we don't
> have to change XFS at all.

I'm perfectly fine with making that change by default, and much easier
for me since I don't have to patch file systems.
Eric Sandeen April 30, 2018, 10:59 p.m. UTC | #14
On 4/30/18 2:58 PM, Jens Axboe wrote:
> On 4/30/18 1:57 PM, Eric Sandeen wrote:
>> On 4/30/18 2:21 PM, Jens Axboe wrote:

...

>> Ok, but I'm not sure how that precludes a block device tunable?  You'd tune
>> it for your workload, right?
> 
> What kind of tunable are you thinking of? Right now we have one tunable,
> which is the max discard size. Patch #1 actually helps make this do what
> it should for sync discards, instead of just building a bio chain and
> submitting that all at once.
> 
>> Or is the concern that it could only be for the entire block device, and
>> perhaps different partitions have different workloads?
>>
>> Sorry, caveman filesystem guy doesn't completely understand block devices.
> 
> I just don't know what you are trying to tune :-)

I'm wondering if "make discards synchronous" couldn't just be a block device
tunable, rather than a filesystem mount option tunable.

-Eric
Darrick J. Wong April 30, 2018, 11:01 p.m. UTC | #15
On Mon, Apr 30, 2018 at 04:40:04PM -0600, Jens Axboe wrote:
> On 4/30/18 4:28 PM, Dave Chinner wrote:
> > On Mon, Apr 30, 2018 at 03:42:11PM -0600, Jens Axboe wrote:
> >> On 4/30/18 3:31 PM, Dave Chinner wrote:
> >>> On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
> >>>> XFS recently added support for async discards. While this can be
> >>>> a win for some workloads and devices, there are also cases where
> >>>> async bursty discard will severly harm the latencies of reads
> >>>> and writes.
> >>>
> >>> FWIW, convention is it document the performance regression in the
> >>> commit message, not leave the reader to guess at what it was....
> >>
> >> Yeah I'll give you that, I can improve the commit message for sure.
> >>
> >>> Did anyone analyse the pattern of discards being issued to work out
> >>> what pattern was worse for async vs sync discard? is it lots of
> >>> little discards, large extents being discarded, perhaps a problem
> >>> with the request request queue starving other IOs because we queue
> >>> so many async discards in such a short time (which is the difference
> >>> in behaviour vs the old code), or something else?
> >>
> >> What was observed was a big discard which would previously have
> >> gone down as smaller discards now going down as either one or many
> >> discards. Looking at the blktrace data, it's the difference between
> >>
> >> discard 1 queue
> >> discard 1 complete
> >> discatd 2 queue
> >> discard 2 complete
> >> [...]
> >> discard n queue
> >> discard n complete
> >>
> >> which is now
> >>
> >> discard 1 queue
> >> discard 2 queue
> >> [...]
> >> discard n queue
> >> [...]
> >> discard 1 complete
> >> discard 2 complete
> >> [...]
> >> discard n complete
> >>
> >> Note that we set a max discard size of 64MB for most devices,
> >> since it's been shown to have less impact on latencies for
> >> the IO that jobs actually care about.

Ok, so as I see it the problem here is that discards are not some
instantaneous command, but instead have some cost that is (probably)
less than (say) a full write of the same quantity of zeroes.  Previously
we'd feed the block layer one discard io at a time, but now we batch
them up and send multiple large discard requests at the same time.  The
discards then tie up the device for long periods of time.  We'd get the
same result if we sent gigabytes of write commands simultaneously too,
right?

Wouldn't the same problem happen if, say, 200 threads were each
issuing discards one by one, because there's just too many bytes in
flight at a time?

So what I'm asking here is, can we throttle discard IOs so that no
single issuer can monopolize a device?  As a stupid example, if program
A is sending 200MB of reads, program B is sending 200MB of writes, and
xfs is sending 200MB of discards at the same time, can we throttle all
three so that they each get roughly a third of the queue at a time?

This way XFS can still send huge batches of discard IOs asynchronously,
and it's fine enough if some of those have to wait longer in order to
avoid starving other things.

(Yes, I imagine you could serialize discards and run them one at a time,
but that seems a little suboptimal...)

> > 
> > IOWs, this has nothing to do with XFS behaviour, and everything to
> > do with suboptimal scheduling control for concurrent queued discards
> > in the block layer....
> 
> You could argue that, and it's fallout from XFS being the first
> user of async discard. Prior to that, we've never had that use
> case. I'm quite fine making all discards throttle to depth
> 1.
> 
> > XFS can issue discard requests of up to 8GB (on 4k block size
> > filesystems), and how they are optimised is completely up to the
> > block layer. blkdev_issue_discard() happens to be synchronous (for
> > historical reasons) and that does not match our asynchronous log IO
> > model. it's always been a cause of total filesystem stall problems
> > for XFS because we can't free log space until all the pending
> > discards have been issued. Hence we can see global filesystems
> > stalls of *minutes* with synchronous discards on bad devices and can
> > cause OOM and all sorts of other nasty cascading failures.
> > 
> > Async discard dispatch solves this problem for XFS - discards no
> > longer block forward journal progress, and we really don't want to
> > go back to having that ticking timebomb in XFS. Handling concurrent
> > discard requests in an optimal manner is not a filesystem problem -
> > it's an IO scheduling problem.
> > 
> > 
> > Essentially, we've exposed yet another limitation of the block/IO
> > layer handling of discard requests in the linux storage stack - it
> > does not have a configurable discard queue depth.
> > 
> > I'd much prefer these problems get handled at the IO scheduling
> > layer where there is visibulity of device capabilities and request
> > queue state. i.e. we should be throttling async discards just like
> > we can throttle read or write IO, especially given there are many
> > devices that serialise discards at the device level (i.e. not queued
> > device commands). This solves the problem for everyone and makes
> > things much better for the future where hardware implementations are
> > likely to get better and support more and more concurrency in
> > discard operations.
> > 
> > IMO, the way the high level code dispatches discard requests is
> > irrelevant here - this is a problem to do with queue depth and IO
> > scheduling/throttling. That's why I don't think adding permanent ABI
> > changes to filesystems to work around this problem is the right
> > solution....
> 
> If you get off your high horse for a bit, this is essentially
> a performance regression. I can either migrate folks off of XFS, or
> I can come up with something that works for them. It's pretty
> easy to claim this is "yet another limitation of the IO stack",
> but it's really not fair to make crazy claims like that when it's
> an entirely new use case. Let's try to state things objectively
> and fairly.
> 
> This work should probably have been done before making XFS
> discard async. This isn't the first fallout we've had from that
> code.
> 
> >>> I don't think we should be changing anything - adding an opaque,
> >>> user-unfriendly mount option does nothing to address the underlying
> >>> problem - it's just a hack to work around the symptoms being seen...
> >>>
> >>> More details of the regression and the root cause analysis is
> >>> needed, please.
> >>
> >> It brings back the same behavior as we had before, which performs
> >> better for us. It's preventing users of XFS+discard from upgrading,
> >> which is sad.
> > 
> > Yes, it does, but so would having the block layer to throttle device
> > discard requests in flight to a queue depth of 1. And then we don't
> > have to change XFS at all.
> 
> I'm perfectly fine with making that change by default, and much easier
> for me since I don't have to patch file systems.

...and I prefer not to add mount options, fwiw.

--D

> -- 
> Jens Axboe
> 
Jens Axboe April 30, 2018, 11:02 p.m. UTC | #16
On 4/30/18 4:59 PM, Eric Sandeen wrote:
> On 4/30/18 2:58 PM, Jens Axboe wrote:
>> On 4/30/18 1:57 PM, Eric Sandeen wrote:
>>> On 4/30/18 2:21 PM, Jens Axboe wrote:
> 
> ...
> 
>>> Ok, but I'm not sure how that precludes a block device tunable?  You'd tune
>>> it for your workload, right?
>>
>> What kind of tunable are you thinking of? Right now we have one tunable,
>> which is the max discard size. Patch #1 actually helps make this do what
>> it should for sync discards, instead of just building a bio chain and
>> submitting that all at once.
>>
>>> Or is the concern that it could only be for the entire block device, and
>>> perhaps different partitions have different workloads?
>>>
>>> Sorry, caveman filesystem guy doesn't completely understand block devices.
>>
>> I just don't know what you are trying to tune :-)
> 
> I'm wondering if "make discards synchronous" couldn't just be a block device
> tunable, rather than a filesystem mount option tunable.

See e-mail I just sent out. With that, you get that tunable bundled with
the discard_max_bytes tunable. If you leave it at the default, which is
generally very high, then you get huge discards. If you tune it down,
they become basically sync. I don't think we need a separate tunable
for this.

It's still per-caller, which is good enough for us. I don't want to
make it more complicated until people show up with a valid use case
that shows we need it strictly throttled per-device.

Patch

diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 4668403b1741..4eced3eceea5 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -552,10 +552,16 @@  xlog_discard_busy_extents(
 	struct bio		*bio = NULL;
 	struct blk_plug		plug;
 	int			error = 0;
+	int			discard_flags = 0;
+	bool			sync_discard = mp->m_flags & XFS_MOUNT_DISCARD_SYNC;
 
 	ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
 
-	blk_start_plug(&plug);
+	if (!sync_discard)
+		blk_start_plug(&plug);
+	else
+		discard_flags = BLKDEV_DISCARD_SYNC;
+
 	list_for_each_entry(busyp, list, list) {
 		trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
 					 busyp->length);
@@ -563,7 +569,7 @@  xlog_discard_busy_extents(
 		error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
 				XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
 				XFS_FSB_TO_BB(mp, busyp->length),
-				GFP_NOFS, 0, &bio);
+				GFP_NOFS, discard_flags, &bio);
 		if (error && error != -EOPNOTSUPP) {
 			xfs_info(mp,
 	 "discard failed for extent [0x%llx,%u], error %d",
@@ -574,14 +580,18 @@  xlog_discard_busy_extents(
 		}
 	}
 
-	if (bio) {
+	if (sync_discard) {
+		xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
+		kmem_free(ctx);
+	} else if (bio) {
 		bio->bi_private = ctx;
 		bio->bi_end_io = xlog_discard_endio;
 		submit_bio(bio);
+		blk_finish_plug(&plug);
 	} else {
 		xlog_discard_endio_work(&ctx->discard_endio_work);
+		blk_finish_plug(&plug);
 	}
-	blk_finish_plug(&plug);
 }
 
 /*
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 10b90bbc5162..3f0b7b9106c7 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -219,6 +219,7 @@  typedef struct xfs_mount {
 						   operations, typically for
 						   disk errors in metadata */
 #define XFS_MOUNT_DISCARD	(1ULL << 5)	/* discard unused blocks */
+#define XFS_MOUNT_DISCARD_SYNC	(1ULL << 6)	/* discard unused blocks sync */
 #define XFS_MOUNT_NOALIGN	(1ULL << 7)	/* turn off stripe alignment
 						   allocations */
 #define XFS_MOUNT_ATTR2		(1ULL << 8)	/* allow use of attr2 format */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index d71424052917..6d960bb4725f 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -83,7 +83,7 @@  enum {
 	Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota,
 	Opt_uquota, Opt_gquota, Opt_pquota,
 	Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
-	Opt_discard, Opt_nodiscard, Opt_dax, Opt_err,
+	Opt_discard, Opt_nodiscard, Opt_dax, Opt_err, Opt_discard_sync,
 };
 
 static const match_table_t tokens = {
@@ -129,6 +129,7 @@  static const match_table_t tokens = {
 	{Opt_pqnoenforce,"pqnoenforce"},/* project quota limit enforcement */
 	{Opt_qnoenforce, "qnoenforce"},	/* same as uqnoenforce */
 	{Opt_discard,	"discard"},	/* Discard unused blocks */
+	{Opt_discard_sync,"discard_sync"},/* Discard unused blocks sync */
 	{Opt_nodiscard,	"nodiscard"},	/* Do not discard unused blocks */
 
 	{Opt_dax,	"dax"},		/* Enable direct access to bdev pages */
@@ -363,11 +364,15 @@  xfs_parseargs(
 			mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
 			mp->m_qflags &= ~XFS_GQUOTA_ENFD;
 			break;
+		case Opt_discard_sync:
+			mp->m_flags |= XFS_MOUNT_DISCARD_SYNC;
+			/* fall through and set XFS_MOUNT_DISCARD too */
 		case Opt_discard:
 			mp->m_flags |= XFS_MOUNT_DISCARD;
 			break;
 		case Opt_nodiscard:
 			mp->m_flags &= ~XFS_MOUNT_DISCARD;
+			mp->m_flags &= ~XFS_MOUNT_DISCARD_SYNC;
 			break;
 #ifdef CONFIG_FS_DAX
 		case Opt_dax: