
[RFC,1/3] block/mq-deadline: Revert "block/mq-deadline: Fix the tag reservation code"

Message ID 20241209115522.3741093-2-yukuai1@huaweicloud.com (mailing list archive)
State New
Series lib/sbitmap: fix shallow_depth tag allocation

Commit Message

Yu Kuai Dec. 9, 2024, 11:55 a.m. UTC
From: Yu Kuai <yukuai3@huawei.com>

This reverts commit 39823b47bbd40502632ffba90ebb34fff7c8b5e8.

Because tag reservation is still not fixed by that commit, and it
introduces a performance problem:

1) Setting min_shallow_depth to 1 ends up setting wake_batch to 1;
   deadline has no reason to do this, and it causes performance
   degradation in some high-concurrency tests, for both IO bandwidth
   and CPU usage.
2) async_depth defaults to nr_requests, hence shallow_depth is always
   set to 1 << bt->sb.shift. As a consequence, no tags can be reserved
   (see the sketch below).
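
A minimal userspace sketch of the scaling behind point 2, mirroring the
dd_to_word_depth() helper removed by the diff at the bottom of this page;
the word size (sb.shift = 6) and the nr_requests value are assumptions
chosen for illustration, not values taken from a real queue:

#include <stdio.h>

/* Same arithmetic as the removed dd_to_word_depth() helper. */
static unsigned int dd_to_word_depth(unsigned int qdepth, unsigned int shift,
				     unsigned int nr_requests)
{
	/* Scale 1..nr_requests into the 1..(1 << shift) per-word range. */
	return ((qdepth << shift) + nr_requests - 1) / nr_requests;
}

int main(void)
{
	unsigned int shift = 6;		/* 64 bits per sbitmap word (assumed) */
	unsigned int nr_requests = 256;	/* example scheduler queue depth */

	/* Default: async_depth == nr_requests, so the limit is a full word. */
	printf("default: shallow_depth=%u of %u bits per word -> nothing reserved\n",
	       dd_to_word_depth(nr_requests, shift, nr_requests), 1u << shift);

	/* Only an explicitly lowered async_depth actually reserves tags. */
	printf("3/4:     shallow_depth=%u of %u bits per word\n",
	       dd_to_word_depth(3 * nr_requests / 4, shift, nr_requests), 1u << shift);
	return 0;
}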

The next patch will fix tag reservation again.

Fixes: 39823b47bbd4 ("block/mq-deadline: Fix the tag reservation code")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/mq-deadline.c | 20 +++-----------------
 1 file changed, 3 insertions(+), 17 deletions(-)

Comments

Bart Van Assche Dec. 9, 2024, 6:02 p.m. UTC | #1
On 12/9/24 7:55 PM, Yu Kuai wrote:
> From: Yu Kuai <yukuai3@huawei.com>
> 
> This reverts commit 39823b47bbd40502632ffba90ebb34fff7c8b5e8.
> 
> Because tag reservation is not fixed and will introduce performance
> problem.

As explained in detail in the patch description and in the comments
added by that patch, commit 39823b47bbd4 ("block/mq-deadline: Fix the
tag reservation code") fixes broken code. So reverting that commit is
wrong. I think that patches 1/3 and 3/3 of this series should be
combined into a single patch.

> 1) Set min_shallow_depth to 1 will end up setting wake_batch to 1,
>     deadline has no reason to do this. And this will cause performance
>     degradation in some high concurrency test, for both IO bandwidth
>     and cpu usage.

As explained in the commit message, this is done because
min_shallow_depth must be less than or equal to shallow_depth.
Additionally, mq-deadline is not the only I/O scheduler that sets
min_shallow_depth to 1. BFQ does this too.

> 2) async_depth is nr_requests, hence shallow_depth will always set to
>     1 << bt->sb.shift. For consequence, no tag can be reserved.

This is not correct. dd->async_depth can be modified via sysfs.

Bart.
Yu Kuai Dec. 10, 2024, 1:23 a.m. UTC | #2
Hi,

On 2024/12/10 2:02, Bart Van Assche wrote:
> 
>> 1) Set min_shallow_depth to 1 will end up setting wake_batch to 1,
>>     deadline has no reason to do this. And this will cause performance
>>     degradation in some high concurrency test, for both IO bandwidth
>>     and cpu usage.
> 
> As explained in the commit message, this is done because
> min_shallow_depth must be less than or equal to shallow_depth.
> Additionally, mq-deadline is not the only I/O scheduler that sets
> min_shallow_depth to 1. BFQ does this too.

No, BFQ is totally different: a single task can be limited to just
one request by bfq's cgroup policy.
> 
>> 2) async_depth is nr_requests, hence shallow_depth will always set to
>>     1 << bt->sb.shift. For consequence, no tag can be reserved.
> 
> This is not correct. dd->async_depth can be modified via sysfs.

So, I'm trying to understand. There are two cases:

1) The default value of async_depth is nr_requests, and no tags can be
reserved, right? Do we want to fix this?
2) The user must change async_depth to a lower value to make it work.

Now I understand why you want to change min_shallow_depth to 1.
However, I think we should also update min_shallow_depth when
async_depth is written via sysfs.

> 
> Bart.
> .
>
Yu Kuai Dec. 10, 2024, 1:50 a.m. UTC | #3
Hi,

On 2024/12/10 2:02, Bart Van Assche wrote:
> This is not correct. dd->async_depth can be modified via sysfs.

How about the following patch to fix min_shallow_depth for deadline?

Thanks,
Kuai

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index a9cf8e19f9d1..040ebb0b192d 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -667,8 +667,7 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)
         struct blk_mq_tags *tags = hctx->sched_tags;

         dd->async_depth = q->nr_requests;
-
-       sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1);
+       sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth);
  }

  /* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */
@@ -1012,6 +1011,47 @@ SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch);
  #undef SHOW_INT
  #undef SHOW_JIFFIES

+static ssize_t deadline_async_depth_store(struct elevator_queue *e,
+                                         const char *page, size_t count)
+{
+       struct deadline_data *dd = e->elevator_data;
+       struct request_queue *q = dd->q;
+       struct blk_mq_hw_ctx *hctx;
+       unsigned long i;
+       int v;
+       int ret = kstrtoint(page, 0, &v);
+
+       if (ret < 0)
+               return ret;
+
+       if (v < 1)
+               v = 1;
+       else if (v > dd->q->nr_requests)
+               v = dd->q->nr_requests;
+
+       if (v == dd->async_depth)
+               return count;
+
+       blk_mq_freeze_queue(q);
+       blk_mq_quiesce_queue(q);
+
+       dd->async_depth = v;
+       if (blk_mq_is_shared_tags(q->tag_set->flags)) {
+               sbitmap_queue_min_shallow_depth(
+                       &q->sched_shared_tags->bitmap_tags, dd->async_depth);
+       } else {
+               queue_for_each_hw_ctx(q, hctx, i)
+                       sbitmap_queue_min_shallow_depth(
+                               &hctx->sched_tags->bitmap_tags,
+                               dd->async_depth);
+       }
+
+       blk_mq_unquiesce_queue(q);
+       blk_mq_unfreeze_queue(q);
+
+       return count;
+}
+
  #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                \
  static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)        \
  {                                                                      \
@@ -1037,7 +1077,6 @@ STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX);
  STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX);
  STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX);
  STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1);
-STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX);
  STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX);
  #undef STORE_FUNCTION
  #undef STORE_INT
Yu Kuai Dec. 10, 2024, 6:22 a.m. UTC | #4
Hi,

On 2024/12/10 9:50, Yu Kuai wrote:
> Hi,
> 
> On 2024/12/10 2:02, Bart Van Assche wrote:
>> This is not correct. dd->async_depth can be modified via sysfs.
> 
> How about the following patch to fix min_shallow_depth for deadline?
> 
> Thanks,
> Kuai
> 
> diff --git a/block/mq-deadline.c b/block/mq-deadline.c
> index a9cf8e19f9d1..040ebb0b192d 100644
> --- a/block/mq-deadline.c
> +++ b/block/mq-deadline.c
> @@ -667,8 +667,7 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)
>          struct blk_mq_tags *tags = hctx->sched_tags;
> 
>          dd->async_depth = q->nr_requests;
> -
> -       sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1);
> +       sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth);
>   }
> 
>   /* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */
> @@ -1012,6 +1011,47 @@ SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch);
>   #undef SHOW_INT
>   #undef SHOW_JIFFIES
> 
> +static ssize_t deadline_async_depth_store(struct elevator_queue *e,
> +                                         const char *page, size_t count)
> +{
> +       struct deadline_data *dd = e->elevator_data;
> +       struct request_queue *q = dd->q;
> +       struct blk_mq_hw_ctx *hctx;
> +       unsigned long i;
> +       int v;
> +       int ret = kstrtoint(page, 0, &v);
> +
> +       if (ret < 0)
> +               return ret;
> +
> +       if (v < 1)
> +               v = 1;
> +       else if (v > dd->q->nr_requests)
> +               v = dd->q->nr_requests;
> +
> +       if (v == dd->async_depth)
> +               return count;
> +
> +       blk_mq_freeze_queue(q);
> +       blk_mq_quiesce_queue(q);
> +
> +       dd->async_depth = v;
> +       if (blk_mq_is_shared_tags(q->tag_set->flags)) {
> +               sbitmap_queue_min_shallow_depth(
> +                       &q->sched_shared_tags->bitmap_tags, dd->async_depth);
> +       } else {
> +               queue_for_each_hw_ctx(q, hctx, i)
> +                       sbitmap_queue_min_shallow_depth(
> +                               &hctx->sched_tags->bitmap_tags,
> +                               dd->async_depth);
> +       }

Just realized that this is not OK: q->sysfs_lock must be held to protect
against changing hctxs. However, the lock ordering is q->sysfs_lock before
eq->sysfs_lock, and this context already holds eq->sysfs_lock.

First of all, are we in agreement that it's not acceptable to
sacrifice performance in the default scenario just to ensure
functional correctness when async_depth is set to 1?

If so, the following are the options that I can think of to fix this:

1) Make async_depth read-only; if 75% of the tags hurts performance in
some cases, the user can increase nr_requests to compensate.
2) Refactor the elevator sysfs API: remove eq->sysfs_lock and replace it
with q->sysfs_lock, so that deadline_async_depth_store() is protected
against changing hctxs and min_shallow_depth can be updated there.
3) Other options?

Thanks,
Kuai
Bart Van Assche Dec. 10, 2024, 8:33 p.m. UTC | #5
On 12/9/24 10:22 PM, Yu Kuai wrote:
> First of all, are we in the agreement that it's not acceptable to
> sacrifice performance in the default scenario just to make sure
> functional correctness if async_depth is set to 1?

How much does this affect performance? If this affects performance
significantly I agree that this needs to be fixed.

> If so, following are the options that I can think of to fix this:
> 
> 1) make async_depth read-only, if 75% tags will hurt performance in some
> cases, user can increase nr_requests to prevent it.
> 2) refactor elevator sysfs api, remove eq->sysfs_lock and replace it
> with q->sysfs_lock, so deadline_async_depth_store() will be protected
> against changing hctxs, and min_shallow_depth can be updated here.
> 3) other options?

Another option is to remove the ability to configure async_depth. If it
is too much trouble to get the implementation right without causing
regressions for existing workloads, one possibility is to remove support
for restricting the number of asynchronous requests in flight.

Thanks,

Bart.
Zhiguo Niu Dec. 11, 2024, 2:38 a.m. UTC | #6
On Wed, Dec 11, 2024 at 04:33, Bart Van Assche <bvanassche@acm.org> wrote:
>
> On 12/9/24 10:22 PM, Yu Kuai wrote:
> > First of all, are we in the agreement that it's not acceptable to
> > sacrifice performance in the default scenario just to make sure
> > functional correctness if async_depth is set to 1?
>
> How much does this affect performance? If this affects performance
> significantly I agree that this needs to be fixed.
>
> > If so, following are the options that I can think of to fix this:
> >
> > 1) make async_depth read-only, if 75% tags will hurt performance in some
> > cases, user can increase nr_requests to prevent it.
> > 2) refactor elevator sysfs api, remove eq->sysfs_lock and replace it
> > with q->sysfs_lock, so deadline_async_depth_store() will be protected
> > against changing hctxs, and min_shallow_depth can be updated here.
> > 3) other options?
>
> Another option is to remove the ability to configure async_depth. If it
> is too much trouble to get the implementation right without causing
> regressions for existing workloads, one possibility is to remove support
> for restricting the number of asynchronous requests in flight.
Hi Bart,
I think it is very useful to restrict asynchronous requests via
async_depth when the IO load is very heavy.
The following is my androidbench experiment on an Android device (sched_tag=128):
1. Set up heavy IO:
while true; do fio -directory=/data -direct=0 -rw=write -bs=64M -size=1G -numjobs=5 -name=fiotest; done
2. Run androidbench; results:
                original async_depth   async_depth=nr_requests*3/4   delta
seq read        33.176                 216.49                        183.314
seq write       28.57                  62.152                        33.582
random read     1.518                  1.648                         0.13
random write    3.546                  4.27                          0.724
And our customer also gave feedback that there is an improvement when
they test app cold start and benchmarks after tuning async_depth.
Thanks!
>
> Thanks,
>
> Bart.
>
Yu Kuai Dec. 11, 2024, 2:57 a.m. UTC | #7
Hi,

On 2024/12/11 10:38, Zhiguo Niu wrote:
> On Wed, Dec 11, 2024 at 04:33, Bart Van Assche <bvanassche@acm.org> wrote:
>>
>> On 12/9/24 10:22 PM, Yu Kuai wrote:
>>> First of all, are we in the agreement that it's not acceptable to
>>> sacrifice performance in the default scenario just to make sure
>>> functional correctness if async_depth is set to 1?
>>
>> How much does this affect performance? If this affects performance
>> significantly I agree that this needs to be fixed.
>>
>>> If so, following are the options that I can think of to fix this:
>>>
>>> 1) make async_depth read-only, if 75% tags will hurt performance in some
>>> cases, user can increase nr_requests to prevent it.
>>> 2) refactor elevator sysfs api, remove eq->sysfs_lock and replace it
>>> with q->sysfs_lock, so deadline_async_depth_store() will be protected
>>> against changing hctxs, and min_shallow_depth can be updated here.
>>> 3) other options?
>>
>> Another option is to remove the ability to configure async_depth. If it
>> is too much trouble to get the implementation right without causing
>> regressions for existing workloads, one possibility is to remove support
>> for restricting the number of asynchronous requests in flight.
> Hi Bart,
> I think it is very useful to restrict asynchronous requests when IO
> loading is very heavy by aysnc_depth.
> the following is my androidbench experiment in android device(sched_tag=128):
> 1. setting heavy IO
> while true; do fio -directory=/data -direct=0 -rw=write -bs=64M
> -size=1G -numjobs=5 -name=fiotest
> 2. run androidbench  and results:
>                 original async_depth   async_depth=nr_requests*3/4   delta
> seq read        33.176                 216.49                        183.314
> seq write       28.57                  62.152                        33.582
> random read     1.518                  1.648                         0.13
> random write    3.546                  4.27                          0.724
> and our customer also feedback there is optimization when they test
> APP cold start and benchmark after tunning async_depth.

So you guys are writing async_depth? Looks like you're using
nr_requests*3/4. If this is the case, the above option 1) still works
for you. However, in this test, I think the lower async_depth is, the
better the result you'll get.

Thanks,
Kuai


> thanks!
>>
>> Thanks,
>>
>> Bart.
>>
> .
>
Yu Kuai Dec. 11, 2024, 3 a.m. UTC | #8
Hi,

On 2024/12/11 4:33, Bart Van Assche wrote:
> How much does this affect performance? If this affects performance
> significantly I agree that this needs to be fixed.

We are testing megaraid-sas with 24 jobs * 1024 iodepth; for randread
and randrw tests, v5.10 is about 20% better than v6.6.

I verified that just setting wake_batch from 1 back to 8 fixes this.
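
For reference, a userspace sketch of how min_shallow_depth feeds into
wake_batch, modeled loosely on sbq_calc_wake_batch() in lib/sbitmap.c;
the constants (8 wait queues, wake batch capped at 8) and the example
values are assumptions for illustration, not a copy of the kernel code:

#include <stdio.h>

#define WAIT_QUEUES	8	/* assumed, mirrors SBQ_WAIT_QUEUES */
#define WAKE_BATCH_MAX	8	/* assumed, mirrors SBQ_WAKE_BATCH */

static unsigned int calc_wake_batch(unsigned int depth, unsigned int shift,
				    unsigned int min_shallow_depth)
{
	unsigned int word = 1u << shift;
	unsigned int shallow = min_shallow_depth < word ? min_shallow_depth : word;
	/* Depth still usable when every word is capped at the shallow limit. */
	unsigned int rem = depth & (word - 1);
	unsigned int usable = (depth >> shift) * shallow +
			      (rem < shallow ? rem : shallow);
	unsigned int batch = usable / WAIT_QUEUES;

	if (batch < 1)
		batch = 1;
	if (batch > WAKE_BATCH_MAX)
		batch = WAKE_BATCH_MAX;
	return batch;
}

int main(void)
{
	/* 256 scheduler tags, 64-bit sbitmap words (assumed example values). */
	printf("min_shallow_depth=1:   wake_batch=%u\n", calc_wake_batch(256, 6, 1));
	printf("min_shallow_depth=192: wake_batch=%u\n", calc_wake_batch(256, 6, 192));
	return 0;
}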

Thanks,
Kuai
Zhiguo Niu Dec. 11, 2024, 3:03 a.m. UTC | #9
On Wed, Dec 11, 2024 at 10:58, Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> Hi,
>
> On 2024/12/11 10:38, Zhiguo Niu wrote:
> > On Wed, Dec 11, 2024 at 04:33, Bart Van Assche <bvanassche@acm.org> wrote:
> >>
> >> On 12/9/24 10:22 PM, Yu Kuai wrote:
> >>> First of all, are we in the agreement that it's not acceptable to
> >>> sacrifice performance in the default scenario just to make sure
> >>> functional correctness if async_depth is set to 1?
> >>
> >> How much does this affect performance? If this affects performance
> >> significantly I agree that this needs to be fixed.
> >>
> >>> If so, following are the options that I can think of to fix this:
> >>>
> >>> 1) make async_depth read-only, if 75% tags will hurt performance in some
> >>> cases, user can increase nr_requests to prevent it.
> >>> 2) refactor elevator sysfs api, remove eq->sysfs_lock and replace it
> >>> with q->sysfs_lock, so deadline_async_depth_store() will be protected
> >>> against changing hctxs, and min_shallow_depth can be updated here.
> >>> 3) other options?
> >>
> >> Another option is to remove the ability to configure async_depth. If it
> >> is too much trouble to get the implementation right without causing
> >> regressions for existing workloads, one possibility is to remove support
> >> for restricting the number of asynchronous requests in flight.
> > Hi Bart,
> > I think it is very useful to restrict asynchronous requests when IO
> > loading is very heavy by aysnc_depth.
> > the following is my androidbench experiment in android device(sched_tag=128):
> > 1. setting heavy IO
> > while true; do fio -directory=/data -direct=0 -rw=write -bs=64M
> > -size=1G -numjobs=5 -name=fiotest
> > 2. run androidbench  and results:
> >                 original async_depth   async_depth=nr_requests*3/4   delta
> > seq read        33.176                 216.49                        183.314
> > seq write       28.57                  62.152                        33.582
> > random read     1.518                  1.648                         0.13
> > random write    3.546                  4.27                          0.724
> > and our customer also feedback there is optimization when they test
> > APP cold start and benchmark after tunning async_depth.
>
> So do you guys writing async_depth? Looks like you're using
> nr_requests*3/4. If this is the case, the above option 1) is still
> working for you guys. However, in this test, I think the lower
> async_depth is, the better result you'll get.
Hi Kuai,
Yes, we modify async_depth to nr_requests*3/4 via sysfs.
Thanks!
>
> Thanks,
> Kuai
>
>
> > thanks!
> >>
> >> Thanks,
> >>
> >> Bart.
> >>
> > .
> >
>
>

Patch

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 91b3789f710e..1f0d175a941e 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -487,20 +487,6 @@  static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 	return rq;
 }
 
-/*
- * 'depth' is a number in the range 1..INT_MAX representing a number of
- * requests. Scale it with a factor (1 << bt->sb.shift) / q->nr_requests since
- * 1..(1 << bt->sb.shift) is the range expected by sbitmap_get_shallow().
- * Values larger than q->nr_requests have the same effect as q->nr_requests.
- */
-static int dd_to_word_depth(struct blk_mq_hw_ctx *hctx, unsigned int qdepth)
-{
-	struct sbitmap_queue *bt = &hctx->sched_tags->bitmap_tags;
-	const unsigned int nrr = hctx->queue->nr_requests;
-
-	return ((qdepth << bt->sb.shift) + nrr - 1) / nrr;
-}
-
 /*
  * Called by __blk_mq_alloc_request(). The shallow_depth value set by this
  * function is used by __blk_mq_get_tag().
@@ -517,7 +503,7 @@  static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
 	 * Throttle asynchronous requests and writes such that these requests
 	 * do not block the allocation of synchronous requests.
 	 */
-	data->shallow_depth = dd_to_word_depth(data->hctx, dd->async_depth);
+	data->shallow_depth = dd->async_depth;
 }
 
 /* Called by blk_mq_update_nr_requests(). */
@@ -527,9 +513,9 @@  static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)
 	struct deadline_data *dd = q->elevator->elevator_data;
 	struct blk_mq_tags *tags = hctx->sched_tags;
 
-	dd->async_depth = q->nr_requests;
+	dd->async_depth = max(1UL, 3 * q->nr_requests / 4);
 
-	sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1);
+	sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth);
 }
 
 /* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */
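
To make the restored defaults concrete, a tiny userspace sketch of the
values this revert reinstates (nr_requests is an assumed example):
async_depth goes back to 3/4 of nr_requests and is also used as the
sbitmap min_shallow_depth, so wake_batch is no longer forced down to 1.

#include <stdio.h>

int main(void)
{
	unsigned long nr_requests = 256;	/* assumed example queue depth */
	unsigned long async_depth = 3 * nr_requests / 4;

	if (async_depth < 1)			/* the max(1UL, ...) clamp above */
		async_depth = 1;

	printf("nr_requests=%lu -> async_depth=%lu (also min_shallow_depth)\n",
	       nr_requests, async_depth);
	return 0;
}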