| Message ID | 20241209115522.3741093-2-yukuai1@huaweicloud.com |
|---|---|
| State | New |
| Series | lib/sbitmap: fix shallow_depth tag allocation |
On 12/9/24 7:55 PM, Yu Kuai wrote:
> From: Yu Kuai <yukuai3@huawei.com>
>
> This reverts commit 39823b47bbd40502632ffba90ebb34fff7c8b5e8.
>
> Because tag reservation is not fixed and it will introduce a
> performance problem.

As explained in detail in the patch description and in the comments added
by that patch, commit 39823b47bbd4 ("block/mq-deadline: Fix the tag
reservation code") fixes broken code. So reverting that commit is wrong.

I think that patches 1/3 and 3/3 of this series should be combined into a
single patch.

> 1) Setting min_shallow_depth to 1 will end up setting wake_batch to 1;
>    deadline has no reason to do this. And this will cause performance
>    degradation in some high-concurrency tests, for both I/O bandwidth
>    and CPU usage.

As explained in the commit message, this is done because min_shallow_depth
must be less than or equal to shallow_depth. Additionally, mq-deadline is
not the only I/O scheduler that sets min_shallow_depth to 1. BFQ does this
too.

> 2) async_depth is nr_requests, hence shallow_depth will always be set to
>    1 << bt->sb.shift. As a consequence, no tags can be reserved.

This is not correct. dd->async_depth can be modified via sysfs.

Bart.
Hi,

On 2024/12/10 2:02, Bart Van Assche wrote:
>
>> 1) Setting min_shallow_depth to 1 will end up setting wake_batch to 1;
>>    deadline has no reason to do this. And this will cause performance
>>    degradation in some high-concurrency tests, for both I/O bandwidth
>>    and CPU usage.
>
> As explained in the commit message, this is done because min_shallow_depth
> must be less than or equal to shallow_depth. Additionally, mq-deadline is
> not the only I/O scheduler that sets min_shallow_depth to 1. BFQ does this
> too.

No, BFQ is totally different: one task from BFQ can be limited to just one
request, due to the cgroup policy of BFQ.

>
>> 2) async_depth is nr_requests, hence shallow_depth will always be set to
>>    1 << bt->sb.shift. As a consequence, no tags can be reserved.
>
> This is not correct. dd->async_depth can be modified via sysfs.

So, I'm trying to understand, there are two cases:

1) The default value of async_depth is nr_requests, and no tags can be
   reserved, right? Do we want to fix this?
2) The user must change async_depth to a lower value to make it work.

Now I understand why you want to change min_shallow_depth to 1. However, I
think we should also set min_shallow_depth while writing the sysfs
attribute as well.

>
> Bart.
> .
>
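To make case 1) above concrete, here is a small standalone sketch (not kernel code) of the word-depth scaling done by dd_to_word_depth() in commit 39823b47bbd4, which also appears in the diff at the end of this thread. The queue depth of 128 and the 64-bit sbitmap word size are assumed example values; the point is only that async_depth == nr_requests maps to the full word depth, so no tags are left reserved, while nr_requests * 3 / 4 leaves a quarter of each word reserved.

```c
#include <stdio.h>

/*
 * Standalone model of dd_to_word_depth(): scale a request count in
 * 1..nr_requests onto the 1..(1 << shift) range that
 * sbitmap_get_shallow() expects (one sbitmap word).
 */
static unsigned int to_word_depth(unsigned int qdepth, unsigned int shift,
				  unsigned int nr_requests)
{
	return ((qdepth << shift) + nr_requests - 1) / nr_requests;
}

int main(void)
{
	/* Assumed example values: 128 scheduler tags, 64 bits per sbitmap word. */
	unsigned int nr_requests = 128, shift = 6;

	/* Default: async_depth == nr_requests -> full word depth, nothing reserved. */
	printf("async_depth=%u -> shallow_depth=%u of %u\n", nr_requests,
	       to_word_depth(nr_requests, shift, nr_requests), 1u << shift);

	/* async_depth = nr_requests * 3 / 4 -> only 3/4 of each word usable. */
	printf("async_depth=%u -> shallow_depth=%u of %u\n", 3 * nr_requests / 4,
	       to_word_depth(3 * nr_requests / 4, shift, nr_requests), 1u << shift);

	return 0;
}
```

With these assumed values the first case prints shallow_depth = 64 of 64 (nothing reserved) and the second prints 48 of 64.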
Hi,
On 2024/12/10 2:02, Bart Van Assche wrote:
> This is not correct. dd->async_depth can be modified via sysfs.
How about the following patch to fix min_shallow_depth for deadline?
Thanks,
Kuai
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index a9cf8e19f9d1..040ebb0b192d 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -667,8 +667,7 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)
 	struct blk_mq_tags *tags = hctx->sched_tags;
 
 	dd->async_depth = q->nr_requests;
-
-	sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1);
+	sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth);
 }
 
 /* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */
@@ -1012,6 +1011,47 @@ SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch);
 #undef SHOW_INT
 #undef SHOW_JIFFIES
 
+static ssize_t deadline_async_depth_store(struct elevator_queue *e,
+					  const char *page, size_t count)
+{
+	struct deadline_data *dd = e->elevator_data;
+	struct request_queue *q = dd->q;
+	struct blk_mq_hw_ctx *hctx;
+	unsigned long i;
+	int v;
+	int ret = kstrtoint(page, 0, &v);
+
+	if (ret < 0)
+		return ret;
+
+	if (v < 1)
+		v = 1;
+	else if (v > dd->q->nr_requests)
+		v = dd->q->nr_requests;
+
+	if (v == dd->async_depth)
+		return count;
+
+	blk_mq_freeze_queue(q);
+	blk_mq_quiesce_queue(q);
+
+	dd->async_depth = v;
+	if (blk_mq_is_shared_tags(q->tag_set->flags)) {
+		sbitmap_queue_min_shallow_depth(
+				&q->sched_shared_tags->bitmap_tags,
+				dd->async_depth);
+	} else {
+		queue_for_each_hw_ctx(q, hctx, i)
+			sbitmap_queue_min_shallow_depth(
+					&hctx->sched_tags->bitmap_tags,
+					dd->async_depth);
+	}
+
+	blk_mq_unquiesce_queue(q);
+	blk_mq_unfreeze_queue(q);
+
+	return count;
+}
+
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
 static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
 { \
@@ -1037,7 +1077,6 @@ STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX);
 STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX);
 STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX);
 STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1);
-STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX);
 STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX);
 #undef STORE_FUNCTION
 #undef STORE_INT
Hi,

On 2024/12/10 9:50, Yu Kuai wrote:
> Hi,
>
> On 2024/12/10 2:02, Bart Van Assche wrote:
>> This is not correct. dd->async_depth can be modified via sysfs.
>
> How about the following patch to fix min_shallow_depth for deadline?
>
> Thanks,
> Kuai
>
> diff --git a/block/mq-deadline.c b/block/mq-deadline.c
> index a9cf8e19f9d1..040ebb0b192d 100644
> --- a/block/mq-deadline.c
> +++ b/block/mq-deadline.c
> @@ -667,8 +667,7 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)
>  	struct blk_mq_tags *tags = hctx->sched_tags;
>
>  	dd->async_depth = q->nr_requests;
> -
> -	sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1);
> +	sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth);
>  }
>
>  /* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */
> @@ -1012,6 +1011,47 @@ SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch);
>  #undef SHOW_INT
>  #undef SHOW_JIFFIES
>
> +static ssize_t deadline_async_depth_store(struct elevator_queue *e,
> +					  const char *page, size_t count)
> +{
> +	struct deadline_data *dd = e->elevator_data;
> +	struct request_queue *q = dd->q;
> +	struct blk_mq_hw_ctx *hctx;
> +	unsigned long i;
> +	int v;
> +	int ret = kstrtoint(page, 0, &v);
> +
> +	if (ret < 0)
> +		return ret;
> +
> +	if (v < 1)
> +		v = 1;
> +	else if (v > dd->q->nr_requests)
> +		v = dd->q->nr_requests;
> +
> +	if (v == dd->async_depth)
> +		return count;
> +
> +	blk_mq_freeze_queue(q);
> +	blk_mq_quiesce_queue(q);
> +
> +	dd->async_depth = v;
> +	if (blk_mq_is_shared_tags(q->tag_set->flags)) {
> +		sbitmap_queue_min_shallow_depth(
> +				&q->sched_shared_tags->bitmap_tags,
> +				dd->async_depth);
> +	} else {
> +		queue_for_each_hw_ctx(q, hctx, i)
> +			sbitmap_queue_min_shallow_depth(
> +					&hctx->sched_tags->bitmap_tags,
> +					dd->async_depth);
> +	}

Just realized that this is not OK: q->sysfs_lock must be held to protect
changing hctxs; however, the lock ordering is q->sysfs_lock before
eq->sysfs_lock, and this context already holds eq->sysfs_lock.

First of all, are we in agreement that it's not acceptable to sacrifice
performance in the default scenario just to ensure functional correctness
if async_depth is set to 1?

If so, the following are the options that I can think of to fix this:

1) Make async_depth read-only; if 75% of the tags will hurt performance in
   some cases, the user can increase nr_requests to prevent it.
2) Refactor the elevator sysfs API: remove eq->sysfs_lock and replace it
   with q->sysfs_lock, so deadline_async_depth_store() will be protected
   against changing hctxs, and min_shallow_depth can be updated there.
3) Other options?

Thanks,
Kuai
On 12/9/24 10:22 PM, Yu Kuai wrote:
> First of all, are we in agreement that it's not acceptable to sacrifice
> performance in the default scenario just to ensure functional correctness
> if async_depth is set to 1?

How much does this affect performance? If this affects performance
significantly, I agree that this needs to be fixed.

> If so, the following are the options that I can think of to fix this:
>
> 1) Make async_depth read-only; if 75% of the tags will hurt performance in
>    some cases, the user can increase nr_requests to prevent it.
> 2) Refactor the elevator sysfs API: remove eq->sysfs_lock and replace it
>    with q->sysfs_lock, so deadline_async_depth_store() will be protected
>    against changing hctxs, and min_shallow_depth can be updated there.
> 3) Other options?

Another option is to remove the ability to configure async_depth. If it is
too much trouble to get the implementation right without causing
regressions for existing workloads, one possibility is to remove support
for restricting the number of asynchronous requests in flight.

Thanks,

Bart.
On Wed, Dec 11, 2024 at 04:33, Bart Van Assche <bvanassche@acm.org> wrote:
>
> On 12/9/24 10:22 PM, Yu Kuai wrote:
> > First of all, are we in agreement that it's not acceptable to sacrifice
> > performance in the default scenario just to ensure functional
> > correctness if async_depth is set to 1?
>
> How much does this affect performance? If this affects performance
> significantly, I agree that this needs to be fixed.
>
> > If so, the following are the options that I can think of to fix this:
> >
> > 1) Make async_depth read-only; if 75% of the tags will hurt performance
> >    in some cases, the user can increase nr_requests to prevent it.
> > 2) Refactor the elevator sysfs API: remove eq->sysfs_lock and replace it
> >    with q->sysfs_lock, so deadline_async_depth_store() will be protected
> >    against changing hctxs, and min_shallow_depth can be updated there.
> > 3) Other options?
>
> Another option is to remove the ability to configure async_depth. If it
> is too much trouble to get the implementation right without causing
> regressions for existing workloads, one possibility is to remove support
> for restricting the number of asynchronous requests in flight.

Hi Bart,

I think it is very useful to restrict asynchronous requests via async_depth
when the I/O load is very heavy. The following is my androidbench
experiment on an Android device (sched_tag=128):

1. Set up heavy I/O:

   while true; do fio -directory=/data -direct=0 -rw=write -bs=64M
   -size=1G -numjobs=5 -name=fiotest

2. Run androidbench. Results:

                  original async_depth    async_depth=nr_requests*3/4    delta
   seq read       33.176                  216.49                         183.314
   seq write      28.57                   62.152                         33.582
   random read    1.518                   1.648                          0.13
   random write   3.546                   4.27                           0.724

Our customer also gave feedback that there is an improvement when they test
APP cold start and benchmarks after tuning async_depth.

Thanks!

>
> Thanks,
>
> Bart.
>
Hi,

On 2024/12/11 10:38, Zhiguo Niu wrote:
> On Wed, Dec 11, 2024 at 04:33, Bart Van Assche <bvanassche@acm.org> wrote:
>>
>> On 12/9/24 10:22 PM, Yu Kuai wrote:
>>> First of all, are we in agreement that it's not acceptable to sacrifice
>>> performance in the default scenario just to ensure functional
>>> correctness if async_depth is set to 1?
>>
>> How much does this affect performance? If this affects performance
>> significantly, I agree that this needs to be fixed.
>>
>>> If so, the following are the options that I can think of to fix this:
>>>
>>> 1) Make async_depth read-only; if 75% of the tags will hurt performance
>>>    in some cases, the user can increase nr_requests to prevent it.
>>> 2) Refactor the elevator sysfs API: remove eq->sysfs_lock and replace it
>>>    with q->sysfs_lock, so deadline_async_depth_store() will be protected
>>>    against changing hctxs, and min_shallow_depth can be updated there.
>>> 3) Other options?
>>
>> Another option is to remove the ability to configure async_depth. If it
>> is too much trouble to get the implementation right without causing
>> regressions for existing workloads, one possibility is to remove support
>> for restricting the number of asynchronous requests in flight.
> Hi Bart,
> I think it is very useful to restrict asynchronous requests via
> async_depth when the I/O load is very heavy. The following is my
> androidbench experiment on an Android device (sched_tag=128):
> 1. Set up heavy I/O:
>    while true; do fio -directory=/data -direct=0 -rw=write -bs=64M
>    -size=1G -numjobs=5 -name=fiotest
> 2. Run androidbench. Results:
>                   original async_depth    async_depth=nr_requests*3/4    delta
>    seq read       33.176                  216.49                         183.314
>    seq write      28.57                   62.152                         33.582
>    random read    1.518                   1.648                          0.13
>    random write   3.546                   4.27                           0.724
> Our customer also gave feedback that there is an improvement when they
> test APP cold start and benchmarks after tuning async_depth.

So do you guys write async_depth? It looks like you're using
nr_requests*3/4. If that is the case, the above option 1) still works for
you. However, in this test, I think the lower async_depth is, the better
the result you'll get.

Thanks,
Kuai

> Thanks!
>>
>> Thanks,
>>
>> Bart.
>>
> .
>
Hi,

On 2024/12/11 4:33, Bart Van Assche wrote:
> How much does this affect performance? If this affects performance
> significantly, I agree that this needs to be fixed.

We are testing megaraid-sas, with 24 jobs * 1024 iodepth, for randread and
randrw tests; v5.10 is about 20% better than v6.6. I verified that just
setting wake_batch from 1 back to 8 fixes this.

Thanks,
Kuai
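For background on why wake_batch ends up at 1: below is a rough standalone model (not the kernel code itself) of the wake-batch arithmetic that sbitmap_queue_min_shallow_depth() triggers in lib/sbitmap.c. The constants, the 128-tag depth, and the 64-bit word size are assumptions, and the exact kernel formula may differ between versions; it is only meant to illustrate that min_shallow_depth = 1 collapses wake_batch to 1, while a depth-proportional value such as 96 keeps it at 8.

```c
#include <stdio.h>

#define SBQ_WAIT_QUEUES	8	/* number of sbitmap wait queues (assumed) */
#define SBQ_WAKE_BATCH	8	/* upper bound on wake_batch (assumed) */

static unsigned int min_u(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

/*
 * Rough model of the wake_batch calculation: each word of the bitmap can
 * only contribute up to min_shallow_depth bits, and the resulting usable
 * depth is then split across the wait queues and clamped.
 */
static unsigned int calc_wake_batch(unsigned int depth, unsigned int shift,
				    unsigned int min_shallow_depth)
{
	unsigned int per_word = min_u(1u << shift, min_shallow_depth);
	unsigned int usable = (depth >> shift) * per_word +
			      min_u(depth & ((1u << shift) - 1), per_word);
	unsigned int batch = usable / SBQ_WAIT_QUEUES;

	if (batch < 1)
		batch = 1;
	if (batch > SBQ_WAKE_BATCH)
		batch = SBQ_WAKE_BATCH;
	return batch;
}

int main(void)
{
	/* Assumed example: 128 scheduler tags, 64 bits per sbitmap word. */
	unsigned int depth = 128, shift = 6;

	printf("min_shallow_depth=1  -> wake_batch=%u\n",
	       calc_wake_batch(depth, shift, 1));
	printf("min_shallow_depth=96 -> wake_batch=%u\n",
	       calc_wake_batch(depth, shift, 96));
	return 0;
}
```

With these assumed values the first case prints wake_batch = 1 and the second prints wake_batch = 8, matching the regression described above.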
On Wed, Dec 11, 2024 at 10:58, Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> Hi,
>
> On 2024/12/11 10:38, Zhiguo Niu wrote:
> > On Wed, Dec 11, 2024 at 04:33, Bart Van Assche <bvanassche@acm.org> wrote:
> >>
> >> On 12/9/24 10:22 PM, Yu Kuai wrote:
> >>> First of all, are we in agreement that it's not acceptable to
> >>> sacrifice performance in the default scenario just to ensure
> >>> functional correctness if async_depth is set to 1?
> >>
> >> How much does this affect performance? If this affects performance
> >> significantly, I agree that this needs to be fixed.
> >>
> >>> If so, the following are the options that I can think of to fix this:
> >>>
> >>> 1) Make async_depth read-only; if 75% of the tags will hurt
> >>>    performance in some cases, the user can increase nr_requests to
> >>>    prevent it.
> >>> 2) Refactor the elevator sysfs API: remove eq->sysfs_lock and replace
> >>>    it with q->sysfs_lock, so deadline_async_depth_store() will be
> >>>    protected against changing hctxs, and min_shallow_depth can be
> >>>    updated there.
> >>> 3) Other options?
> >>
> >> Another option is to remove the ability to configure async_depth. If it
> >> is too much trouble to get the implementation right without causing
> >> regressions for existing workloads, one possibility is to remove
> >> support for restricting the number of asynchronous requests in flight.
> > Hi Bart,
> > I think it is very useful to restrict asynchronous requests via
> > async_depth when the I/O load is very heavy. The following is my
> > androidbench experiment on an Android device (sched_tag=128):
> > 1. Set up heavy I/O:
> >    while true; do fio -directory=/data -direct=0 -rw=write -bs=64M
> >    -size=1G -numjobs=5 -name=fiotest
> > 2. Run androidbench. Results:
> >                   original async_depth    async_depth=nr_requests*3/4    delta
> >    seq read       33.176                  216.49                         183.314
> >    seq write      28.57                   62.152                         33.582
> >    random read    1.518                   1.648                          0.13
> >    random write   3.546                   4.27                           0.724
> > Our customer also gave feedback that there is an improvement when they
> > test APP cold start and benchmarks after tuning async_depth.
>
> So do you guys write async_depth? It looks like you're using
> nr_requests*3/4. If that is the case, the above option 1) still works for
> you. However, in this test, I think the lower async_depth is, the better
> the result you'll get.

Hi Kuai,

Yes, we modify async_depth to nr_requests*3/4 via sysfs.

Thanks!

> Thanks,
> Kuai
>
> > Thanks!
> >>
> >> Thanks,
> >>
> >> Bart.
> >>
> > .
> >
>
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 91b3789f710e..1f0d175a941e 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -487,20 +487,6 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 	return rq;
 }
 
-/*
- * 'depth' is a number in the range 1..INT_MAX representing a number of
- * requests. Scale it with a factor (1 << bt->sb.shift) / q->nr_requests since
- * 1..(1 << bt->sb.shift) is the range expected by sbitmap_get_shallow().
- * Values larger than q->nr_requests have the same effect as q->nr_requests.
- */
-static int dd_to_word_depth(struct blk_mq_hw_ctx *hctx, unsigned int qdepth)
-{
-	struct sbitmap_queue *bt = &hctx->sched_tags->bitmap_tags;
-	const unsigned int nrr = hctx->queue->nr_requests;
-
-	return ((qdepth << bt->sb.shift) + nrr - 1) / nrr;
-}
-
 /*
  * Called by __blk_mq_alloc_request(). The shallow_depth value set by this
  * function is used by __blk_mq_get_tag().
@@ -517,7 +503,7 @@ static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
 	 * Throttle asynchronous requests and writes such that these requests
 	 * do not block the allocation of synchronous requests.
 	 */
-	data->shallow_depth = dd_to_word_depth(data->hctx, dd->async_depth);
+	data->shallow_depth = dd->async_depth;
 }
 
 /* Called by blk_mq_update_nr_requests(). */
@@ -527,9 +513,9 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)
 	struct deadline_data *dd = q->elevator->elevator_data;
 	struct blk_mq_tags *tags = hctx->sched_tags;
 
-	dd->async_depth = q->nr_requests;
+	dd->async_depth = max(1UL, 3 * q->nr_requests / 4);
 
-	sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1);
+	sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth);
 }
 
 /* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */
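As a quick sanity check of the new default in this diff, here is a tiny standalone sketch (assumed queue depths, not kernel code). With a typical nr_requests of 128 the default becomes async_depth = 96, which keeps min_shallow_depth large enough that, per the wake-batch sketch earlier in the thread, wake_batch stays at 8 instead of dropping to 1.

```c
#include <stdio.h>

/* Mirrors the new default: max(1UL, 3 * q->nr_requests / 4). */
static unsigned long default_async_depth(unsigned long nr_requests)
{
	unsigned long depth = 3 * nr_requests / 4;

	return depth > 1 ? depth : 1;
}

int main(void)
{
	/* Assumed example queue depths; nr_requests = 1 shows the lower clamp. */
	unsigned long depths[] = { 1, 128, 256 };

	for (unsigned int i = 0; i < sizeof(depths) / sizeof(depths[0]); i++)
		printf("nr_requests=%3lu -> async_depth=%lu\n",
		       depths[i], default_async_depth(depths[i]));
	return 0;
}
```

This prints async_depth values of 1, 96, and 192 for the three assumed queue depths.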