| Message ID | 20230618160738.54385-3-yukuai1@huaweicloud.com (mailing list archive) |
|---|---|
| State | New, archived |
| Series | blk-mq: improve tag fair sharing |
On 6/18/23 18:07, Yu Kuai wrote:
> From: Yu Kuai <yukuai3@huawei.com>
>
> Start tag fair sharing when a device start to issue io will waste
> resources, same number of tags will be assigned to each disk/hctx,
> and such tags can't be used for other disk/hctx, which means a disk/hctx
> can't use more than assinged tags even if there are still lots of tags
> that is assinged to other disks are unused.
>
> Add a new api blk_mq_driver_tag_busy(), it will be called when get
> driver tag failed, and move tag sharing from blk_mq_tag_busy() to
> blk_mq_driver_tag_busy().
>
> This approch will work well if total tags are not exhausted, and follow
> up patches will try to refactor how tag is shared to handle this case.
>
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
>  block/blk-mq-debugfs.c |  4 ++-
>  block/blk-mq-tag.c     | 60 ++++++++++++++++++++++++++++++++++--------
>  block/blk-mq.c         |  4 ++-
>  block/blk-mq.h         | 13 ++++++---
>  include/linux/blk-mq.h |  6 +++--
>  include/linux/blkdev.h |  1 +
>  6 files changed, 70 insertions(+), 18 deletions(-)
>
> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> index 431aaa3eb181..de5a911b07c2 100644
> --- a/block/blk-mq-debugfs.c
> +++ b/block/blk-mq-debugfs.c
> @@ -400,8 +400,10 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m,
>  {
>  	seq_printf(m, "nr_tags=%u\n", tags->nr_tags);
>  	seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags);
> -	seq_printf(m, "active_queues=%d\n",
> +	seq_printf(m, "active_queues=%u\n",
>  		   READ_ONCE(tags->ctl.active_queues));
> +	seq_printf(m, "share_queues=%u\n",
> +		   READ_ONCE(tags->ctl.share_queues));
>
>  	seq_puts(m, "\nbitmap_tags:\n");
>  	sbitmap_queue_show(&tags->bitmap_tags, m);
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index fe41a0d34fc0..1c2bde917195 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -29,6 +29,32 @@ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
>  			users);
>  }
>
> +void __blk_mq_driver_tag_busy(struct blk_mq_hw_ctx *hctx)
> +{
> +	struct blk_mq_tags *tags = hctx->tags;
> +
> +	/*
> +	 * calling test_bit() prior to test_and_set_bit() is intentional,
> +	 * it avoids dirtying the cacheline if the queue is already active.
> +	 */
> +	if (blk_mq_is_shared_tags(hctx->flags)) {
> +		struct request_queue *q = hctx->queue;
> +
> +		if (test_bit(QUEUE_FLAG_HCTX_BUSY, &q->queue_flags) ||
> +		    test_and_set_bit(QUEUE_FLAG_HCTX_BUSY, &q->queue_flags))
> +			return;
> +	} else {
> +		if (test_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state) ||
> +		    test_and_set_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state))
> +			return;
> +	}
> +
> +	spin_lock_irq(&tags->lock);
> +	WRITE_ONCE(tags->ctl.share_queues, tags->ctl.active_queues);
> +	blk_mq_update_wake_batch(tags, tags->ctl.share_queues);
> +	spin_unlock_irq(&tags->lock);
> +}
> +
>  /*
>   * If a previously inactive queue goes active, bump the active user count.
>   * We need to do this before try to allocate driver tag, then even if fail
> @@ -37,7 +63,6 @@ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
>   */
>  void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
>  {
> -	unsigned int users;
>  	struct blk_mq_tags *tags = hctx->tags;
>
>  	/*
> @@ -57,9 +82,7 @@ void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
>  	}
>
>  	spin_lock_irq(&tags->lock);
> -	users = tags->ctl.active_queues + 1;
> -	WRITE_ONCE(tags->ctl.active_queues, users);
> -	blk_mq_update_wake_batch(tags, users);
> +	WRITE_ONCE(tags->ctl.active_queues, tags->ctl.active_queues + 1);

Why did you remove the call to blk_mq_update_wake_batch() here?

Cheers,

Hannes
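For concreteness, here is a rough model of the fair-sharing cap the commit message describes (a sketch with hypothetical helper names; the real check is hctx_may_queue(), which divides the shared pool by the number of counted users, rounding up, with a small floor):

```c
#include <stdbool.h>

/*
 * Sketch of blk-mq fair tag sharing (hypothetical helpers, not kernel
 * code): once a queue counts as a user of a shared tag set, every user
 * is capped at roughly total_tags / users even if the others are idle,
 * e.g. 64 tags shared by 4 queues caps each queue at 16 tags.
 */
static unsigned int fair_share_depth(unsigned int total_tags,
				     unsigned int users)
{
	unsigned int depth;

	if (!users)
		return total_tags;	/* sharing not in effect */

	depth = (total_tags + users - 1) / users;	/* round up */
	return depth > 4 ? depth : 4;	/* always allow a few tags */
}

static bool may_queue(unsigned int in_flight, unsigned int total_tags,
		      unsigned int users)
{
	return in_flight < fair_share_depth(total_tags, users);
}
```

This is the waste the patch targets: before the change, the cap kicks in as soon as a queue issues any I/O, not when tags are actually contended.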
Hi,

On 2023/06/19 13:55, Hannes Reinecke wrote:
> On 6/18/23 18:07, Yu Kuai wrote:
>> From: Yu Kuai <yukuai3@huawei.com>
>>
>> Start tag fair sharing when a device start to issue io will waste
>> resources, same number of tags will be assigned to each disk/hctx,
>> and such tags can't be used for other disk/hctx, which means a disk/hctx
>> can't use more than assinged tags even if there are still lots of tags
>> that is assinged to other disks are unused.
>>
>> Add a new api blk_mq_driver_tag_busy(), it will be called when get
>> driver tag failed, and move tag sharing from blk_mq_tag_busy() to
>> blk_mq_driver_tag_busy().
>>
>> This approch will work well if total tags are not exhausted, and follow
>> up patches will try to refactor how tag is shared to handle this case.
>>
>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>> ---
>>  block/blk-mq-debugfs.c |  4 ++-
>>  block/blk-mq-tag.c     | 60 ++++++++++++++++++++++++++++++++++--------
>>  block/blk-mq.c         |  4 ++-
>>  block/blk-mq.h         | 13 ++++++---
>>  include/linux/blk-mq.h |  6 +++--
>>  include/linux/blkdev.h |  1 +
>>  6 files changed, 70 insertions(+), 18 deletions(-)
>>
>> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
>> index 431aaa3eb181..de5a911b07c2 100644
>> --- a/block/blk-mq-debugfs.c
>> +++ b/block/blk-mq-debugfs.c
>> @@ -400,8 +400,10 @@ static void blk_mq_debugfs_tags_show(struct
>> seq_file *m,
>>  {
>>  	seq_printf(m, "nr_tags=%u\n", tags->nr_tags);
>>  	seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags);
>> -	seq_printf(m, "active_queues=%d\n",
>> +	seq_printf(m, "active_queues=%u\n",
>>  		   READ_ONCE(tags->ctl.active_queues));
>> +	seq_printf(m, "share_queues=%u\n",
>> +		   READ_ONCE(tags->ctl.share_queues));
>>  	seq_puts(m, "\nbitmap_tags:\n");
>>  	sbitmap_queue_show(&tags->bitmap_tags, m);
>> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
>> index fe41a0d34fc0..1c2bde917195 100644
>> --- a/block/blk-mq-tag.c
>> +++ b/block/blk-mq-tag.c
>> @@ -29,6 +29,32 @@ static void blk_mq_update_wake_batch(struct
>> blk_mq_tags *tags,
>>  			users);
>>  }
>> +void __blk_mq_driver_tag_busy(struct blk_mq_hw_ctx *hctx)
>> +{
>> +	struct blk_mq_tags *tags = hctx->tags;
>> +
>> +	/*
>> +	 * calling test_bit() prior to test_and_set_bit() is intentional,
>> +	 * it avoids dirtying the cacheline if the queue is already active.
>> +	 */
>> +	if (blk_mq_is_shared_tags(hctx->flags)) {
>> +		struct request_queue *q = hctx->queue;
>> +
>> +		if (test_bit(QUEUE_FLAG_HCTX_BUSY, &q->queue_flags) ||
>> +		    test_and_set_bit(QUEUE_FLAG_HCTX_BUSY, &q->queue_flags))
>> +			return;
>> +	} else {
>> +		if (test_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state) ||
>> +		    test_and_set_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state))
>> +			return;
>> +	}
>> +
>> +	spin_lock_irq(&tags->lock);
>> +	WRITE_ONCE(tags->ctl.share_queues, tags->ctl.active_queues);
>> +	blk_mq_update_wake_batch(tags, tags->ctl.share_queues);
>> +	spin_unlock_irq(&tags->lock);
>> +}
>> +
>>  /*
>>   * If a previously inactive queue goes active, bump the active user
>> count.
>>   * We need to do this before try to allocate driver tag, then even
>> if fail
>> @@ -37,7 +63,6 @@ static void blk_mq_update_wake_batch(struct
>> blk_mq_tags *tags,
>>   */
>>  void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
>>  {
>> -	unsigned int users;
>>  	struct blk_mq_tags *tags = hctx->tags;
>>  	/*
>> @@ -57,9 +82,7 @@ void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
>>  	}
>>  	spin_lock_irq(&tags->lock);
>> -	users = tags->ctl.active_queues + 1;
>> -	WRITE_ONCE(tags->ctl.active_queues, users);
>> -	blk_mq_update_wake_batch(tags, users);
>> +	WRITE_ONCE(tags->ctl.active_queues, tags->ctl.active_queues + 1);
>
> Why did you remove the call to blk_mq_update_wake_batch() here?

blk_mq_update_wake_batch() should be called when the number of available
tags changes; however, active_queues is no longer used by hctx_may_queue()
to calculate the available tags. share_queues is used instead, and it is
updated in the new helper blk_mq_driver_tag_busy().

Thanks,
Kuai

>
> Cheers,
>
> Hannes
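To restate Kuai's answer as code, a minimal sketch of the two counters' lifecycle (assumed, simplified helpers that leave out the patch's locking and busy-flag handling):

```c
/*
 * Simplified model of the patch's two-stage accounting (illustrative
 * only, not the kernel code): active_queues grows as soon as a queue
 * issues I/O, but share_queues -- the divisor hctx_may_queue() actually
 * uses -- only catches up when a driver-tag allocation fails. Since the
 * wake batch depends on the per-queue share, it is recomputed where
 * share_queues changes, not where active_queues changes.
 */
struct tag_sharing_ctl {
	unsigned int active_queues;	/* queues that issued I/O */
	unsigned int share_queues;	/* queues the pool is split across */
};

static void tag_busy(struct tag_sharing_ctl *ctl)
{
	ctl->active_queues++;		/* sharing not triggered yet */
}

static void driver_tag_busy(struct tag_sharing_ctl *ctl)
{
	ctl->share_queues = ctl->active_queues;	/* start fair sharing */
	/* the patch recomputes the wake batch here... */
}

static void tag_idle(struct tag_sharing_ctl *ctl)
{
	ctl->active_queues--;
	ctl->share_queues = ctl->active_queues;	/* shrink the split */
	/* ...and here, where the share changes again */
}
```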
On 6/18/23 09:07, Yu Kuai wrote:
> Start tag fair sharing when a device start to issue io will waste
> resources, same number of tags will be assigned to each disk/hctx,
> and such tags can't be used for other disk/hctx, which means a disk/hctx
> can't use more than assinged tags even if there are still lots of tags
                      ^^^^^^^^
                      assigned
> that is assinged to other disks are unused.
          ^^^^^^^^
          assigned
> Add a new api blk_mq_driver_tag_busy(), it will be called when get
> driver tag failed, and move tag sharing from blk_mq_tag_busy() to
> blk_mq_driver_tag_busy().

> +	spin_lock_irq(&tags->lock);
> +	WRITE_ONCE(tags->ctl.share_queues, tags->ctl.active_queues);
> +	blk_mq_update_wake_batch(tags, tags->ctl.share_queues);
> +	spin_unlock_irq(&tags->lock);
> +}

Are all tags->ctl.share_queues changes protected by tags->lock? If so,
please use a regular assignment to update that member variable instead of
using WRITE_ONCE().

> @@ -735,6 +736,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
>
>  struct tag_sharing_ctl {
>  	unsigned int active_queues;
> +	unsigned int share_queues;
>  };

Please rename "share_queues" into "shared_queues" such that the names of
both struct members start with an adjective.

Thanks,

Bart.
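A kernel-style sketch of the rule Bart is invoking (illustrative, not the actual patch): the *_ONCE() accessors are for accesses that can take part in a data race; writers that all hold the same lock cannot race one another, so the write side can be a plain store, while a lockless reader still wants READ_ONCE():

```c
/* All writers of ->share_queues hold tags->lock in this sketch: */
static void update_share_queues(struct blk_mq_tags *tags)
{
	spin_lock_irq(&tags->lock);
	tags->ctl.share_queues = tags->ctl.active_queues; /* plain store */
	spin_unlock_irq(&tags->lock);
}

/* Readers such as hctx_may_queue() run without the lock: */
static unsigned int read_share_queues(struct blk_mq_tags *tags)
{
	/* READ_ONCE() stops the compiler tearing or re-loading the read. */
	return READ_ONCE(tags->ctl.share_queues);
}
```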
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 431aaa3eb181..de5a911b07c2 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -400,8 +400,10 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m,
 {
 	seq_printf(m, "nr_tags=%u\n", tags->nr_tags);
 	seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags);
-	seq_printf(m, "active_queues=%d\n",
+	seq_printf(m, "active_queues=%u\n",
 		   READ_ONCE(tags->ctl.active_queues));
+	seq_printf(m, "share_queues=%u\n",
+		   READ_ONCE(tags->ctl.share_queues));
 
 	seq_puts(m, "\nbitmap_tags:\n");
 	sbitmap_queue_show(&tags->bitmap_tags, m);
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index fe41a0d34fc0..1c2bde917195 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -29,6 +29,32 @@ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
 			users);
 }
 
+void __blk_mq_driver_tag_busy(struct blk_mq_hw_ctx *hctx)
+{
+	struct blk_mq_tags *tags = hctx->tags;
+
+	/*
+	 * calling test_bit() prior to test_and_set_bit() is intentional,
+	 * it avoids dirtying the cacheline if the queue is already active.
+	 */
+	if (blk_mq_is_shared_tags(hctx->flags)) {
+		struct request_queue *q = hctx->queue;
+
+		if (test_bit(QUEUE_FLAG_HCTX_BUSY, &q->queue_flags) ||
+		    test_and_set_bit(QUEUE_FLAG_HCTX_BUSY, &q->queue_flags))
+			return;
+	} else {
+		if (test_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state) ||
+		    test_and_set_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state))
+			return;
+	}
+
+	spin_lock_irq(&tags->lock);
+	WRITE_ONCE(tags->ctl.share_queues, tags->ctl.active_queues);
+	blk_mq_update_wake_batch(tags, tags->ctl.share_queues);
+	spin_unlock_irq(&tags->lock);
+}
+
 /*
  * If a previously inactive queue goes active, bump the active user count.
  * We need to do this before try to allocate driver tag, then even if fail
@@ -37,7 +63,6 @@ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
  */
 void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
 {
-	unsigned int users;
 	struct blk_mq_tags *tags = hctx->tags;
 
 	/*
@@ -57,9 +82,7 @@ void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
 	}
 
 	spin_lock_irq(&tags->lock);
-	users = tags->ctl.active_queues + 1;
-	WRITE_ONCE(tags->ctl.active_queues, users);
-	blk_mq_update_wake_batch(tags, users);
+	WRITE_ONCE(tags->ctl.active_queues, tags->ctl.active_queues + 1);
 	spin_unlock_irq(&tags->lock);
 }
 
@@ -73,6 +96,14 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
 		sbitmap_queue_wake_all(&tags->breserved_tags);
 }
 
+static void __blk_mq_driver_tag_idle(struct blk_mq_hw_ctx *hctx)
+{
+	if (blk_mq_is_shared_tags(hctx->flags))
+		clear_bit(QUEUE_FLAG_HCTX_BUSY, &hctx->queue->queue_flags);
+	else
+		clear_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state);
+}
+
 /*
  * If a previously busy queue goes inactive, potential waiters could now
  * be allowed to queue. Wake them up and check.
@@ -80,7 +111,6 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
 void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 {
 	struct blk_mq_tags *tags = hctx->tags;
-	unsigned int users;
 
 	if (blk_mq_is_shared_tags(hctx->flags)) {
 		struct request_queue *q = hctx->queue;
@@ -94,9 +124,10 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 	}
 
 	spin_lock_irq(&tags->lock);
-	users = tags->ctl.active_queues - 1;
-	WRITE_ONCE(tags->ctl.active_queues, users);
-	blk_mq_update_wake_batch(tags, users);
+	__blk_mq_driver_tag_idle(hctx);
+	WRITE_ONCE(tags->ctl.active_queues, tags->ctl.active_queues - 1);
+	WRITE_ONCE(tags->ctl.share_queues, tags->ctl.active_queues);
+	blk_mq_update_wake_batch(tags, tags->ctl.share_queues);
 	spin_unlock_irq(&tags->lock);
 
 	blk_mq_tag_wakeup_all(tags, false);
@@ -105,14 +136,21 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
 			    struct sbitmap_queue *bt)
 {
+	int ret = BLK_MQ_NO_TAG;
+
 	if (!data->q->elevator && !(data->flags & BLK_MQ_REQ_RESERVED) &&
 	    !hctx_may_queue(data->hctx, bt))
-		return BLK_MQ_NO_TAG;
+		goto out;
 
+	/* shallow_depth is only used for elevator */
 	if (data->shallow_depth)
 		return sbitmap_queue_get_shallow(bt, data->shallow_depth);
-	else
-		return __sbitmap_queue_get(bt);
+
+	ret = __sbitmap_queue_get(bt);
+out:
+	if (ret == BLK_MQ_NO_TAG && !(data->rq_flags & RQF_SCHED_TAGS))
+		blk_mq_driver_tag_busy(data->hctx);
+	return ret;
 }
 
 unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index da650a2c4ca1..171ee4ac97ef 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1753,8 +1753,10 @@ static bool __blk_mq_alloc_driver_tag(struct request *rq)
 
 bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
-	if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq))
+	if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq)) {
+		blk_mq_driver_tag_busy(hctx);
 		return false;
+	}
 
 	if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
 	    !(rq->rq_flags & RQF_MQ_INFLIGHT)) {
diff --git a/block/blk-mq.h b/block/blk-mq.h
index ca1c13127868..01441a5e9910 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -193,8 +193,9 @@ static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
 	return sbq_wait_ptr(bt, &hctx->wait_index);
 }
 
-void __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
-void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
+void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx);
+void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx);
+void __blk_mq_driver_tag_busy(struct blk_mq_hw_ctx *hctx);
 
 static inline void blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
 {
@@ -208,6 +209,12 @@ static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 		__blk_mq_tag_idle(hctx);
 }
 
+static inline void blk_mq_driver_tag_busy(struct blk_mq_hw_ctx *hctx)
+{
+	if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+		__blk_mq_driver_tag_busy(hctx);
+}
+
 static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
 					  unsigned int tag)
 {
@@ -412,7 +419,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
 			return true;
 	}
 
-	users = READ_ONCE(hctx->tags->ctl.active_queues);
+	users = READ_ONCE(hctx->tags->ctl.share_queues);
 	if (!users)
 		return true;
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 8d2cd6b9d305..bc3ac22edb07 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -675,10 +675,11 @@ enum {
 
 	BLK_MQ_S_STOPPED	= 0,
 	BLK_MQ_S_TAG_ACTIVE	= 1,
-	BLK_MQ_S_SCHED_RESTART	= 2,
+	BLK_MQ_S_DTAG_BUSY	= 2,
+	BLK_MQ_S_SCHED_RESTART	= 3,
 
 	/* hw queue is inactive after all its CPUs become offline */
-	BLK_MQ_S_INACTIVE	= 3,
+	BLK_MQ_S_INACTIVE	= 4,
 
 	BLK_MQ_MAX_DEPTH	= 10240,
 
@@ -735,6 +736,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 
 struct tag_sharing_ctl {
 	unsigned int active_queues;
+	unsigned int share_queues;
 };
 
 /*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ed44a997f629..0994707f6a68 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -546,6 +546,7 @@ struct request_queue {
 #define QUEUE_FLAG_DAX		19	/* device supports DAX */
 #define QUEUE_FLAG_STATS	20	/* track IO start and completion times */
 #define QUEUE_FLAG_REGISTERED	22	/* queue has been registered to a disk */
+#define QUEUE_FLAG_HCTX_BUSY	23	/* at least one blk-mq hctx failed to get driver tag */
 #define QUEUE_FLAG_QUIESCED	24	/* queue has been quiesced */
 #define QUEUE_FLAG_PCI_P2PDMA	25	/* device supports PCI p2p requests */
 #define QUEUE_FLAG_ZONE_RESETALL 26	/* supports Zone Reset All */
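Worth calling out one idiom the new __blk_mq_driver_tag_busy() relies on: checking test_bit() before test_and_set_bit() keeps the common already-set case read-only, so repeated callers do not bounce the cacheline into exclusive state. In isolation, as a sketch (mark_busy_once() is a hypothetical wrapper; test_bit()/test_and_set_bit() are the real kernel bitops):

```c
/*
 * Hypothetical helper (not in the patch) showing the idiom used by
 * __blk_mq_driver_tag_busy(): the read-only test_bit() fast path leaves
 * the cacheline in shared state, and only the first caller pays for the
 * atomic read-modify-write of test_and_set_bit().
 */
static bool mark_busy_once(unsigned long *state, int bit)
{
	if (test_bit(bit, state))
		return false;			/* already set: no write issued */
	return !test_and_set_bit(bit, state);	/* true iff we set it first */
}
```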