Message ID: 20210712031818.31918-1-yukuai3@huawei.com (mailing list archive)
State: New, archived
Series: blk-mq: allow hardware queue to get more tag while sharing a tag set
On 2021/07/12 11:18, Yu Kuai wrote:
> If there are multiple active queues while sharing a tag set, it's not
> necessary to limit the available tags as same share for each active queue
> if no one ever failed to get driver tag. And fall back to same share if
> someone do failed to get driver tag.
>
> This modification will be beneficial if total queue_depth of disks
> on the same host is less than total tags.

Sorry, this should be "more than"; I'll correct it in the next iteration if
anyone thinks this patch is acceptable.

Thanks,
Yu Kuai

> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
> [...]
On 2021/07/12 11:18, Yu Kuai wrote:
> If there are multiple active queues while sharing a tag set, it's not
> necessary to limit the available tags as same share for each active queue
> if no one ever failed to get driver tag. And fall back to same share if
> someone do failed to get driver tag.
>
> This modification will be beneficial if total queue_depth of disks
> on the same host is less than total tags.
>
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
> [...]

ping ...
On 2021/07/20 20:33, yukuai (C) wrote:
> On 2021/07/12 11:18, Yu Kuai wrote:
>> If there are multiple active queues while sharing a tag set, it's not
>> necessary to limit the available tags as same share for each active queue
>> if no one ever failed to get driver tag. And fall back to same share if
>> someone do failed to get driver tag.
>>
>> This modification will be beneficial if total queue_depth of disks
>> on the same host is less than total tags.
>>
>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>> ---
>> [...]
>
> ping ...

ping ...
On 7/11/21 8:18 PM, Yu Kuai wrote:
> If there are multiple active queues while sharing a tag set, it's not
> necessary to limit the available tags as same share for each active queue
> if no one ever failed to get driver tag. And fall back to same share if
> someone do failed to get driver tag.
>
> This modification will be beneficial if total queue_depth of disks
> on the same host is less than total tags.

This patch adds new atomic operations in the hot path and hence probably has
a negative performance impact. What is the performance impact of this patch
for e.g. null_blk when submitting I/O from all CPU cores?

Thanks,

Bart.
On 2021/08/01 1:15, Bart Van Assche wrote:
> On 7/11/21 8:18 PM, Yu Kuai wrote:
>> [...]
>
> This patch adds new atomic operations in the hot path and hence probably
> has a negative performance impact. What is the performance impact of
> this patch for e.g. null_blk when submitting I/O from all CPU cores?
>
> Thanks,
>
> Bart.

Hi, Bart

I ran a test on both null_blk and nvme; the results show no performance
degradation:

test platform: x86
test cpu: 2 nodes, total 72
test scheduler: none
test device: null_blk / nvme

test cmd:
fio -filename=/dev/xxx -name=test -ioengine=libaio -direct=1 -numjobs=72
    -iodepth=16 -bs=4k -rw=write -offset_increment=1G -cpus_allowed=0:71
    -cpus_allowed_policy=split -group_reporting -runtime=120

test results: iops
1) null_blk before this patch: 280k
2) null_blk after this patch:  282k
3) nvme before this patch:     378k
4) nvme after this patch:      384k

details:

1) null_blk before this patch:
write: IOPS=279k, BW=1091MiB/s (1144MB/s)(128GiB/120009msec); 0 zone resets
iops : min=8405, max=942588, avg=283276.25, stdev=6464.60, samples=17006
[...]
WRITE: bw=1091MiB/s (1144MB/s), io=128GiB (137GB), run=120009-120009msec

2) null_blk after this patch:
write: IOPS=281k, BW=1097MiB/s (1150MB/s)(129GiB/120009msec); 0 zone resets
iops : min=5906, max=1125367, avg=286585.88, stdev=7075.29, samples=16872
[...]
WRITE: bw=1097MiB/s (1150MB/s), io=129GiB (138GB), run=120009-120009msec

3) nvme before this patch:
write: IOPS=382k, BW=1491MiB/s (1564MB/s)(175GiB/120113msec); 0 zone resets
iops : min=200590, max=613014, avg=382076.48, stdev=1424.37, samples=17278
[...]
WRITE: bw=1491MiB/s (1564MB/s), io=175GiB (188GB), run=120113-120113msec

4) nvme after this patch:
write: IOPS=383k, BW=1496MiB/s (1569MB/s)(175GiB/120098msec); 0 zone resets
iops : min=211931, max=575591, avg=383228.21, stdev=1411.19, samples=17278
[...]
WRITE: bw=1496MiB/s (1569MB/s), io=175GiB (188GB), run=120098-120098msec

Thanks,
Kuai
On 8/2/21 6:34 AM, yukuai (C) wrote:
> I ran a test on both null_blk and nvme; the results show no performance
> degradation:
>
> test platform: x86
> test cpu: 2 nodes, total 72
> test scheduler: none
> test device: null_blk / nvme
>
> test cmd:
> fio -filename=/dev/xxx -name=test -ioengine=libaio -direct=1 -numjobs=72
>     -iodepth=16 -bs=4k -rw=write -offset_increment=1G -cpus_allowed=0:71
>     -cpus_allowed_policy=split -group_reporting -runtime=120
>
> test results: iops
> 1) null_blk before this patch: 280k
> 2) null_blk after this patch:  282k
> 3) nvme before this patch:     378k
> 4) nvme after this patch:      384k

Please use io_uring for performance tests.

The null_blk numbers seem way too low to me. If I run a null_blk performance
test inside a VM with 6 CPU cores (Xeon W-2135 CPU) I see about 6 million
IOPS for synchronous I/O and about 4.4 million IOPS when using libaio. The
options I used and that are not in the above command line are: --thread
--gtod_reduce=1 --ioscheduler=none.

Thanks,

Bart.
On 2021/08/03 0:17, Bart Van Assche wrote:
> On 8/2/21 6:34 AM, yukuai (C) wrote:
>> [...]
>
> Please use io_uring for performance tests.
>
> The null_blk numbers seem way too low to me. If I run a null_blk
> performance test inside a VM with 6 CPU cores (Xeon W-2135 CPU) I see
> about 6 million IOPS for synchronous I/O and about 4.4 million IOPS when
> using libaio. The options I used and that are not in the above command
> line are: --thread --gtod_reduce=1 --ioscheduler=none.

Hi, Bart

The cpu I'm testing is Intel(R) Xeon(R) Gold 6140 CPU @ 2.30GHz, and after
switching to io_uring with "--thread --gtod_reduce=1 --ioscheduler=none",
the numbers increase to 330k, which is still far behind 6000k.

The new atomic operation in the hot path is the atomic_read() in
hctx_may_queue(), and the atomic variable changes in two situations:

a. a queue fails to get a driver tag with dbusy not set: increase the
   counter and set dbusy.
b. dbusy is set when a queue switches from busy to idle: decrease the
   counter and clear dbusy.

During the period a device goes "idle -> busy -> idle", the new atomic
variable is written at most twice, which means it is almost read-only in
the above test situation. So I guess the impact on performance is minimal?

Thanks!
Kuai
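For reference, here is a minimal sketch of the behaviour described above: the
fair-share limit is only enforced once some queue has actually failed to get
a driver tag, otherwise the check short-circuits on a single atomic read.
This is a simplified Python model, not the kernel code; the function name,
the integer division and the max(1, ...) floor are illustrative assumptions,
and the real hctx_may_queue() uses its own helpers and rounding.

# Simplified model of the limit check with this patch applied (assumed
# behaviour, reconstructed from the posted diff; not kernel code).
def may_queue(total_tags, active_queues, pending_queues, in_flight):
    """Return True if this hw queue may allocate another driver tag."""
    if pending_queues == 0:
        # No queue has failed to get a driver tag yet, so nobody is
        # restricted to a fair share (the new fast path).
        return True
    if active_queues == 0:
        return True
    # Fall back to the old behaviour: roughly the same share per
    # active queue.
    fair_share = max(1, total_tags // active_queues)
    return in_flight < fair_share

# Example: 1024 shared tags, 4 active queues.
print(may_queue(1024, 4, 0, 900))   # True: no one is starved, no limit
print(may_queue(1024, 4, 1, 900))   # False: 1024/4 = 256 limit applies
print(may_queue(1024, 4, 1, 100))   # True: under the fair share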
On 8/2/21 7:57 PM, yukuai (C) wrote:
> The cpu I'm testing is Intel(R) Xeon(R) Gold 6140 CPU @ 2.30GHz, and after
> switching to io_uring with "--thread --gtod_reduce=1 --ioscheduler=none",
> the numbers increase to 330k, which is still far behind 6000k.

On
https://ark.intel.com/content/www/us/en/ark/products/120485/intel-xeon-gold-6140-processor-24-75m-cache-2-30-ghz.html
I found the following information about that CPU:
18 CPU cores
36 hyperthreads

so 36 fio jobs should be sufficient. Maybe IOPS are lower than expected
because of how null_blk has been configured? This is the configuration that
I used in my test:

modprobe null_blk nr_devices=0 &&
    udevadm settle &&
    cd /sys/kernel/config/nullb &&
    mkdir nullb0 &&
    cd nullb0 &&
    echo 0 > completion_nsec &&
    echo 512 > blocksize &&
    echo 0 > home_node &&
    echo 0 > irqmode &&
    echo 1024 > size &&
    echo 0 > memory_backed &&
    echo 2 > queue_mode &&
    echo 1 > power ||
    exit $?

> The new atomic operation in the hot path is the atomic_read() in
> hctx_may_queue(), and the atomic variable changes in two situations:
>
> a. a queue fails to get a driver tag with dbusy not set: increase the
>    counter and set dbusy.
> b. dbusy is set when a queue switches from busy to idle: decrease the
>    counter and clear dbusy.
>
> During the period a device goes "idle -> busy -> idle", the new atomic
> variable is written at most twice, which means it is almost read-only in
> the above test situation. So I guess the impact on performance is minimal?

Please measure the performance impact of your patch.

Thanks,

Bart.
On 2021/08/04 2:38, Bart Van Assche wrote:
> [...]
>
> so 36 fio jobs should be sufficient. Maybe IOPS are lower than expected
> because of how null_blk has been configured? This is the configuration
> that I used in my test:
>
> modprobe null_blk nr_devices=0 &&
> [...]
> echo 1 > power ||
> exit $?

Hi Bart,

After applying this configuration, the null_blk numbers on my machine are
about 650k (330k before). Is this still too low?

By the way, there is no performance degradation.

Thanks,
Kuai

> Please measure the performance impact of your patch.
>
> Thanks,
>
> Bart.
On 8/5/21 6:50 PM, yukuai (C) wrote:
> After applying this configuration, the null_blk numbers on my machine are
> about 650k (330k before). Is this still too low?

That seems low to me. If I run the attached script on a six year old desktop
with an eight core i7-4790 CPU it reports a little more than 5 million IOPS.
Has kernel debugging perhaps been enabled in the kernel on the test setup?
Or is the system perhaps slowed down by security mitigations?

> By the way, there is no performance degradation.

Please wait with drawing a conclusion until you can run a workload on your
setup of several million IOPS.

Thanks,

Bart.

#!/bin/bash

if [ -e /sys/kernel/config/nullb ]; then
    for d in /sys/kernel/config/nullb/*; do
        [ -d "$d" ] && rmdir "$d"
    done
fi
numcpus=$(grep -c ^processor /proc/cpuinfo)
modprobe -r null_blk
[ -e /sys/module/null_blk ] && exit $?
modprobe null_blk nr_devices=0 &&
    udevadm settle &&
    cd /sys/kernel/config/nullb &&
    mkdir nullb0 &&
    cd nullb0 &&
    echo 0 > completion_nsec &&
    echo 512 > blocksize &&
    echo 0 > home_node &&
    echo 0 > irqmode &&
    echo 1024 > size &&
    echo 0 > memory_backed &&
    echo 2 > queue_mode &&
    echo 1 > power ||
    exit $?
( cd /sys/block/nullb0/queue && echo 2 > rq_affinity ) || exit $?

iodepth=${1:-1}
runtime=30
args=()
if [ "$iodepth" = 1 ]; then
    args+=(--ioengine=psync)
else
    args+=(--ioengine=io_uring --iodepth_batch=$((iodepth/2)))
fi
args+=(--iodepth=$iodepth --name=nullb0 --filename=/dev/nullb0 \
    --rw=read --bs=4096 --loops=$((1<<20)) --direct=1 --numjobs=$numcpus \
    --thread --runtime=$runtime --invalidate=1 --gtod_reduce=1 \
    --group_reporting=1 --ioscheduler=none)
if numactl -m 0 -N 0 echo >&/dev/null; then
    numactl -m 0 -N 0 -- fio "${args[@]}"
else
    fio "${args[@]}"
fi
On 2021/08/06 10:43, Bart Van Assche wrote:
> On 8/5/21 6:50 PM, yukuai (C) wrote:
>> After applying this configuration, the null_blk numbers on my machine are
>> about 650k (330k before). Is this still too low?
>
> That seems low to me. If I run the attached script on a six year old
> desktop with an eight core i7-4790 CPU it reports a little more than 5
> million IOPS. Has kernel debugging perhaps been enabled in the kernel on
> the test setup? Or is the system perhaps slowed down by security
> mitigations?

Hi, Bart

Sorry for the delay; I was too busy with other things recently.

After disabling all the kernel debugging options I could think of, the
numbers increase to millions.

setup cmd:
modprobe null_blk nr_devices=0 &&
    udevadm settle &&
    cd /sys/kernel/config/nullb &&
    mkdir nullb0 &&
    cd nullb0 &&
    echo 0 > completion_nsec &&
    echo 512 > blocksize &&
    echo 0 > home_node &&
    echo 0 > irqmode &&
    echo 1024 > size &&
    echo 0 > memory_backed &&
    echo 2 > queue_mode &&
    echo 1 > power ||
    exit $?

test cmd:
fio -filename=/dev/nullb0 -name=test -ioengine=io_uring -direct=1
    -numjobs=32 -iodepth=32 -bs=4k -rw=write -group_reporting -runtime=30
    --thread --gtod_reduce=1 --ioscheduler=none -time_based

test result:

| test round | with this patch | without this patch |
| ---------- | --------------- | ------------------ |
| 1          | 4310k           | 4265k              |
| 2          | 4295k           | 4327k              |
| 3          | 4217k           | 4213k              |
| 4          | 4355k           | 4236k              |
| 5          | 4315k           | 4337k              |
| average    | 4294k           | 4275k              |

Thanks,
Kuai
On 8/14/21 2:43 AM, yukuai (C) wrote:
> test result:
>
> | test round | with this patch | without this patch |
> | ---------- | --------------- | ------------------ |
> | 1          | 4310k           | 4265k              |
> | 2          | 4295k           | 4327k              |
> | 3          | 4217k           | 4213k              |
> | 4          | 4355k           | 4236k              |
> | 5          | 4315k           | 4337k              |
> | average    | 4294k           | 4275k              |

Hi Kuai,

Thank you for having taken the time to rerun the IOPS measurements with
kernel debugging disabled. According to my calculations the standard
deviation (50K) is larger than the difference between the averages (19K).
Unfortunately that makes it hard to draw any conclusion ...

Bart.
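For reference, the two five-run samples from the table above can be checked
with a few lines of Python (standard statistics module); the numbers below
are simply recomputed from the reported per-run IOPS.

from statistics import mean, stdev

# IOPS (in thousands) from the null_blk runs reported above.
with_patch    = [4310, 4295, 4217, 4355, 4315]
without_patch = [4265, 4327, 4213, 4236, 4337]

print(mean(with_patch), stdev(with_patch))        # ~4298, ~51
print(mean(without_patch), stdev(without_patch))  # ~4276, ~55
print(mean(with_patch) - mean(without_patch))     # ~23: smaller than either stdev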
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 4b66d2776eda..35f1f01d93ae 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -450,6 +450,8 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m,
         seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags);
         seq_printf(m, "active_queues=%d\n",
                    atomic_read(&tags->active_queues));
+        seq_printf(m, "pending_queues=%d\n",
+                   atomic_read(&tags->pending_queues));
 
         seq_puts(m, "\nbitmap_tags:\n");
         sbitmap_queue_show(tags->bitmap_tags, m);
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 86f87346232a..618624b359d6 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -40,6 +40,22 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
         return true;
 }
 
+void __blk_mq_dtag_busy(struct blk_mq_hw_ctx *hctx)
+{
+        if (blk_mq_is_sbitmap_shared(hctx->flags)) {
+                struct request_queue *q = hctx->queue;
+                struct blk_mq_tag_set *set = q->tag_set;
+
+                if (!test_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags) &&
+                    !test_and_set_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags))
+                        atomic_inc(&set->pending_queues_shared_sbitmap);
+        } else {
+                if (!test_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state) &&
+                    !test_and_set_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state))
+                        atomic_inc(&hctx->tags->pending_queues);
+        }
+}
+
 /*
  * Wakeup all potentially sleeping on tags
  */
@@ -74,6 +90,24 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
         blk_mq_tag_wakeup_all(tags, false);
 }
 
+void __blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx)
+{
+        struct blk_mq_tags *tags = hctx->tags;
+        struct request_queue *q = hctx->queue;
+        struct blk_mq_tag_set *set = q->tag_set;
+
+        if (blk_mq_is_sbitmap_shared(hctx->flags)) {
+                if (!test_and_clear_bit(QUEUE_FLAG_HCTX_WAIT,
+                                        &q->queue_flags))
+                        return;
+                atomic_dec(&set->pending_queues_shared_sbitmap);
+        } else {
+                if (!test_and_clear_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state))
+                        return;
+                atomic_dec(&tags->pending_queues);
+        }
+}
+
 static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
                             struct sbitmap_queue *bt)
 {
@@ -112,8 +146,12 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
         if (tag != BLK_MQ_NO_TAG)
                 goto found_tag;
 
-        if (data->flags & BLK_MQ_REQ_NOWAIT)
+        if (data->flags & BLK_MQ_REQ_NOWAIT) {
+                if (!data->q->elevator)
+                        blk_mq_dtag_busy(data->hctx);
+
                 return BLK_MQ_NO_TAG;
+        }
 
         ws = bt_wait_ptr(bt, data->hctx);
         do {
@@ -140,6 +178,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
                 if (tag != BLK_MQ_NO_TAG)
                         break;
 
+                if (!data->q->elevator)
+                        blk_mq_dtag_busy(data->hctx);
+
                 bt_prev = bt;
                 io_schedule();
 
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 8ed55af08427..badcf3693749 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -10,6 +10,11 @@ struct blk_mq_tags {
         unsigned int nr_reserved_tags;
 
         atomic_t active_queues;
+        /*
+         * if multiple queues share a tag set, pending_queues record the
+         * number of queues that can't get driver tag.
+         */
+        atomic_t pending_queues;
 
         struct sbitmap_queue *bitmap_tags;
         struct sbitmap_queue *breserved_tags;
@@ -69,8 +74,10 @@ enum {
         BLK_MQ_TAG_MAX          = BLK_MQ_NO_TAG - 1,
 };
 
-extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
-extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
+extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx);
+extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx);
+extern void __blk_mq_dtag_busy(struct blk_mq_hw_ctx *hctx);
+extern void __blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx);
 
 static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
 {
@@ -88,6 +95,22 @@ static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
         __blk_mq_tag_idle(hctx);
 }
 
+static inline void blk_mq_dtag_busy(struct blk_mq_hw_ctx *hctx)
+{
+        if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
+                return;
+
+        __blk_mq_dtag_busy(hctx);
+}
+
+static inline void blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx)
+{
+        if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
+                return;
+
+        __blk_mq_dtag_idle(hctx);
+}
+
 static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
                                           unsigned int tag)
 {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2c4ac51e54eb..1bb52bd71da8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -991,8 +991,10 @@ static void blk_mq_timeout_work(struct work_struct *work)
                  */
                 queue_for_each_hw_ctx(q, hctx, i) {
                         /* the hctx may be unmapped, so check it here */
-                        if (blk_mq_hw_queue_mapped(hctx))
+                        if (blk_mq_hw_queue_mapped(hctx)) {
                                 blk_mq_tag_idle(hctx);
+                                blk_mq_dtag_idle(hctx);
+                        }
                 }
         }
         blk_queue_exit(q);
@@ -1097,8 +1099,10 @@ static bool __blk_mq_get_driver_tag(struct request *rq)
         }
 
         tag = __sbitmap_queue_get(bt);
-        if (tag == BLK_MQ_NO_TAG)
+        if (tag == BLK_MQ_NO_TAG) {
+                blk_mq_dtag_busy(rq->mq_hctx);
                 return false;
+        }
 
         rq->tag = tag + tag_offset;
         return true;
@@ -2676,8 +2680,10 @@ static void blk_mq_exit_hctx(struct request_queue *q,
 {
         struct request *flush_rq = hctx->fq->flush_rq;
 
-        if (blk_mq_hw_queue_mapped(hctx))
+        if (blk_mq_hw_queue_mapped(hctx)) {
                 blk_mq_tag_idle(hctx);
+                blk_mq_dtag_idle(hctx);
+        }
 
         blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
                         set->queue_depth, flush_rq);
@@ -3536,6 +3542,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 
         if (blk_mq_is_sbitmap_shared(set->flags)) {
                 atomic_set(&set->active_queues_shared_sbitmap, 0);
+                atomic_set(&set->pending_queues_shared_sbitmap, 0);
 
                 if (blk_mq_init_shared_sbitmap(set)) {
                         ret = -ENOMEM;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index d08779f77a26..9e646ade81a8 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -337,10 +337,18 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
 
                 if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
                         return true;
+
+                if (!atomic_read(&set->pending_queues_shared_sbitmap))
+                        return true;
+
                 users = atomic_read(&set->active_queues_shared_sbitmap);
         } else {
                 if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
                         return true;
+
+                if (!atomic_read(&hctx->tags->pending_queues))
+                        return true;
+
                 users = atomic_read(&hctx->tags->active_queues);
         }
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 1d18447ebebc..3bc0faf0e2cf 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -256,6 +256,7 @@ struct blk_mq_tag_set {
         unsigned int            flags;
         void                    *driver_data;
         atomic_t                active_queues_shared_sbitmap;
+        atomic_t                pending_queues_shared_sbitmap;
 
         struct sbitmap_queue    __bitmap_tags;
         struct sbitmap_queue    __breserved_tags;
@@ -415,6 +416,9 @@ enum {
         /* hw queue is inactive after all its CPUs become offline */
         BLK_MQ_S_INACTIVE       = 3,
 
+        /* hw queue is waiting for driver tag */
+        BLK_MQ_S_DTAG_WAIT      = 1,
+
         BLK_MQ_MAX_DEPTH        = 10240,
 
         BLK_MQ_CPU_WORK_BATCH   = 8,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3177181c4326..55e0965c9c3c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -603,6 +603,7 @@ struct request_queue {
 #define QUEUE_FLAG_RQ_ALLOC_TIME 27     /* record rq->alloc_time_ns */
 #define QUEUE_FLAG_HCTX_ACTIVE  28      /* at least one blk-mq hctx is active */
 #define QUEUE_FLAG_NOWAIT       29      /* device supports NOWAIT */
+#define QUEUE_FLAG_HCTX_WAIT    30      /* at least one blk-mq hctx can't get driver tag */
 
 #define QUEUE_FLAG_MQ_DEFAULT   ((1 << QUEUE_FLAG_IO_STAT) |           \
                                  (1 << QUEUE_FLAG_SAME_COMP) |         \
If there are multiple active queues while sharing a tag set, it's not
necessary to limit the available tags as same share for each active queue
if no one ever failed to get driver tag. And fall back to same share if
someone do failed to get driver tag.

This modification will be beneficial if total queue_depth of disks
on the same host is less than total tags.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-mq-debugfs.c |  2 ++
 block/blk-mq-tag.c     | 43 +++++++++++++++++++++++++++++++++++++++++-
 block/blk-mq-tag.h     | 27 ++++++++++++++++++++++++--
 block/blk-mq.c         | 13 ++++++++++---
 block/blk-mq.h         |  8 ++++++++
 include/linux/blk-mq.h |  4 ++++
 include/linux/blkdev.h |  1 +
 7 files changed, 92 insertions(+), 6 deletions(-)
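A note on observing the new counter: the blk-mq-debugfs.c hunk in the patch
above exports pending_queues next to active_queues in the per-hctx "tags"
debugfs attribute. Assuming CONFIG_BLK_DEBUG_FS is enabled, debugfs is
mounted at /sys/kernel/debug, and nullb0 is used only as an example device
name, it could be read with a small sketch like the following.

# Hypothetical helper: dump the per-hctx tag state, which with this
# patch includes the pending_queues counter. The paths below are
# assumptions about the test setup, not part of the patch itself.
from pathlib import Path

for tags in sorted(Path("/sys/kernel/debug/block/nullb0").glob("hctx*/tags")):
    print(f"== {tags} ==")
    print(tags.read_text())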