| Field | Value |
|---|---|
| Message ID | 20230407235822.1672286-7-bvanassche@acm.org (mailing list archive) |
| State | New, archived |
| Series | Submit zoned writes in order |
On 4/8/23 08:58, Bart Van Assche wrote:
> If a queue is run before all requeued requests have been sent to the I/O
> scheduler, the I/O scheduler may dispatch the wrong request. Fix this by
> making __blk_mq_run_hw_queue() process the requeue_list instead of
> blk_mq_requeue_work().

I think the part of patch 5 that moves the requeue handling to the hctx
should be folded into this patch; that would make the review easier. I am
guessing that the move to per-hctx requeue lists is intended to reduce lock
contention? That is not clearly explained. Given that requeue events should
be infrequent exceptions, is that really necessary?
I think this should be merged with the previous patch.

>  void blk_mq_kick_requeue_list(struct request_queue *q)
>  {
> +	blk_mq_run_hw_queues(q, true);
>  }
>  EXPORT_SYMBOL(blk_mq_kick_requeue_list);

Please just remove blk_mq_kick_requeue_list entirely.

> 
> 
>  void blk_mq_delay_kick_requeue_list(struct request_queue *q,
>  				    unsigned long msecs)
>  {
> +	blk_mq_delay_run_hw_queues(q, msecs);
>  }
>  EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);

Same for blk_mq_delay_kick_requeue_list.

>  	if (!sq_hctx || sq_hctx == hctx ||
> -	    !list_empty_careful(&hctx->dispatch))
> +	    blk_mq_hctx_has_pending(hctx))
>  		blk_mq_run_hw_queue(hctx, async);
>  	}
>  }
> @@ -2353,7 +2341,7 @@ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
>  	 * scheduler.
>  	 */
>  	if (!sq_hctx || sq_hctx == hctx ||
> -	    !list_empty_careful(&hctx->dispatch))
> +	    blk_mq_hctx_has_pending(hctx))

This check would probably benefit from being factored into a helper.
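For illustration, a helper along the lines Christoph suggests could look like the sketch below. The name blk_mq_hw_queue_need_run() is a hypothetical placeholder, not part of the posted series; it merely combines the existing sq_hctx test with the blk_mq_hctx_has_pending() check introduced by this patch.

/*
 * Hypothetical helper, not part of the posted patch: decide whether
 * blk_mq_run_hw_queues() / blk_mq_delay_run_hw_queues() should run this
 * hctx. sq_hctx is the hctx selected for a single-queue scheduler, or
 * NULL if every hctx may dispatch.
 */
static bool blk_mq_hw_queue_need_run(struct blk_mq_hw_ctx *hctx,
				     struct blk_mq_hw_ctx *sq_hctx)
{
	return !sq_hctx || sq_hctx == hctx || blk_mq_hctx_has_pending(hctx);
}

Both queue_for_each_hw_ctx() loops could then call blk_mq_run_hw_queue() or blk_mq_delay_run_hw_queue() under this single condition.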
diff --git a/block/blk-mq.c b/block/blk-mq.c
index deb3d08a6b26..562868dff43f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -64,6 +64,7 @@ static inline blk_qc_t blk_rq_to_qc(struct request *rq)
 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
 {
 	return !list_empty_careful(&hctx->dispatch) ||
+		!list_empty_careful(&hctx->requeue_list) ||
 		sbitmap_any_bit_set(&hctx->ctx_map) ||
 			blk_mq_sched_has_work(hctx);
 }
@@ -1409,10 +1410,8 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
 }
 EXPORT_SYMBOL(blk_mq_requeue_request);
 
-static void blk_mq_requeue_work(struct work_struct *work)
+static void blk_mq_process_requeue_list(struct blk_mq_hw_ctx *hctx)
 {
-	struct blk_mq_hw_ctx *hctx =
-		container_of(work, struct blk_mq_hw_ctx, requeue_work.work);
 	LIST_HEAD(rq_list);
 	struct request *rq, *next;
 
@@ -1437,8 +1436,6 @@ static void blk_mq_requeue_work(struct work_struct *work)
 		list_del_init(&rq->queuelist);
 		blk_mq_sched_insert_request(rq, false, false, false);
 	}
-
-	blk_mq_run_hw_queue(hctx, false);
 }
 
 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
@@ -1464,30 +1461,19 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
 	spin_unlock_irqrestore(&hctx->requeue_lock, flags);
 
 	if (kick_requeue_list)
-		blk_mq_kick_requeue_list(rq->q);
+		blk_mq_run_hw_queue(hctx, /*async=*/true);
 }
 
 void blk_mq_kick_requeue_list(struct request_queue *q)
 {
-	struct blk_mq_hw_ctx *hctx;
-	unsigned long i;
-
-	queue_for_each_hw_ctx(q, hctx, i)
-		kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND,
-					    &hctx->requeue_work, 0);
+	blk_mq_run_hw_queues(q, true);
 }
 EXPORT_SYMBOL(blk_mq_kick_requeue_list);
 
 void blk_mq_delay_kick_requeue_list(struct request_queue *q,
 				    unsigned long msecs)
 {
-	struct blk_mq_hw_ctx *hctx;
-	unsigned long i;
-
-	queue_for_each_hw_ctx(q, hctx, i)
-		kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND,
-					    &hctx->requeue_work,
-					    msecs_to_jiffies(msecs));
+	blk_mq_delay_run_hw_queues(q, msecs);
 }
 EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
 
@@ -2146,6 +2132,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 	 */
 	WARN_ON_ONCE(in_interrupt());
 
+	blk_mq_process_requeue_list(hctx);
+
 	blk_mq_run_dispatch_ops(hctx->queue,
 			blk_mq_sched_dispatch_requests(hctx));
 }
@@ -2317,7 +2305,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
 		 * scheduler.
 		 */
 		if (!sq_hctx || sq_hctx == hctx ||
-		    !list_empty_careful(&hctx->dispatch))
+		    blk_mq_hctx_has_pending(hctx))
 			blk_mq_run_hw_queue(hctx, async);
 	}
 }
@@ -2353,7 +2341,7 @@ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
 		 * scheduler.
 		 */
 		if (!sq_hctx || sq_hctx == hctx ||
-		    !list_empty_careful(&hctx->dispatch))
+		    blk_mq_hctx_has_pending(hctx))
 			blk_mq_delay_run_hw_queue(hctx, msecs);
 	}
 }
@@ -3608,7 +3596,6 @@ static int blk_mq_init_hctx(struct request_queue *q,
 		struct blk_mq_tag_set *set,
 		struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
 {
-	INIT_DELAYED_WORK(&hctx->requeue_work, blk_mq_requeue_work);
 	INIT_LIST_HEAD(&hctx->requeue_list);
 	spin_lock_init(&hctx->requeue_lock);
 
@@ -4771,10 +4758,8 @@ void blk_mq_cancel_work_sync(struct request_queue *q)
 	struct blk_mq_hw_ctx *hctx;
 	unsigned long i;
 
-	queue_for_each_hw_ctx(q, hctx, i) {
-		cancel_delayed_work_sync(&hctx->requeue_work);
+	queue_for_each_hw_ctx(q, hctx, i)
 		cancel_delayed_work_sync(&hctx->run_work);
-	}
 }
 
 static int __init blk_mq_init(void)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 0157f1569980..e62feb17af96 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -313,7 +313,6 @@ struct blk_mq_hw_ctx {
 
 	struct list_head requeue_list;
 	spinlock_t requeue_lock;
-	struct delayed_work requeue_work;
 
 	/**
 	 * @run_work: Used for scheduling a hardware queue run at a later time.
If a queue is run before all requeued requests have been sent to the I/O
scheduler, the I/O scheduler may dispatch the wrong request. Fix this by
making __blk_mq_run_hw_queue() process the requeue_list instead of
blk_mq_requeue_work().

Cc: Christoph Hellwig <hch@lst.de>
Cc: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/blk-mq.c         | 35 ++++++++++-------------------------
 include/linux/blk-mq.h |  1 -
 2 files changed, 10 insertions(+), 26 deletions(-)
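As an illustration of the intended behavior (a sketch, not part of the patch; example_requeue_zoned_write() is a made-up driver-side function), requeueing a zoned write now guarantees that the request reaches the I/O scheduler before the next dispatch decision:

/*
 * Hypothetical driver-side example, for illustration only: requeue a
 * zoned write and let the asynchronous queue run pick it up. With this
 * patch the run path is roughly:
 *
 *	__blk_mq_run_hw_queue(hctx)
 *		blk_mq_process_requeue_list(hctx);	<- drains hctx->requeue_list
 *							   into the I/O scheduler
 *		blk_mq_run_dispatch_ops(hctx->queue,
 *				blk_mq_sched_dispatch_requests(hctx));
 *
 * so the scheduler sees the requeued write before it decides what to
 * dispatch next.
 */
static void example_requeue_zoned_write(struct request *rq)
{
	/* Adds rq to its hctx's requeue_list and kicks an asynchronous run. */
	blk_mq_requeue_request(rq, true);
}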