
[v3,5/7] block: Preserve the order of requeued requests

Message ID 20230522183845.354920-6-bvanassche@acm.org (mailing list archive)
State New, archived
Series Submit zoned writes in order

Commit Message

Bart Van Assche May 22, 2023, 6:38 p.m. UTC
If a queue is run before all requeued requests have been sent to the I/O
scheduler, the I/O scheduler may dispatch the wrong request. Fix this by
making blk_mq_run_hw_queue() process the requeue_list instead of
blk_mq_requeue_work().
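
Below is a minimal user-space sketch of the ordering property at stake; it is not part of this patch and not kernel code. The helper names and the integer "writes" are made-up stand-ins for blk_mq_run_hw_queue(), the I/O scheduler dispatch path and zoned writes that must be dispatched in submission order.

/*
 * Toy model only: "requeue_list" and "sched_list" stand in for
 * q->requeue_list and the I/O scheduler's internal list; the integers
 * stand in for zoned writes that must be dispatched in order (1 before 2).
 */
#include <stdio.h>

#define MAX_REQ 8

static int requeue_list[MAX_REQ], nrequeue;	/* requeued, not yet in the scheduler */
static int sched_list[MAX_REQ], nsched;		/* owned by the I/O scheduler */

/* Old flow: running the queue dispatches only what the scheduler already
 * holds; requeued requests wait for a separate requeue work item. */
static void dispatch_from_scheduler(void)
{
	for (int i = 0; i < nsched; i++)
		printf("dispatch write %d\n", sched_list[i]);
	nsched = 0;
}

/* New flow: drain the requeue list into the scheduler (at the head, like
 * BLK_MQ_INSERT_AT_HEAD for prepared requests) before dispatching, so a
 * queue run can no longer overtake an earlier, requeued write. */
static void run_hw_queue(void)
{
	while (nrequeue > 0) {
		for (int i = nsched; i > 0; i--)
			sched_list[i] = sched_list[i - 1];
		sched_list[0] = requeue_list[--nrequeue];
		nsched++;
	}
	dispatch_from_scheduler();
}

int main(void)
{
	requeue_list[nrequeue++] = 1;	/* write 1 failed and was requeued */
	sched_list[nsched++] = 2;	/* write 2 already reached the scheduler */

	/* The old flow could call dispatch_from_scheduler() at this point and
	 * emit write 2 first; the new flow prints write 1, then write 2. */
	run_hw_queue();
	return 0;
}

In the old flow nothing forces the requeue list to be drained before a dispatch, so the later write can be emitted first; in the new flow the drain happens in the same step as the dispatch, which is what the hunks below implement inside blk_mq_run_hw_queue() and blk_mq_run_work_fn().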

Cc: Christoph Hellwig <hch@lst.de>
Cc: Damien Le Moal <dlemoal@kernel.org>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/blk-mq.c         | 63 +++++++++++++++++++++---------------------
 include/linux/blkdev.h |  1 -
 2 files changed, 32 insertions(+), 32 deletions(-)

Patch

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9ef6fa5d7471..52dffdc70480 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -68,6 +68,8 @@  static inline blk_qc_t blk_rq_to_qc(struct request *rq)
 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
 {
 	return !list_empty_careful(&hctx->dispatch) ||
+		!list_empty_careful(&hctx->queue->requeue_list) ||
+		!list_empty_careful(&hctx->queue->flush_list) ||
 		sbitmap_any_bit_set(&hctx->ctx_map) ||
 			blk_mq_sched_has_work(hctx);
 }
@@ -1432,52 +1434,52 @@  void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
 }
 EXPORT_SYMBOL(blk_mq_requeue_request);
 
-static void blk_mq_requeue_work(struct work_struct *work)
+static void blk_mq_process_requeue_list(struct blk_mq_hw_ctx *hctx)
 {
-	struct request_queue *q =
-		container_of(work, struct request_queue, requeue_work.work);
-	LIST_HEAD(requeue_list);
-	LIST_HEAD(flush_list);
+	struct request_queue *q = hctx->queue;
 	struct request *rq, *next;
+	LIST_HEAD(at_head);
+	LIST_HEAD(at_tail);
 
-	spin_lock_irq(&q->requeue_lock);
-	list_splice_init(&q->requeue_list, &requeue_list);
-	list_splice_init(&q->flush_list, &flush_list);
-	spin_unlock_irq(&q->requeue_lock);
+	if (list_empty_careful(&q->requeue_list) &&
+	    list_empty_careful(&q->flush_list))
+		return;
 
-	list_for_each_entry_safe(rq, next, &requeue_list, queuelist) {
-		if (!(rq->rq_flags & RQF_DONTPREP)) {
+	spin_lock_irq(&q->requeue_lock);
+	list_for_each_entry_safe(rq, next, &q->requeue_list, queuelist) {
+		if (!blk_queue_sq_sched(q) && rq->mq_hctx != hctx)
+			continue;
+		if (rq->rq_flags & RQF_DONTPREP) {
+			list_move_tail(&rq->queuelist, &at_tail);
+		} else {
 			list_del_init(&rq->queuelist);
-			blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD);
+			list_move_tail(&rq->queuelist, &at_head);
 		}
 	}
-
-	while (!list_empty(&requeue_list)) {
-		rq = list_entry(requeue_list.next, struct request, queuelist);
-		list_del_init(&rq->queuelist);
-		blk_mq_insert_request(rq, 0);
+	list_for_each_entry_safe(rq, next, &q->flush_list, queuelist) {
+		if (!blk_queue_sq_sched(q) && rq->mq_hctx != hctx)
+			continue;
+		list_move_tail(&rq->queuelist, &at_tail);
 	}
+	spin_unlock_irq(&q->requeue_lock);
 
-	while (!list_empty(&flush_list)) {
-		rq = list_entry(flush_list.next, struct request, queuelist);
-		list_del_init(&rq->queuelist);
-		blk_mq_insert_request(rq, 0);
-	}
+	list_for_each_entry_safe(rq, next, &at_head, queuelist)
+		blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD);
 
-	blk_mq_run_hw_queues(q, false);
+	list_for_each_entry_safe(rq, next, &at_tail, queuelist)
+		blk_mq_insert_request(rq, 0);
 }
 
 void blk_mq_kick_requeue_list(struct request_queue *q)
 {
-	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
+	blk_mq_run_hw_queues(q, true);
 }
 EXPORT_SYMBOL(blk_mq_kick_requeue_list);
 
 void blk_mq_delay_kick_requeue_list(struct request_queue *q,
 				    unsigned long msecs)
 {
-	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
-				    msecs_to_jiffies(msecs));
+	blk_mq_delay_run_hw_queues(q, msecs);
 }
 EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
 
@@ -2244,6 +2246,7 @@  void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 		return;
 	}
 
+	blk_mq_process_requeue_list(hctx);
 	blk_mq_run_dispatch_ops(hctx->queue,
 				blk_mq_sched_dispatch_requests(hctx));
 }
@@ -2292,7 +2295,7 @@  void blk_mq_run_hw_queues(struct request_queue *q, bool async)
 		 * scheduler.
 		 */
 		if (!sq_hctx || sq_hctx == hctx ||
-		    !list_empty_careful(&hctx->dispatch))
+		    blk_mq_hctx_has_pending(hctx))
 			blk_mq_run_hw_queue(hctx, async);
 	}
 }
@@ -2328,7 +2331,7 @@  void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
 		 * scheduler.
 		 */
 		if (!sq_hctx || sq_hctx == hctx ||
-		    !list_empty_careful(&hctx->dispatch))
+		    blk_mq_hctx_has_pending(hctx))
 			blk_mq_delay_run_hw_queue(hctx, msecs);
 	}
 }
@@ -2413,6 +2416,7 @@  static void blk_mq_run_work_fn(struct work_struct *work)
 	struct blk_mq_hw_ctx *hctx =
 		container_of(work, struct blk_mq_hw_ctx, run_work.work);
 
+	blk_mq_process_requeue_list(hctx);
 	blk_mq_run_dispatch_ops(hctx->queue,
 				blk_mq_sched_dispatch_requests(hctx));
 }
@@ -4237,7 +4241,6 @@  int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
 	blk_mq_update_poll_flag(q);
 
-	INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
 	INIT_LIST_HEAD(&q->flush_list);
 	INIT_LIST_HEAD(&q->requeue_list);
 	spin_lock_init(&q->requeue_lock);
@@ -4786,8 +4789,6 @@  void blk_mq_cancel_work_sync(struct request_queue *q)
 	struct blk_mq_hw_ctx *hctx;
 	unsigned long i;
 
-	cancel_delayed_work_sync(&q->requeue_work);
-
 	queue_for_each_hw_ctx(q, hctx, i)
 		cancel_delayed_work_sync(&hctx->run_work);
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index fe99948688df..f410cce7289b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -491,7 +491,6 @@  struct request_queue {
 
 	struct list_head	requeue_list;
 	spinlock_t		requeue_lock;
-	struct delayed_work	requeue_work;
 
 	struct mutex		sysfs_lock;
 	struct mutex		sysfs_dir_lock;