
[v4,5/7] block: Preserve the order of requeued requests

Message ID: 20230621201237.796902-6-bvanassche@acm.org (mailing list archive)
State: New, archived
Series: Submit zoned writes in order

Commit Message

Bart Van Assche June 21, 2023, 8:12 p.m. UTC
If a queue is run before all requeued requests have been sent to the I/O
scheduler, the I/O scheduler may dispatch the wrong request. Fix this by
making blk_mq_run_hw_queue() process the requeue_list instead of
blk_mq_requeue_work().

Cc: Christoph Hellwig <hch@lst.de>
Cc: Damien Le Moal <dlemoal@kernel.org>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/blk-mq.c         | 31 +++++++++----------------------
 include/linux/blk-mq.h |  1 -
 2 files changed, 9 insertions(+), 23 deletions(-)
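
In sketch form, the run path after this patch drains the per-hctx requeue and
flush lists into the I/O scheduler before dispatching. This is a simplified
excerpt of the change; the complete diff is at the bottom of this page:

void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
	/* ... blocked/quiesced and async handling elided ... */

	/* Feed requeued requests to the I/O scheduler first ... */
	blk_mq_process_requeue_list(hctx);
	/* ... so the scheduler sees them before it picks what to dispatch. */
	blk_mq_run_dispatch_ops(hctx->queue,
				blk_mq_sched_dispatch_requests(hctx));
}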

Comments

Christoph Hellwig June 23, 2023, 5:50 a.m. UTC | #1
> +	blk_mq_process_requeue_list(hctx);

Should this do list_empty_careful checks on ->requeue_list and
->flush_list so that we can avoid taking the requeue lock when
these conditions aren't met before calling into
blk_mq_process_requeue_list?
Bart Van Assche June 23, 2023, 8:08 p.m. UTC | #2
On 6/22/23 22:50, Christoph Hellwig wrote:
>> +	blk_mq_process_requeue_list(hctx);
> 
> Should this do list_empty_careful checks on ->requeue_list and
> ->flush_list so that we can avoid taking the requeue lock when
> these conditions aren't met before calling into
> blk_mq_process_requeue_list?

Hi Christoph,

I agree that checks for whether requeue_list and flush_list are empty should
be added.

Does it matter in this context whether list_empty_careful() or 
list_empty() is used? If blk_mq_process_requeue_list() is called 
concurrently with code that adds an element to one of these two lists it 
is guaranteed that the queue will be run again. This is why I think that 
it is fine that the list checks in blk_mq_process_requeue_list() race 
with concurrent list additions.

Thanks,

Bart.
Christoph Hellwig June 26, 2023, 8:01 a.m. UTC | #3
On Fri, Jun 23, 2023 at 01:08:33PM -0700, Bart Van Assche wrote:
> Does it matter in this context whether list_empty_careful() or list_empty() 
> is used? If blk_mq_process_requeue_list() is called concurrently with code 
> that adds an element to one of these two lists it is guaranteed that the 
> queue will be run again. This is why I think that it is fine that the list 
> checks in blk_mq_process_requeue_list() race with concurrent list 
> additions.

As far as I can tell we're not holding the relevant lock, so I think we
need list_empty_careful.  Or am I missing something?
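
A sketch of the guard under discussion (not part of the posted patch; the
splice under requeue_lock follows the shape of the existing requeue-work code
and is shown here only as an assumption):

static void blk_mq_process_requeue_list(struct blk_mq_hw_ctx *hctx)
{
	LIST_HEAD(requeue_list);
	LIST_HEAD(flush_list);

	/*
	 * No lock is held here, hence list_empty_careful(). The check may
	 * race with a concurrent list addition, but code that adds to these
	 * lists also runs the queue again, so a request observed as "not
	 * there yet" is not lost.
	 */
	if (list_empty_careful(&hctx->requeue_list) &&
	    list_empty_careful(&hctx->flush_list))
		return;

	spin_lock_irq(&hctx->requeue_lock);
	list_splice_init(&hctx->requeue_list, &requeue_list);
	list_splice_init(&hctx->flush_list, &flush_list);
	spin_unlock_irq(&hctx->requeue_lock);

	/* ... re-insert the spliced requests, as in the patch below ... */
}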
Ming Lei June 26, 2023, 8:16 a.m. UTC | #4
On Wed, Jun 21, 2023 at 01:12:32PM -0700, Bart Van Assche wrote:
> If a queue is run before all requeued requests have been sent to the I/O
> scheduler, the I/O scheduler may dispatch the wrong request. Fix this by

Can you explain the issue in more detail? What is the wrong request? How does
it get dispatched?

If you mean the write order of requeued write requests, there isn't such an
issue, given that a new write request for the same zone can't be issued before
the old requeued one has completed.

Thanks, 
Ming

Patch

diff --git a/block/blk-mq.c b/block/blk-mq.c
index c359a28d9b25..de39984d17c4 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -68,6 +68,8 @@  static inline blk_qc_t blk_rq_to_qc(struct request *rq)
 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
 {
 	return !list_empty_careful(&hctx->dispatch) ||
+		!list_empty_careful(&hctx->requeue_list) ||
+		!list_empty_careful(&hctx->flush_list) ||
 		sbitmap_any_bit_set(&hctx->ctx_map) ||
 			blk_mq_sched_has_work(hctx);
 }
@@ -1438,10 +1440,8 @@  void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
 }
 EXPORT_SYMBOL(blk_mq_requeue_request);
 
-static void blk_mq_requeue_work(struct work_struct *work)
+static void blk_mq_process_requeue_list(struct blk_mq_hw_ctx *hctx)
 {
-	struct blk_mq_hw_ctx *hctx =
-		container_of(work, struct blk_mq_hw_ctx, requeue_work.work);
 	LIST_HEAD(requeue_list);
 	LIST_HEAD(flush_list);
 	struct request *rq;
@@ -1471,31 +1471,18 @@  static void blk_mq_requeue_work(struct work_struct *work)
 		list_del_init(&rq->queuelist);
 		blk_mq_insert_request(rq, 0);
 	}
-
-	blk_mq_run_hw_queue(hctx, false);
 }
 
 void blk_mq_kick_requeue_list(struct request_queue *q)
 {
-	struct blk_mq_hw_ctx *hctx;
-	unsigned long i;
-
-	queue_for_each_hw_ctx(q, hctx, i)
-		kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND,
-					    &hctx->requeue_work, 0);
+	blk_mq_run_hw_queues(q, true);
 }
 EXPORT_SYMBOL(blk_mq_kick_requeue_list);
 
 void blk_mq_delay_kick_requeue_list(struct request_queue *q,
 				    unsigned long msecs)
 {
-	struct blk_mq_hw_ctx *hctx;
-	unsigned long i;
-
-	queue_for_each_hw_ctx(q, hctx, i)
-		kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND,
-					    &hctx->requeue_work,
-					    msecs_to_jiffies(msecs));
+	blk_mq_delay_run_hw_queues(q, msecs);
 }
 EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
 
@@ -2248,6 +2235,7 @@  void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 		return;
 	}
 
+	blk_mq_process_requeue_list(hctx);
 	blk_mq_run_dispatch_ops(hctx->queue,
 				blk_mq_sched_dispatch_requests(hctx));
 }
@@ -2296,7 +2284,7 @@  void blk_mq_run_hw_queues(struct request_queue *q, bool async)
 		 * scheduler.
 		 */
 		if (!sq_hctx || sq_hctx == hctx ||
-		    !list_empty_careful(&hctx->dispatch))
+		    blk_mq_hctx_has_pending(hctx))
 			blk_mq_run_hw_queue(hctx, async);
 	}
 }
@@ -2332,7 +2320,7 @@  void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
 		 * scheduler.
 		 */
 		if (!sq_hctx || sq_hctx == hctx ||
-		    !list_empty_careful(&hctx->dispatch))
+		    blk_mq_hctx_has_pending(hctx))
 			blk_mq_delay_run_hw_queue(hctx, msecs);
 	}
 }
@@ -2417,6 +2405,7 @@  static void blk_mq_run_work_fn(struct work_struct *work)
 	struct blk_mq_hw_ctx *hctx =
 		container_of(work, struct blk_mq_hw_ctx, run_work.work);
 
+	blk_mq_process_requeue_list(hctx);
 	blk_mq_run_dispatch_ops(hctx->queue,
 				blk_mq_sched_dispatch_requests(hctx));
 }
@@ -3625,7 +3614,6 @@  static int blk_mq_init_hctx(struct request_queue *q,
 		struct blk_mq_tag_set *set,
 		struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
 {
-	INIT_DELAYED_WORK(&hctx->requeue_work, blk_mq_requeue_work);
 	INIT_LIST_HEAD(&hctx->flush_list);
 	INIT_LIST_HEAD(&hctx->requeue_list);
 	spin_lock_init(&hctx->requeue_lock);
@@ -4794,7 +4782,6 @@  void blk_mq_cancel_work_sync(struct request_queue *q)
 	unsigned long i;
 
 	queue_for_each_hw_ctx(q, hctx, i) {
-		cancel_delayed_work_sync(&hctx->requeue_work);
 		cancel_delayed_work_sync(&hctx->run_work);
 	}
 }
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 672e8880f9e2..b919de53dc28 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -312,7 +312,6 @@  struct blk_mq_hw_ctx {
 
 	struct list_head	requeue_list;
 	spinlock_t		requeue_lock;
-	struct delayed_work	requeue_work;
 
 	/**
 	 * @run_work: Used for scheduling a hardware queue run at a later time.