
[v2,06/12] block: Preserve the order of requeued requests

Message ID 20230407235822.1672286-7-bvanassche@acm.org (mailing list archive)
State New, archived
Series: Submit zoned writes in order

Commit Message

Bart Van Assche April 7, 2023, 11:58 p.m. UTC
If a queue is run before all requeued requests have been sent to the I/O
scheduler, the I/O scheduler may dispatch the wrong request. Fix this by
making __blk_mq_run_hw_queue() process the requeue_list instead of
blk_mq_requeue_work().

Cc: Christoph Hellwig <hch@lst.de>
Cc: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/blk-mq.c         | 35 ++++++++++-------------------------
 include/linux/blk-mq.h |  1 -
 2 files changed, 10 insertions(+), 26 deletions(-)

Comments

Damien Le Moal April 10, 2023, 8:01 a.m. UTC | #1
On 4/8/23 08:58, Bart Van Assche wrote:
> If a queue is run before all requeued requests have been sent to the I/O
> scheduler, the I/O scheduler may dispatch the wrong request. Fix this by
> making __blk_mq_run_hw_queue() process the requeue_list instead of
> blk_mq_requeue_work().

I think that the part of patch 5 that moves the requeue work to per-hctx should
go together with this patch. That would make the review easier.
I am guessing that the move to per-hctx is an attempt to reduce lock contention?
That is not clearly explained. Given that requeue events should be infrequent
exceptions, is that really necessary?

> 
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Damien Le Moal <damien.lemoal@opensource.wdc.com>
> Cc: Ming Lei <ming.lei@redhat.com>
> Cc: Mike Snitzer <snitzer@kernel.org>
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
>  block/blk-mq.c         | 35 ++++++++++-------------------------
>  include/linux/blk-mq.h |  1 -
>  2 files changed, 10 insertions(+), 26 deletions(-)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index deb3d08a6b26..562868dff43f 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -64,6 +64,7 @@ static inline blk_qc_t blk_rq_to_qc(struct request *rq)
>  static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
>  {
>  	return !list_empty_careful(&hctx->dispatch) ||
> +		!list_empty_careful(&hctx->requeue_list) ||
>  		sbitmap_any_bit_set(&hctx->ctx_map) ||
>  			blk_mq_sched_has_work(hctx);
>  }
> @@ -1409,10 +1410,8 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
>  }
>  EXPORT_SYMBOL(blk_mq_requeue_request);
>  
> -static void blk_mq_requeue_work(struct work_struct *work)
> +static void blk_mq_process_requeue_list(struct blk_mq_hw_ctx *hctx)
>  {
> -	struct blk_mq_hw_ctx *hctx =
> -		container_of(work, struct blk_mq_hw_ctx, requeue_work.work);
>  	LIST_HEAD(rq_list);
>  	struct request *rq, *next;
>  
> @@ -1437,8 +1436,6 @@ static void blk_mq_requeue_work(struct work_struct *work)
>  		list_del_init(&rq->queuelist);
>  		blk_mq_sched_insert_request(rq, false, false, false);
>  	}
> -
> -	blk_mq_run_hw_queue(hctx, false);
>  }
>  
>  void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
> @@ -1464,30 +1461,19 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
>  	spin_unlock_irqrestore(&hctx->requeue_lock, flags);
>  
>  	if (kick_requeue_list)
> -		blk_mq_kick_requeue_list(rq->q);
> +		blk_mq_run_hw_queue(hctx, /*async=*/true);
>  }
>  
>  void blk_mq_kick_requeue_list(struct request_queue *q)
>  {
> -	struct blk_mq_hw_ctx *hctx;
> -	unsigned long i;
> -
> -	queue_for_each_hw_ctx(q, hctx, i)
> -		kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND,
> -					    &hctx->requeue_work, 0);
> +	blk_mq_run_hw_queues(q, true);
>  }
>  EXPORT_SYMBOL(blk_mq_kick_requeue_list);
>  
>  void blk_mq_delay_kick_requeue_list(struct request_queue *q,
>  				    unsigned long msecs)
>  {
> -	struct blk_mq_hw_ctx *hctx;
> -	unsigned long i;
> -
> -	queue_for_each_hw_ctx(q, hctx, i)
> -		kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND,
> -					    &hctx->requeue_work,
> -					    msecs_to_jiffies(msecs));
> +	blk_mq_delay_run_hw_queues(q, msecs);
>  }
>  EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
>  
> @@ -2146,6 +2132,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
>  	 */
>  	WARN_ON_ONCE(in_interrupt());
>  
> +	blk_mq_process_requeue_list(hctx);
> +
>  	blk_mq_run_dispatch_ops(hctx->queue,
>  			blk_mq_sched_dispatch_requests(hctx));
>  }
> @@ -2317,7 +2305,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
>  		 * scheduler.
>  		 */
>  		if (!sq_hctx || sq_hctx == hctx ||
> -		    !list_empty_careful(&hctx->dispatch))
> +		    blk_mq_hctx_has_pending(hctx))
>  			blk_mq_run_hw_queue(hctx, async);
>  	}
>  }
> @@ -2353,7 +2341,7 @@ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
>  		 * scheduler.
>  		 */
>  		if (!sq_hctx || sq_hctx == hctx ||
> -		    !list_empty_careful(&hctx->dispatch))
> +		    blk_mq_hctx_has_pending(hctx))
>  			blk_mq_delay_run_hw_queue(hctx, msecs);
>  	}
>  }
> @@ -3608,7 +3596,6 @@ static int blk_mq_init_hctx(struct request_queue *q,
>  		struct blk_mq_tag_set *set,
>  		struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
>  {
> -	INIT_DELAYED_WORK(&hctx->requeue_work, blk_mq_requeue_work);
>  	INIT_LIST_HEAD(&hctx->requeue_list);
>  	spin_lock_init(&hctx->requeue_lock);
>  
> @@ -4771,10 +4758,8 @@ void blk_mq_cancel_work_sync(struct request_queue *q)
>  	struct blk_mq_hw_ctx *hctx;
>  	unsigned long i;
>  
> -	queue_for_each_hw_ctx(q, hctx, i) {
> -		cancel_delayed_work_sync(&hctx->requeue_work);
> +	queue_for_each_hw_ctx(q, hctx, i)
>  		cancel_delayed_work_sync(&hctx->run_work);
> -	}
>  }
>  
>  static int __init blk_mq_init(void)
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 0157f1569980..e62feb17af96 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -313,7 +313,6 @@ struct blk_mq_hw_ctx {
>  
>  	struct list_head	requeue_list;
>  	spinlock_t		requeue_lock;
> -	struct delayed_work	requeue_work;
>  
>  	/**
>  	 * @run_work: Used for scheduling a hardware queue run at a later time.
Christoph Hellwig April 11, 2023, 12:43 p.m. UTC | #2
I think this should be merged with the previous patch.

>  void blk_mq_kick_requeue_list(struct request_queue *q)
>  {
> +	blk_mq_run_hw_queues(q, true);
>  }
>  EXPORT_SYMBOL(blk_mq_kick_requeue_list);

Please just remove blk_mq_kick_requeue_list entirely.

>
>  
>  void blk_mq_delay_kick_requeue_list(struct request_queue *q,
>  				    unsigned long msecs)
>  {
> +	blk_mq_delay_run_hw_queues(q, msecs);
>  }
>  EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);

Same for blk_mq_delay_kick_requeue_list.
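
For illustration, a caller conversion under that suggestion might look like
this (the call sites shown are hypothetical, not taken from this series):

-	blk_mq_kick_requeue_list(q);
+	blk_mq_run_hw_queues(q, true);

-	blk_mq_delay_kick_requeue_list(q, msecs);
+	blk_mq_delay_run_hw_queues(q, msecs);

With this patch both wrappers reduce to direct calls of the run-queue helpers,
so removing them would be a mechanical change at the call sites.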

>  		if (!sq_hctx || sq_hctx == hctx ||
> -		    !list_empty_careful(&hctx->dispatch))
> +		    blk_mq_hctx_has_pending(hctx))
>  			blk_mq_run_hw_queue(hctx, async);
>  	}
>  }
> @@ -2353,7 +2341,7 @@ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
>  		 * scheduler.
>  		 */
>  		if (!sq_hctx || sq_hctx == hctx ||
> -		    !list_empty_careful(&hctx->dispatch))
> +		    blk_mq_hctx_has_pending(hctx))

This check would probably benefit from being factored into a helper.
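
A rough sketch of such a helper, with an assumed name and placement in
block/blk-mq.c (not part of this series):

static bool blk_mq_hw_queue_need_run(struct blk_mq_hw_ctx *hctx,
				     struct blk_mq_hw_ctx *sq_hctx)
{
	/*
	 * Run this hctx if no single shared-tag hctx has been selected,
	 * if it is the selected one, or if it has pending work.
	 */
	return !sq_hctx || sq_hctx == hctx ||
	       blk_mq_hctx_has_pending(hctx);
}

Both blk_mq_run_hw_queues() and blk_mq_delay_run_hw_queues() could then call
this helper instead of open-coding the condition.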

Patch

diff --git a/block/blk-mq.c b/block/blk-mq.c
index deb3d08a6b26..562868dff43f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -64,6 +64,7 @@  static inline blk_qc_t blk_rq_to_qc(struct request *rq)
 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
 {
 	return !list_empty_careful(&hctx->dispatch) ||
+		!list_empty_careful(&hctx->requeue_list) ||
 		sbitmap_any_bit_set(&hctx->ctx_map) ||
 			blk_mq_sched_has_work(hctx);
 }
@@ -1409,10 +1410,8 @@  void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
 }
 EXPORT_SYMBOL(blk_mq_requeue_request);
 
-static void blk_mq_requeue_work(struct work_struct *work)
+static void blk_mq_process_requeue_list(struct blk_mq_hw_ctx *hctx)
 {
-	struct blk_mq_hw_ctx *hctx =
-		container_of(work, struct blk_mq_hw_ctx, requeue_work.work);
 	LIST_HEAD(rq_list);
 	struct request *rq, *next;
 
@@ -1437,8 +1436,6 @@  static void blk_mq_requeue_work(struct work_struct *work)
 		list_del_init(&rq->queuelist);
 		blk_mq_sched_insert_request(rq, false, false, false);
 	}
-
-	blk_mq_run_hw_queue(hctx, false);
 }
 
 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
@@ -1464,30 +1461,19 @@  void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
 	spin_unlock_irqrestore(&hctx->requeue_lock, flags);
 
 	if (kick_requeue_list)
-		blk_mq_kick_requeue_list(rq->q);
+		blk_mq_run_hw_queue(hctx, /*async=*/true);
 }
 
 void blk_mq_kick_requeue_list(struct request_queue *q)
 {
-	struct blk_mq_hw_ctx *hctx;
-	unsigned long i;
-
-	queue_for_each_hw_ctx(q, hctx, i)
-		kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND,
-					    &hctx->requeue_work, 0);
+	blk_mq_run_hw_queues(q, true);
 }
 EXPORT_SYMBOL(blk_mq_kick_requeue_list);
 
 void blk_mq_delay_kick_requeue_list(struct request_queue *q,
 				    unsigned long msecs)
 {
-	struct blk_mq_hw_ctx *hctx;
-	unsigned long i;
-
-	queue_for_each_hw_ctx(q, hctx, i)
-		kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND,
-					    &hctx->requeue_work,
-					    msecs_to_jiffies(msecs));
+	blk_mq_delay_run_hw_queues(q, msecs);
 }
 EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
 
@@ -2146,6 +2132,8 @@  static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 	 */
 	WARN_ON_ONCE(in_interrupt());
 
+	blk_mq_process_requeue_list(hctx);
+
 	blk_mq_run_dispatch_ops(hctx->queue,
 			blk_mq_sched_dispatch_requests(hctx));
 }
@@ -2317,7 +2305,7 @@  void blk_mq_run_hw_queues(struct request_queue *q, bool async)
 		 * scheduler.
 		 */
 		if (!sq_hctx || sq_hctx == hctx ||
-		    !list_empty_careful(&hctx->dispatch))
+		    blk_mq_hctx_has_pending(hctx))
 			blk_mq_run_hw_queue(hctx, async);
 	}
 }
@@ -2353,7 +2341,7 @@  void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
 		 * scheduler.
 		 */
 		if (!sq_hctx || sq_hctx == hctx ||
-		    !list_empty_careful(&hctx->dispatch))
+		    blk_mq_hctx_has_pending(hctx))
 			blk_mq_delay_run_hw_queue(hctx, msecs);
 	}
 }
@@ -3608,7 +3596,6 @@  static int blk_mq_init_hctx(struct request_queue *q,
 		struct blk_mq_tag_set *set,
 		struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
 {
-	INIT_DELAYED_WORK(&hctx->requeue_work, blk_mq_requeue_work);
 	INIT_LIST_HEAD(&hctx->requeue_list);
 	spin_lock_init(&hctx->requeue_lock);
 
@@ -4771,10 +4758,8 @@  void blk_mq_cancel_work_sync(struct request_queue *q)
 	struct blk_mq_hw_ctx *hctx;
 	unsigned long i;
 
-	queue_for_each_hw_ctx(q, hctx, i) {
-		cancel_delayed_work_sync(&hctx->requeue_work);
+	queue_for_each_hw_ctx(q, hctx, i)
 		cancel_delayed_work_sync(&hctx->run_work);
-	}
 }
 
 static int __init blk_mq_init(void)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 0157f1569980..e62feb17af96 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -313,7 +313,6 @@  struct blk_mq_hw_ctx {
 
 	struct list_head	requeue_list;
 	spinlock_t		requeue_lock;
-	struct delayed_work	requeue_work;
 
 	/**
 	 * @run_work: Used for scheduling a hardware queue run at a later time.