
[v2,03/12] block: Send requeued requests to the I/O scheduler

Message ID 20230407235822.1672286-4-bvanassche@acm.org (mailing list archive)
State New, archived
Series Submit zoned writes in order

Commit Message

Bart Van Assche April 7, 2023, 11:58 p.m. UTC
Let the I/O scheduler control which requests are dispatched.

Cc: Christoph Hellwig <hch@lst.de>
Cc: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/blk-mq.c         | 21 +++++++++------------
 include/linux/blk-mq.h |  5 +++--
 2 files changed, 12 insertions(+), 14 deletions(-)

Comments

Damien Le Moal April 10, 2023, 7:53 a.m. UTC | #1
On 4/8/23 08:58, Bart Van Assche wrote:
> Let the I/O scheduler control which requests are dispatched.

Well, that is the main function of the IO scheduler already. So could you
expand the commit message here to explain in more detail which case this is
changing?

> 
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Damien Le Moal <damien.lemoal@opensource.wdc.com>
> Cc: Ming Lei <ming.lei@redhat.com>
> Cc: Mike Snitzer <snitzer@kernel.org>
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
>  block/blk-mq.c         | 21 +++++++++------------
>  include/linux/blk-mq.h |  5 +++--
>  2 files changed, 12 insertions(+), 14 deletions(-)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 250556546bbf..57315395434b 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -1426,15 +1426,7 @@ static void blk_mq_requeue_work(struct work_struct *work)
>  
>  		rq->rq_flags &= ~RQF_SOFTBARRIER;
>  		list_del_init(&rq->queuelist);
> -		/*
> -		 * If RQF_DONTPREP, rq has contained some driver specific
> -		 * data, so insert it to hctx dispatch list to avoid any
> -		 * merge.
> -		 */
> -		if (rq->rq_flags & RQF_DONTPREP)
> -			blk_mq_request_bypass_insert(rq, false, false);
> -		else
> -			blk_mq_sched_insert_request(rq, true, false, false);
> +		blk_mq_sched_insert_request(rq, /*at_head=*/true, false, false);
>  	}
>  
>  	while (!list_empty(&rq_list)) {
> @@ -2065,9 +2057,14 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
>  		if (nr_budgets)
>  			blk_mq_release_budgets(q, list);
>  
> -		spin_lock(&hctx->lock);
> -		list_splice_tail_init(list, &hctx->dispatch);
> -		spin_unlock(&hctx->lock);
> +		if (!q->elevator) {
> +			spin_lock(&hctx->lock);
> +			list_splice_tail_init(list, &hctx->dispatch);
> +			spin_unlock(&hctx->lock);
> +		} else {
> +			q->elevator->type->ops.insert_requests(hctx, list,
> +							/*at_head=*/true);

Dispatch at head = true? Why? This seems wrong. It may be valid for the
requeue case (even then, I am not convinced), but looks very wrong for the
regular dispatch case.

> +		}
>  
>  		/*
>  		 * Order adding requests to hctx->dispatch and checking
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 5e6c79ad83d2..3a3bee9085e3 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -64,8 +64,9 @@ typedef __u32 __bitwise req_flags_t;
>  #define RQF_RESV			((__force req_flags_t)(1 << 23))
>  
>  /* flags that prevent us from merging requests: */
> -#define RQF_NOMERGE_FLAGS \
> -	(RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
> +#define RQF_NOMERGE_FLAGS                                               \
> +	(RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_DONTPREP | \
> +	 RQF_SPECIAL_PAYLOAD)
>  
>  enum mq_rq_state {
>  	MQ_RQ_IDLE		= 0,
Bart Van Assche April 10, 2023, 4:59 p.m. UTC | #2
On 4/10/23 00:53, Damien Le Moal wrote:
> On 4/8/23 08:58, Bart Van Assche wrote:
>> @@ -2065,9 +2057,14 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
>>   		if (nr_budgets)
>>   			blk_mq_release_budgets(q, list);
>>   
>> -		spin_lock(&hctx->lock);
>> -		list_splice_tail_init(list, &hctx->dispatch);
>> -		spin_unlock(&hctx->lock);
>> +		if (!q->elevator) {
>> +			spin_lock(&hctx->lock);
>> +			list_splice_tail_init(list, &hctx->dispatch);
>> +			spin_unlock(&hctx->lock);
>> +		} else {
>> +			q->elevator->type->ops.insert_requests(hctx, list,
>> +							/*at_head=*/true);
> 
> Dispatch at head = true? Why? This seems wrong. It may be valid for the
> requeue case (even then, I am not convinced), but looks very wrong for the
> regular dispatch case.

Hi Damien,

blk_mq_sched_dispatch_requests() dispatches requests from hctx->dispatch
before it checks whether any requests can be dispatched from the I/O
scheduler. As the quoted change above shows, blk_mq_dispatch_rq_list()
used to move any requests that could not be dispatched onto the
hctx->dispatch list. Since blk_mq_sched_dispatch_requests() processes
that list first, moving a request there effectively inserts it at the
head. Hence the at_head=true argument in the call to insert_requests().
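
For reference, the ordering inside blk_mq_sched_dispatch_requests() is
roughly the following (simplified sketch only; budget handling, busy
checks and the per-ctx round-robin path are omitted):

	static void sched_dispatch_order_sketch(struct blk_mq_hw_ctx *hctx)
	{
		LIST_HEAD(rq_list);

		/* 1. Requests parked on hctx->dispatch are tried first. */
		spin_lock(&hctx->lock);
		list_splice_init(&hctx->dispatch, &rq_list);
		spin_unlock(&hctx->lock);
		blk_mq_dispatch_rq_list(hctx, &rq_list, 0);

		/*
		 * 2. Only afterwards is the elevator asked for more work,
		 *    i.e. e->type->ops.dispatch_request(hctx) is called in
		 *    a loop.
		 */
	}

So anything that is handed back with at_head=true is seen again before
newly queued requests, which matches the old hctx->dispatch behavior.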

Thanks,

Bart.
Christoph Hellwig April 11, 2023, 12:38 p.m. UTC | #3
On Fri, Apr 07, 2023 at 04:58:13PM -0700, Bart Van Assche wrote:
> -		/*
> -		 * If RQF_DONTPREP, rq has contained some driver specific
> -		 * data, so insert it to hctx dispatch list to avoid any
> -		 * merge.
> -		 */
> -		if (rq->rq_flags & RQF_DONTPREP)
> -			blk_mq_request_bypass_insert(rq, false, false);
> -		else
> -			blk_mq_sched_insert_request(rq, true, false, false);
> +		blk_mq_sched_insert_request(rq, /*at_head=*/true, false, false);

This effectively undoes commit aef1897cd36d, and instead you add
RQF_DONTPREP to RQF_NOMERGE_FLAGS.  This makes sense to me, but it should
probably be documented more clearly in the commit log.
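
For context, the generic merge gate already honors these flags; abridged
from memory, rq_mergeable() is roughly:

	static inline bool rq_mergeable(struct request *rq)
	{
		if (blk_rq_is_passthrough(rq))
			return false;

		/* ... per-operation checks elided ... */

		/* Any RQF_NOMERGE_FLAGS bit disqualifies the request. */
		if (rq->rq_flags & RQF_NOMERGE_FLAGS)
			return false;
		if (rq->cmd_flags & REQ_NOMERGE_FLAGS)
			return false;

		return true;
	}

With RQF_DONTPREP included in RQF_NOMERGE_FLAGS, a requeued request that
already carries driver-specific data can go through the scheduler without
any risk of being merged, which is what the deleted comment in
blk_mq_requeue_work() was guarding against.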

> @@ -2065,9 +2057,14 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
>  		if (nr_budgets)
>  			blk_mq_release_budgets(q, list);
>  
> -		spin_lock(&hctx->lock);
> -		list_splice_tail_init(list, &hctx->dispatch);
> -		spin_unlock(&hctx->lock);
> +		if (!q->elevator) {
> +			spin_lock(&hctx->lock);
> +			list_splice_tail_init(list, &hctx->dispatch);
> +			spin_unlock(&hctx->lock);
> +		} else {
> +			q->elevator->type->ops.insert_requests(hctx, list,
> +							/*at_head=*/true);
> +		}

But I have no idea how this is related in any way.
Christoph Hellwig April 11, 2023, 1:14 p.m. UTC | #4
On Fri, Apr 07, 2023 at 04:58:13PM -0700, Bart Van Assche wrote:
> +		blk_mq_sched_insert_request(rq, /*at_head=*/true, false, false);

The whole usage of at_head in the requeue_list-related code looks
suspicious to me.

All callers of blk_mq_add_to_requeue_list() except for blk_kick_flush
pass at_head=true.  So the requeue_list is basically LIFO except for
that one caller.

blk_mq_requeue_work() then does a HEAD insert for them, unless they
are marked RQF_DONTPREP because the driver already did some setup.
So except for RQF_DONTPREP we basically revert the at_head insert.

This all feels wrong to me.  I think we need to get to a point where
the requeue_list itself is always appended to at the tail, processed
head to tail, but inserted into the scheduler or the hctx rq_list
before other pending requests, probably using code similar to
blk_mq_flush_plug_list / blk_mq_dispatch_plug_list.
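
In other words, something along these lines (hand-wavy sketch of the
idea only; locking and the !q->elevator case are omitted, and hctx
stands for the request's hardware queue):

	/* Producers: always append, so the requeue_list keeps submission order. */
	list_add_tail(&rq->queuelist, &q->requeue_list);

	/*
	 * Consumer (requeue work): take the batch in FIFO order and insert it
	 * ahead of other pending requests, similar to what
	 * blk_mq_dispatch_plug_list() does for plugged requests.
	 */
	LIST_HEAD(rq_list);
	list_splice_init(&q->requeue_list, &rq_list);
	q->elevator->type->ops.insert_requests(hctx, &rq_list, /*at_head=*/true);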
Bart Van Assche April 11, 2023, 5:17 p.m. UTC | #5
On 4/11/23 05:38, Christoph Hellwig wrote:
> On Fri, Apr 07, 2023 at 04:58:13PM -0700, Bart Van Assche wrote:
>> @@ -2065,9 +2057,14 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
>>   		if (nr_budgets)
>>   			blk_mq_release_budgets(q, list);
>>   
>> -		spin_lock(&hctx->lock);
>> -		list_splice_tail_init(list, &hctx->dispatch);
>> -		spin_unlock(&hctx->lock);
>> +		if (!q->elevator) {
>> +			spin_lock(&hctx->lock);
>> +			list_splice_tail_init(list, &hctx->dispatch);
>> +			spin_unlock(&hctx->lock);
>> +		} else {
>> +			q->elevator->type->ops.insert_requests(hctx, list,
>> +							/*at_head=*/true);
>> +		}
> 
> But I have no idea how this is related in any way.

Hi Christoph,

The I/O scheduler can only control the order in which requests are 
dispatched if:
- blk_mq_run_hw_queue() moves requests from the requeue list to the I/O
   scheduler before it asks the I/O scheduler to dispatch a request.
- No requests end up on any queue other than the I/O scheduler queue or
   the requeue list.

The scope of this patch is to send requeued requests back to the I/O 
scheduler. Hence the above change.
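
Conceptually the intended flow is (rough sketch only;
blk_mq_process_requeue_list() is a placeholder name for what later
patches in this series add):

	static void run_hw_queue_sketch(struct blk_mq_hw_ctx *hctx)
	{
		/*
		 * 1. Move requeued requests into the I/O scheduler first
		 *    (placeholder for the behavior this series introduces).
		 */
		blk_mq_process_requeue_list(hctx);

		/* 2. Only then ask the scheduler to pick what to dispatch. */
		blk_mq_sched_dispatch_requests(hctx);
	}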

Thanks,

Bart.

Patch

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 250556546bbf..57315395434b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1426,15 +1426,7 @@  static void blk_mq_requeue_work(struct work_struct *work)
 
 		rq->rq_flags &= ~RQF_SOFTBARRIER;
 		list_del_init(&rq->queuelist);
-		/*
-		 * If RQF_DONTPREP, rq has contained some driver specific
-		 * data, so insert it to hctx dispatch list to avoid any
-		 * merge.
-		 */
-		if (rq->rq_flags & RQF_DONTPREP)
-			blk_mq_request_bypass_insert(rq, false, false);
-		else
-			blk_mq_sched_insert_request(rq, true, false, false);
+		blk_mq_sched_insert_request(rq, /*at_head=*/true, false, false);
 	}
 
 	while (!list_empty(&rq_list)) {
@@ -2065,9 +2057,14 @@  bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
 		if (nr_budgets)
 			blk_mq_release_budgets(q, list);
 
-		spin_lock(&hctx->lock);
-		list_splice_tail_init(list, &hctx->dispatch);
-		spin_unlock(&hctx->lock);
+		if (!q->elevator) {
+			spin_lock(&hctx->lock);
+			list_splice_tail_init(list, &hctx->dispatch);
+			spin_unlock(&hctx->lock);
+		} else {
+			q->elevator->type->ops.insert_requests(hctx, list,
+							/*at_head=*/true);
+		}
 
 		/*
 		 * Order adding requests to hctx->dispatch and checking
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 5e6c79ad83d2..3a3bee9085e3 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -64,8 +64,9 @@  typedef __u32 __bitwise req_flags_t;
 #define RQF_RESV			((__force req_flags_t)(1 << 23))
 
 /* flags that prevent us from merging requests: */
-#define RQF_NOMERGE_FLAGS \
-	(RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
+#define RQF_NOMERGE_FLAGS                                               \
+	(RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_DONTPREP | \
+	 RQF_SPECIAL_PAYLOAD)
 
 enum mq_rq_state {
 	MQ_RQ_IDLE		= 0,