[v16,14/26] blk-mq: Restore the zoned write order when requeuing

Message ID 20241119002815.600608-15-bvanassche@acm.org
State Not Applicable
Series Improve write performance for zoned UFS devices

Commit Message

Bart Van Assche Nov. 19, 2024, 12:28 a.m. UTC
Zoned writes may be requeued, e.g. if a block driver returns
BLK_STS_RESOURCE. Requests may be requeued in a different order than
the order in which they were submitted. Restore the submission order
when requests are requeued.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/bfq-iosched.c    |  2 ++
 block/blk-mq.c         | 20 +++++++++++++++++++-
 block/blk-mq.h         |  2 ++
 block/kyber-iosched.c  |  2 ++
 block/mq-deadline.c    |  7 ++++++-
 include/linux/blk-mq.h |  2 +-
 6 files changed, 32 insertions(+), 3 deletions(-)
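
The ordered insert at the heart of this patch can be illustrated outside the
kernel. The sketch below is a minimal user-space approximation, not kernel
code: the struct and field names are made up, and a plain singly linked list
stands in for the kernel's list_head, but it follows the same rule as
blk_mq_insert_ordered(): insert the requeued request before the first request
of the same queue with a higher start sector, and append at the tail if no
such request exists.

/*
 * Minimal user-space sketch (not kernel code); names are illustrative only.
 */
#include <stdio.h>

struct sketch_rq {
	unsigned long long pos;		/* start sector, cf. blk_rq_pos(rq) */
	int q;				/* owning queue, cf. rq->q */
	struct sketch_rq *next;
};

/* Keep requests of the same queue sorted by ascending start sector. */
static void insert_ordered(struct sketch_rq **head, struct sketch_rq *rq)
{
	struct sketch_rq **link = head;

	/* Stop before the first same-queue request with a higher sector. */
	while (*link && !((*link)->q == rq->q && (*link)->pos > rq->pos))
		link = &(*link)->next;

	rq->next = *link;		/* NULL here means: append at the tail */
	*link = rq;
}

int main(void)
{
	/* Three zoned writes for the same queue, requeued in reverse order. */
	struct sketch_rq a = { .pos = 300, .q = 0 };
	struct sketch_rq b = { .pos = 200, .q = 0 };
	struct sketch_rq c = { .pos = 100, .q = 0 };
	struct sketch_rq *head = NULL;

	insert_ordered(&head, &a);
	insert_ordered(&head, &b);
	insert_ordered(&head, &c);

	for (struct sketch_rq *rq = head; rq; rq = rq->next)
		printf("sector %llu\n", rq->pos);	/* prints 100, 200, 300 */
	return 0;
}

Running the sketch prints the sectors in ascending order, which is the order
a zoned device requires for sequential writes.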

Comments

Damien Le Moal Nov. 19, 2024, 7:52 a.m. UTC | #1
On 11/19/24 09:28, Bart Van Assche wrote:
> Zoned writes may be requeued, e.g. if a block driver returns
> BLK_STS_RESOURCE. Requests may be requeued in a different order than
> the order in which they were submitted. Restore the submission order
> when requests are requeued.
> 
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
>  block/bfq-iosched.c    |  2 ++
>  block/blk-mq.c         | 20 +++++++++++++++++++-
>  block/blk-mq.h         |  2 ++
>  block/kyber-iosched.c  |  2 ++
>  block/mq-deadline.c    |  7 ++++++-
>  include/linux/blk-mq.h |  2 +-
>  6 files changed, 32 insertions(+), 3 deletions(-)
> 
> diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
> index 0747d9d0e48c..13bedbf03bd2 100644
> --- a/block/bfq-iosched.c
> +++ b/block/bfq-iosched.c
> @@ -6265,6 +6265,8 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
>  
>  	if (flags & BLK_MQ_INSERT_AT_HEAD) {
>  		list_add(&rq->queuelist, &bfqd->dispatch);
> +	} else if (flags & BLK_MQ_INSERT_ORDERED) {
> +		blk_mq_insert_ordered(rq, &bfqd->dispatch);
>  	} else if (!bfqq) {
>  		list_add_tail(&rq->queuelist, &bfqd->dispatch);
>  	} else {
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index f134d5e1c4a1..1302ccbf2a7d 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -1564,7 +1564,9 @@ static void blk_mq_requeue_work(struct work_struct *work)
>  		 * already.  Insert it into the hctx dispatch list to avoid
>  		 * block layer merges for the request.
>  		 */
> -		if (rq->rq_flags & RQF_DONTPREP)
> +		if (blk_rq_is_seq_zoned_write(rq))
> +			blk_mq_insert_request(rq, BLK_MQ_INSERT_ORDERED);

Is this OK to do without any starvation prevention? A high LBA write that
constantly gets requeued behind low LBA writes could end up in a timeout
situation, no?

> +		else if (rq->rq_flags & RQF_DONTPREP)
>  			blk_mq_request_bypass_insert(rq, 0);
>  		else
>  			blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD);
> @@ -2599,6 +2601,20 @@ static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx,
>  	blk_mq_run_hw_queue(hctx, run_queue_async);
>  }
>  
> +void blk_mq_insert_ordered(struct request *rq, struct list_head *list)
> +{
> +	struct request_queue *q = rq->q;
> +	struct request *rq2;
> +
> +	list_for_each_entry(rq2, list, queuelist)
> +		if (rq2->q == q && blk_rq_pos(rq2) > blk_rq_pos(rq))
> +			break;
> +
> +	/* Insert rq before rq2. If rq2 is the list head, append at the end. */
> +	list_add_tail(&rq->queuelist, &rq2->queuelist);
> +}
> +EXPORT_SYMBOL_GPL(blk_mq_insert_ordered);
> +
>  static void blk_mq_insert_request(struct request *rq, blk_insert_t flags)
>  {
>  	struct request_queue *q = rq->q;
> @@ -2653,6 +2669,8 @@ static void blk_mq_insert_request(struct request *rq, blk_insert_t flags)
>  		spin_lock(&ctx->lock);
>  		if (flags & BLK_MQ_INSERT_AT_HEAD)
>  			list_add(&rq->queuelist, &ctx->rq_lists[hctx->type]);
> +		else if (flags & BLK_MQ_INSERT_ORDERED)
> +			blk_mq_insert_ordered(rq, &ctx->rq_lists[hctx->type]);
>  		else
>  			list_add_tail(&rq->queuelist,
>  				      &ctx->rq_lists[hctx->type]);
> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index 309db553aba6..10b9fb3ca762 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -40,8 +40,10 @@ enum {
>  
>  typedef unsigned int __bitwise blk_insert_t;
>  #define BLK_MQ_INSERT_AT_HEAD		((__force blk_insert_t)0x01)
> +#define BLK_MQ_INSERT_ORDERED		((__force blk_insert_t)0x02)
>  
>  void blk_mq_submit_bio(struct bio *bio);
> +void blk_mq_insert_ordered(struct request *rq, struct list_head *list);
>  int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
>  		unsigned int flags);
>  void blk_mq_exit_queue(struct request_queue *q);
> diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
> index 4155594aefc6..77bb41bab68d 100644
> --- a/block/kyber-iosched.c
> +++ b/block/kyber-iosched.c
> @@ -603,6 +603,8 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
>  		trace_block_rq_insert(rq);
>  		if (flags & BLK_MQ_INSERT_AT_HEAD)
>  			list_move(&rq->queuelist, head);
> +		else if (flags & BLK_MQ_INSERT_ORDERED)
> +			blk_mq_insert_ordered(rq, head);
>  		else
>  			list_move_tail(&rq->queuelist, head);
>  		sbitmap_set_bit(&khd->kcq_map[sched_domain],
> diff --git a/block/mq-deadline.c b/block/mq-deadline.c
> index 2edf84b1bc2a..200e5a2928ce 100644
> --- a/block/mq-deadline.c
> +++ b/block/mq-deadline.c
> @@ -711,7 +711,12 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
>  		 * set expire time and add to fifo list
>  		 */
>  		rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
> -		list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]);
> +		if (flags & BLK_MQ_INSERT_ORDERED)
> +			blk_mq_insert_ordered(rq,
> +					      &per_prio->fifo_list[data_dir]);
> +		else
> +			list_add_tail(&rq->queuelist,
> +				      &per_prio->fifo_list[data_dir]);
>  	}
>  }
>  
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index ac05974f08f9..f7514eefccfd 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -85,7 +85,7 @@ enum {
>  
>  /* flags that prevent us from merging requests: */
>  #define RQF_NOMERGE_FLAGS \
> -	(RQF_STARTED | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
> +	(RQF_STARTED | RQF_FLUSH_SEQ | RQF_DONTPREP | RQF_SPECIAL_PAYLOAD)
>  
>  enum mq_rq_state {
>  	MQ_RQ_IDLE		= 0,
Bart Van Assche Nov. 19, 2024, 9:16 p.m. UTC | #2
On 11/18/24 11:52 PM, Damien Le Moal wrote:
> On 11/19/24 09:28, Bart Van Assche wrote:
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index f134d5e1c4a1..1302ccbf2a7d 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -1564,7 +1564,9 @@ static void blk_mq_requeue_work(struct work_struct *work)
>>   		 * already.  Insert it into the hctx dispatch list to avoid
>>   		 * block layer merges for the request.
>>   		 */
>> -		if (rq->rq_flags & RQF_DONTPREP)
>> +		if (blk_rq_is_seq_zoned_write(rq))
>> +			blk_mq_insert_request(rq, BLK_MQ_INSERT_ORDERED);
> 
> Is this OK to do without any starvation prevention? A high LBA write that
> constantly gets requeued behind low LBA writes could end up in a timeout
> situation, no?

Hi Damien,

Requeuing zoned writes should be exceptional and shouldn't happen often.
Such starvation can only happen if zoned writes for two different zones
are requeued over and over again. If that happens, there will not only be
starvation for the write with the higher LBA but also retry count
exhaustion for the write with the lower LBA. If we agree that zoned
write retries are rare, then I don't think we have to worry about this
kind of starvation.

Thanks,

Bart.
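
As a hedged illustration of the scenario discussed above (all names and
numbers below are made up), this user-space sketch keeps a sorted list of
pending start sectors, requeues a new lower-sector write each round, and then
dispatches the list head. The write at sector 1000 stays at the tail for as
long as lower-sector writes keep being requeued in front of it, which is why
the ordering only becomes a concern if requeues repeat.

#include <stdio.h>

int main(void)
{
	unsigned long long list[16];	/* pending start sectors, ascending */
	unsigned long long high = 1000, low = 10;
	int n = 0;

	list[n++] = high;		/* high-sector write already pending */

	for (int round = 0; round < 5; round++) {
		/* A lower-sector write is requeued; ordered insertion puts
		 * it in front of the high-sector write. */
		int i = n;

		while (i > 0 && list[i - 1] > low) {
			list[i] = list[i - 1];
			i--;
		}
		list[i] = low;
		n++;
		low += 10;

		/* Dispatch the head of the list. */
		printf("round %d: dispatched sector %llu, sector %llu still queued\n",
		       round, list[0], list[n - 1]);
		for (int j = 0; j < n - 1; j++)
			list[j] = list[j + 1];
		n--;
	}
	return 0;
}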

Patch

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 0747d9d0e48c..13bedbf03bd2 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -6265,6 +6265,8 @@  static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 
 	if (flags & BLK_MQ_INSERT_AT_HEAD) {
 		list_add(&rq->queuelist, &bfqd->dispatch);
+	} else if (flags & BLK_MQ_INSERT_ORDERED) {
+		blk_mq_insert_ordered(rq, &bfqd->dispatch);
 	} else if (!bfqq) {
 		list_add_tail(&rq->queuelist, &bfqd->dispatch);
 	} else {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f134d5e1c4a1..1302ccbf2a7d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1564,7 +1564,9 @@  static void blk_mq_requeue_work(struct work_struct *work)
 		 * already.  Insert it into the hctx dispatch list to avoid
 		 * block layer merges for the request.
 		 */
-		if (rq->rq_flags & RQF_DONTPREP)
+		if (blk_rq_is_seq_zoned_write(rq))
+			blk_mq_insert_request(rq, BLK_MQ_INSERT_ORDERED);
+		else if (rq->rq_flags & RQF_DONTPREP)
 			blk_mq_request_bypass_insert(rq, 0);
 		else
 			blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD);
@@ -2599,6 +2601,20 @@  static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx,
 	blk_mq_run_hw_queue(hctx, run_queue_async);
 }
 
+void blk_mq_insert_ordered(struct request *rq, struct list_head *list)
+{
+	struct request_queue *q = rq->q;
+	struct request *rq2;
+
+	list_for_each_entry(rq2, list, queuelist)
+		if (rq2->q == q && blk_rq_pos(rq2) > blk_rq_pos(rq))
+			break;
+
+	/* Insert rq before rq2. If rq2 is the list head, append at the end. */
+	list_add_tail(&rq->queuelist, &rq2->queuelist);
+}
+EXPORT_SYMBOL_GPL(blk_mq_insert_ordered);
+
 static void blk_mq_insert_request(struct request *rq, blk_insert_t flags)
 {
 	struct request_queue *q = rq->q;
@@ -2653,6 +2669,8 @@  static void blk_mq_insert_request(struct request *rq, blk_insert_t flags)
 		spin_lock(&ctx->lock);
 		if (flags & BLK_MQ_INSERT_AT_HEAD)
 			list_add(&rq->queuelist, &ctx->rq_lists[hctx->type]);
+		else if (flags & BLK_MQ_INSERT_ORDERED)
+			blk_mq_insert_ordered(rq, &ctx->rq_lists[hctx->type]);
 		else
 			list_add_tail(&rq->queuelist,
 				      &ctx->rq_lists[hctx->type]);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 309db553aba6..10b9fb3ca762 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -40,8 +40,10 @@  enum {
 
 typedef unsigned int __bitwise blk_insert_t;
 #define BLK_MQ_INSERT_AT_HEAD		((__force blk_insert_t)0x01)
+#define BLK_MQ_INSERT_ORDERED		((__force blk_insert_t)0x02)
 
 void blk_mq_submit_bio(struct bio *bio);
+void blk_mq_insert_ordered(struct request *rq, struct list_head *list);
 int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
 		unsigned int flags);
 void blk_mq_exit_queue(struct request_queue *q);
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 4155594aefc6..77bb41bab68d 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -603,6 +603,8 @@  static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
 		trace_block_rq_insert(rq);
 		if (flags & BLK_MQ_INSERT_AT_HEAD)
 			list_move(&rq->queuelist, head);
+		else if (flags & BLK_MQ_INSERT_ORDERED)
+			blk_mq_insert_ordered(rq, head);
 		else
 			list_move_tail(&rq->queuelist, head);
 		sbitmap_set_bit(&khd->kcq_map[sched_domain],
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 2edf84b1bc2a..200e5a2928ce 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -711,7 +711,12 @@  static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 		 * set expire time and add to fifo list
 		 */
 		rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
-		list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]);
+		if (flags & BLK_MQ_INSERT_ORDERED)
+			blk_mq_insert_ordered(rq,
+					      &per_prio->fifo_list[data_dir]);
+		else
+			list_add_tail(&rq->queuelist,
+				      &per_prio->fifo_list[data_dir]);
 	}
 }
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ac05974f08f9..f7514eefccfd 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -85,7 +85,7 @@  enum {
 
 /* flags that prevent us from merging requests: */
 #define RQF_NOMERGE_FLAGS \
-	(RQF_STARTED | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
+	(RQF_STARTED | RQF_FLUSH_SEQ | RQF_DONTPREP | RQF_SPECIAL_PAYLOAD)
 
 enum mq_rq_state {
 	MQ_RQ_IDLE		= 0,