Message ID | 20241119002815.600608-15-bvanassche@acm.org (mailing list archive)
---|---
State | New
Series | Improve write performance for zoned UFS devices
On 11/19/24 09:28, Bart Van Assche wrote:
> Zoned writes may be requeued, e.g. if a block driver returns
> BLK_STS_RESOURCE. Requests may be requeued in another order than
> submitted. Restore the request order if requests are requeued.
>
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
>  block/bfq-iosched.c    |  2 ++
>  block/blk-mq.c         | 20 +++++++++++++++++++-
>  block/blk-mq.h         |  2 ++
>  block/kyber-iosched.c  |  2 ++
>  block/mq-deadline.c    |  7 ++++++-
>  include/linux/blk-mq.h |  2 +-
>  6 files changed, 32 insertions(+), 3 deletions(-)
>
> diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
> index 0747d9d0e48c..13bedbf03bd2 100644
> --- a/block/bfq-iosched.c
> +++ b/block/bfq-iosched.c
> @@ -6265,6 +6265,8 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
>
>          if (flags & BLK_MQ_INSERT_AT_HEAD) {
>                  list_add(&rq->queuelist, &bfqd->dispatch);
> +        } else if (flags & BLK_MQ_INSERT_ORDERED) {
> +                blk_mq_insert_ordered(rq, &bfqd->dispatch);
>          } else if (!bfqq) {
>                  list_add_tail(&rq->queuelist, &bfqd->dispatch);
>          } else {
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index f134d5e1c4a1..1302ccbf2a7d 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -1564,7 +1564,9 @@ static void blk_mq_requeue_work(struct work_struct *work)
>                   * already. Insert it into the hctx dispatch list to avoid
>                   * block layer merges for the request.
>                   */
> -                if (rq->rq_flags & RQF_DONTPREP)
> +                if (blk_rq_is_seq_zoned_write(rq))
> +                        blk_mq_insert_request(rq, BLK_MQ_INSERT_ORDERED);

Is this OK to do without any starvation prevention ? A high LBA write that
constantly gets requeued behind low LBA writes could end up in a timeout
situation, no ?

> +                else if (rq->rq_flags & RQF_DONTPREP)
>                          blk_mq_request_bypass_insert(rq, 0);
>                  else
>                          blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD);
> @@ -2599,6 +2601,20 @@ static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx,
>          blk_mq_run_hw_queue(hctx, run_queue_async);
>  }
>
> +void blk_mq_insert_ordered(struct request *rq, struct list_head *list)
> +{
> +        struct request_queue *q = rq->q;
> +        struct request *rq2;
> +
> +        list_for_each_entry(rq2, list, queuelist)
> +                if (rq2->q == q && blk_rq_pos(rq2) > blk_rq_pos(rq))
> +                        break;
> +
> +        /* Insert rq before rq2. If rq2 is the list head, append at the end. */
> +        list_add_tail(&rq->queuelist, &rq2->queuelist);
> +}
> +EXPORT_SYMBOL_GPL(blk_mq_insert_ordered);
> +
>  static void blk_mq_insert_request(struct request *rq, blk_insert_t flags)
>  {
>          struct request_queue *q = rq->q;
> @@ -2653,6 +2669,8 @@ static void blk_mq_insert_request(struct request *rq, blk_insert_t flags)
>                  spin_lock(&ctx->lock);
>                  if (flags & BLK_MQ_INSERT_AT_HEAD)
>                          list_add(&rq->queuelist, &ctx->rq_lists[hctx->type]);
> +                else if (flags & BLK_MQ_INSERT_ORDERED)
> +                        blk_mq_insert_ordered(rq, &ctx->rq_lists[hctx->type]);
>                  else
>                          list_add_tail(&rq->queuelist,
>                                        &ctx->rq_lists[hctx->type]);
> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index 309db553aba6..10b9fb3ca762 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -40,8 +40,10 @@ enum {
>
>  typedef unsigned int __bitwise blk_insert_t;
>  #define BLK_MQ_INSERT_AT_HEAD ((__force blk_insert_t)0x01)
> +#define BLK_MQ_INSERT_ORDERED ((__force blk_insert_t)0x02)
>
>  void blk_mq_submit_bio(struct bio *bio);
> +void blk_mq_insert_ordered(struct request *rq, struct list_head *list);
>  int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
>                  unsigned int flags);
>  void blk_mq_exit_queue(struct request_queue *q);
> diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
> index 4155594aefc6..77bb41bab68d 100644
> --- a/block/kyber-iosched.c
> +++ b/block/kyber-iosched.c
> @@ -603,6 +603,8 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
>                          trace_block_rq_insert(rq);
>                          if (flags & BLK_MQ_INSERT_AT_HEAD)
>                                  list_move(&rq->queuelist, head);
> +                        else if (flags & BLK_MQ_INSERT_ORDERED)
> +                                blk_mq_insert_ordered(rq, head);
>                          else
>                                  list_move_tail(&rq->queuelist, head);
>                          sbitmap_set_bit(&khd->kcq_map[sched_domain],
> diff --git a/block/mq-deadline.c b/block/mq-deadline.c
> index 2edf84b1bc2a..200e5a2928ce 100644
> --- a/block/mq-deadline.c
> +++ b/block/mq-deadline.c
> @@ -711,7 +711,12 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
>                   * set expire time and add to fifo list
>                   */
>                  rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
> -                list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]);
> +                if (flags & BLK_MQ_INSERT_ORDERED)
> +                        blk_mq_insert_ordered(rq,
> +                                              &per_prio->fifo_list[data_dir]);
> +                else
> +                        list_add_tail(&rq->queuelist,
> +                                      &per_prio->fifo_list[data_dir]);
>          }
>  }
>
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index ac05974f08f9..f7514eefccfd 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -85,7 +85,7 @@ enum {
>
>  /* flags that prevent us from merging requests: */
>  #define RQF_NOMERGE_FLAGS \
> -        (RQF_STARTED | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
> +        (RQF_STARTED | RQF_FLUSH_SEQ | RQF_DONTPREP | RQF_SPECIAL_PAYLOAD)
>
>  enum mq_rq_state {
>          MQ_RQ_IDLE = 0,
On 11/18/24 11:52 PM, Damien Le Moal wrote:
> On 11/19/24 09:28, Bart Van Assche wrote:
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index f134d5e1c4a1..1302ccbf2a7d 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -1564,7 +1564,9 @@ static void blk_mq_requeue_work(struct work_struct *work)
>>                   * already. Insert it into the hctx dispatch list to avoid
>>                   * block layer merges for the request.
>>                   */
>> -                if (rq->rq_flags & RQF_DONTPREP)
>> +                if (blk_rq_is_seq_zoned_write(rq))
>> +                        blk_mq_insert_request(rq, BLK_MQ_INSERT_ORDERED);
>
> Is this OK to do without any starvation prevention ? A high LBA write that
> constantly gets requeued behind low LBA writes could end up in a timeout
> situation, no ?

Hi Damien,

Requeuing zoned writes should be exceptional and shouldn't happen often.
Such starvation can only happen if zoned writes for two different zones are
requeued over and over again. If that happens there will not only be
starvation for the write with the higher LBA but also retry count exhaustion
for the write with the lower LBA. If we agree that zoned write retries are
rare then I don't think we have to worry about this kind of starvation.

Thanks,

Bart.
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 0747d9d0e48c..13bedbf03bd2 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -6265,6 +6265,8 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,

         if (flags & BLK_MQ_INSERT_AT_HEAD) {
                 list_add(&rq->queuelist, &bfqd->dispatch);
+        } else if (flags & BLK_MQ_INSERT_ORDERED) {
+                blk_mq_insert_ordered(rq, &bfqd->dispatch);
         } else if (!bfqq) {
                 list_add_tail(&rq->queuelist, &bfqd->dispatch);
         } else {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f134d5e1c4a1..1302ccbf2a7d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1564,7 +1564,9 @@ static void blk_mq_requeue_work(struct work_struct *work)
                  * already. Insert it into the hctx dispatch list to avoid
                  * block layer merges for the request.
                  */
-                if (rq->rq_flags & RQF_DONTPREP)
+                if (blk_rq_is_seq_zoned_write(rq))
+                        blk_mq_insert_request(rq, BLK_MQ_INSERT_ORDERED);
+                else if (rq->rq_flags & RQF_DONTPREP)
                         blk_mq_request_bypass_insert(rq, 0);
                 else
                         blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD);
@@ -2599,6 +2601,20 @@ static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx,
         blk_mq_run_hw_queue(hctx, run_queue_async);
 }

+void blk_mq_insert_ordered(struct request *rq, struct list_head *list)
+{
+        struct request_queue *q = rq->q;
+        struct request *rq2;
+
+        list_for_each_entry(rq2, list, queuelist)
+                if (rq2->q == q && blk_rq_pos(rq2) > blk_rq_pos(rq))
+                        break;
+
+        /* Insert rq before rq2. If rq2 is the list head, append at the end. */
+        list_add_tail(&rq->queuelist, &rq2->queuelist);
+}
+EXPORT_SYMBOL_GPL(blk_mq_insert_ordered);
+
 static void blk_mq_insert_request(struct request *rq, blk_insert_t flags)
 {
         struct request_queue *q = rq->q;
@@ -2653,6 +2669,8 @@ static void blk_mq_insert_request(struct request *rq, blk_insert_t flags)
                 spin_lock(&ctx->lock);
                 if (flags & BLK_MQ_INSERT_AT_HEAD)
                         list_add(&rq->queuelist, &ctx->rq_lists[hctx->type]);
+                else if (flags & BLK_MQ_INSERT_ORDERED)
+                        blk_mq_insert_ordered(rq, &ctx->rq_lists[hctx->type]);
                 else
                         list_add_tail(&rq->queuelist,
                                       &ctx->rq_lists[hctx->type]);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 309db553aba6..10b9fb3ca762 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -40,8 +40,10 @@ enum {

 typedef unsigned int __bitwise blk_insert_t;
 #define BLK_MQ_INSERT_AT_HEAD ((__force blk_insert_t)0x01)
+#define BLK_MQ_INSERT_ORDERED ((__force blk_insert_t)0x02)

 void blk_mq_submit_bio(struct bio *bio);
+void blk_mq_insert_ordered(struct request *rq, struct list_head *list);
 int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
                 unsigned int flags);
 void blk_mq_exit_queue(struct request_queue *q);
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 4155594aefc6..77bb41bab68d 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -603,6 +603,8 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
                         trace_block_rq_insert(rq);
                         if (flags & BLK_MQ_INSERT_AT_HEAD)
                                 list_move(&rq->queuelist, head);
+                        else if (flags & BLK_MQ_INSERT_ORDERED)
+                                blk_mq_insert_ordered(rq, head);
                         else
                                 list_move_tail(&rq->queuelist, head);
                         sbitmap_set_bit(&khd->kcq_map[sched_domain],
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 2edf84b1bc2a..200e5a2928ce 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -711,7 +711,12 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
                  * set expire time and add to fifo list
                  */
                 rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
-                list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]);
+                if (flags & BLK_MQ_INSERT_ORDERED)
+                        blk_mq_insert_ordered(rq,
+                                              &per_prio->fifo_list[data_dir]);
+                else
+                        list_add_tail(&rq->queuelist,
+                                      &per_prio->fifo_list[data_dir]);
         }
 }

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ac05974f08f9..f7514eefccfd 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -85,7 +85,7 @@ enum {

 /* flags that prevent us from merging requests: */
 #define RQF_NOMERGE_FLAGS \
-        (RQF_STARTED | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
+        (RQF_STARTED | RQF_FLUSH_SEQ | RQF_DONTPREP | RQF_SPECIAL_PAYLOAD)

 enum mq_rq_state {
         MQ_RQ_IDLE = 0,
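The one subtle line in blk_mq_insert_ordered() above is the final list_add_tail(): the kernel's list_add_tail(new, entry) inserts new in front of entry, so when the loop finds no request with a larger start sector, &rq2->queuelist ends up being the list head and the call appends rq at the tail. Below is a minimal stand-alone user-space sketch of that same pattern; the fake_rq structure, the sector values, and the simplified list helpers are invented for illustration (they mimic, rather than reuse, <linux/list.h>, and the request_queue check is omitted).

```c
#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *prev, *next; };

static void INIT_LIST_HEAD(struct list_head *h) { h->prev = h->next = h; }

/* Insert @new between @prev and @next in a circular doubly-linked list. */
static void __list_add(struct list_head *new, struct list_head *prev,
                       struct list_head *next)
{
        next->prev = new;
        new->next = next;
        new->prev = prev;
        prev->next = new;
}

/* Add @new just before @entry; if @entry is the list head, that is the tail. */
static void list_add_tail(struct list_head *new, struct list_head *entry)
{
        __list_add(new, entry->prev, entry);
}

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct fake_rq {                        /* stand-in for struct request */
        unsigned long long sector;      /* stand-in for blk_rq_pos()   */
        struct list_head queuelist;
};

/* Same shape as blk_mq_insert_ordered(): stop at the first larger entry. */
static void insert_ordered(struct fake_rq *rq, struct list_head *list)
{
        struct list_head *pos;

        for (pos = list->next; pos != list; pos = pos->next) {
                struct fake_rq *rq2 = container_of(pos, struct fake_rq, queuelist);

                if (rq2->sector > rq->sector)
                        break;
        }
        /* pos is either the first larger entry or the list head itself. */
        list_add_tail(&rq->queuelist, pos);
}

int main(void)
{
        struct list_head list;
        struct fake_rq a = { .sector = 8 }, b = { .sector = 24 }, c = { .sector = 16 };
        struct list_head *pos;

        INIT_LIST_HEAD(&list);
        insert_ordered(&b, &list);      /* requeued out of order ...   */
        insert_ordered(&a, &list);
        insert_ordered(&c, &list);      /* ... yet ends up LBA-sorted  */

        for (pos = list.next; pos != &list; pos = pos->next)
                printf("%llu\n", container_of(pos, struct fake_rq, queuelist)->sector);
        return 0;
}
```

Compiled and run, the sketch prints 8, 16, 24 regardless of the insertion order, which is the property the requeue path relies on to restore the submission order of sequential zoned writes.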
Zoned writes may be requeued, e.g. if a block driver returns
BLK_STS_RESOURCE. Requests may be requeued in another order than
submitted. Restore the request order if requests are requeued.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/bfq-iosched.c    |  2 ++
 block/blk-mq.c         | 20 +++++++++++++++++++-
 block/blk-mq.h         |  2 ++
 block/kyber-iosched.c  |  2 ++
 block/mq-deadline.c    |  7 ++++++-
 include/linux/blk-mq.h |  2 +-
 6 files changed, 32 insertions(+), 3 deletions(-)
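For readers less familiar with the trigger named in the description, a driver-side sketch of the BLK_STS_RESOURCE case follows. The driver name and the my_dev_has_credits()/my_dev_submit() helpers are invented for this illustration; only the blk_mq_ops callback shape, blk_mq_start_request(), and the status codes come from the real blk-mq API.

```c
#include <linux/blk-mq.h>

/* Hypothetical driver: everything prefixed with my_ is made up for this sketch. */
struct my_dev;
bool my_dev_has_credits(struct my_dev *dev);
void my_dev_submit(struct my_dev *dev, struct request *rq);

static blk_status_t my_queue_rq(struct blk_mq_hw_ctx *hctx,
                                const struct blk_mq_queue_data *bd)
{
        struct request *rq = bd->rq;
        struct my_dev *dev = hctx->queue->queuedata;

        /*
         * Out of device resources: returning BLK_STS_RESOURCE asks blk-mq to
         * dispatch this request again later. This is the kind of requeue the
         * patch description refers to, and where requests can come back in a
         * different order than they were submitted.
         */
        if (!my_dev_has_credits(dev))
                return BLK_STS_RESOURCE;

        blk_mq_start_request(rq);
        my_dev_submit(dev, rq);
        return BLK_STS_OK;
}

static const struct blk_mq_ops my_mq_ops = {
        .queue_rq = my_queue_rq,
};
```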