
[v2,4/4] block/mq-deadline: Prevent zoned write reordering due to I/O prioritization

Message ID 20231218211342.2179689-5-bvanassche@acm.org
State New, archived
Series Improve I/O priority support in mq-deadline for zoned writes

Commit Message

Bart Van Assche Dec. 18, 2023, 9:13 p.m. UTC
Assigning I/O priorities with the ioprio cgroup policy may cause
different I/O priorities to be assigned to write requests for the same
zone. Prevent this from causing unaligned write errors by inserting
zoned writes for a zone into the same priority queue as prior zoned
writes for that zone.

Cc: Damien Le Moal <dlemoal@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/Kconfig.iosched    |   5 ++
 block/mq-deadline.c      |  81 +++++++++++++++---
 block/mq-deadline_test.c | 175 +++++++++++++++++++++++++++++++++++++++
 include/linux/blk-mq.h   |  17 ++++
 4 files changed, 268 insertions(+), 10 deletions(-)
 create mode 100644 block/mq-deadline_test.c

Comments

Christoph Hellwig Dec. 19, 2023, 12:10 p.m. UTC | #1
On Mon, Dec 18, 2023 at 01:13:42PM -0800, Bart Van Assche wrote:
> Assigning I/O priorities with the ioprio cgroup policy may cause
> different I/O priorities to be assigned to write requests for the same
> zone. Prevent this from causing unaligned write errors by inserting
> zoned writes for a zone into the same priority queue as prior zoned
> writes for that zone.

I still think this is fundamentally the wrong thing to do.  If you set
different priorities, you want I/O to be reordered, so ignoring that
is a bad thing.
Bart Van Assche Dec. 19, 2023, 5:42 p.m. UTC | #2
On 12/19/23 04:10, Christoph Hellwig wrote:
> On Mon, Dec 18, 2023 at 01:13:42PM -0800, Bart Van Assche wrote:
>> Assigning I/O priorities with the ioprio cgroup policy may cause
>> different I/O priorities to be assigned to write requests for the same
>> zone. Prevent this from causing unaligned write errors by inserting
>> zoned writes for a zone into the same priority queue as prior zoned
>> writes for that zone.
> 
> I still think this is fundamentally the wrong thing to do.  If you set
> different priorities, you want I/O to be reordered, so ignoring that
> is a bad thing.

Hi Christoph,

How about not setting the I/O priority of sequential zoned writes as in
the (untested) patch below?

Thanks,

Bart.


[PATCH] block: Do not set the I/O priority for sequential zoned writes

---
  block/blk-mq.c         |  7 +++++++
  include/linux/blk-mq.h | 17 +++++++++++++++++
  2 files changed, 24 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index c11c97afa0bc..668888103a47 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2922,6 +2922,13 @@ static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug,

  static void bio_set_ioprio(struct bio *bio)
  {
+	/*
+	 * Do not set the I/O priority of sequential zoned write bios because
+	 * this could lead to reordering and hence to unaligned write errors.
+	 */
+	if (blk_bio_is_seq_zoned_write(bio))
+		return;
+
  	/* Nobody set ioprio so far? Initialize it based on task's nice value */
  	if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE)
  		bio->bi_ioprio = get_current_ioprio();
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 1ab3081c82ed..e7fa81170b7c 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -1149,6 +1149,18 @@ static inline unsigned int blk_rq_zone_no(struct request *rq)
  	return disk_zone_no(rq->q->disk, blk_rq_pos(rq));
  }

+/**
+ * blk_bio_is_seq_zoned_write() - Check if @bio requires write serialization.
+ * @bio: Bio to examine.
+ *
+ * Note: REQ_OP_ZONE_APPEND bios do not require serialization.
+ */
+static inline bool blk_bio_is_seq_zoned_write(struct bio *bio)
+{
+	return disk_zone_is_seq(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector) &&
+		op_needs_zoned_write_locking(bio_op(bio));
+}
+
  static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
  {
  	return disk_zone_is_seq(rq->q->disk, blk_rq_pos(rq));
@@ -1196,6 +1208,11 @@ static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
  	return !blk_req_zone_is_write_locked(rq);
  }
  #else /* CONFIG_BLK_DEV_ZONED */
+static inline bool blk_bio_is_seq_zoned_write(struct bio *bio)
+{
+	return false;
+}
+
  static inline bool blk_rq_is_seq_zoned_write(struct request *rq)
  {
  	return false;
Damien Le Moal Dec. 20, 2023, 12:05 a.m. UTC | #3
On 12/20/23 02:42, Bart Van Assche wrote:
> On 12/19/23 04:10, Christoph Hellwig wrote:
>> On Mon, Dec 18, 2023 at 01:13:42PM -0800, Bart Van Assche wrote:
>>> Assigning I/O priorities with the ioprio cgroup policy may cause
>>> different I/O priorities to be assigned to write requests for the same
>>> zone. Prevent this from causing unaligned write errors by inserting
>>> zoned writes for a zone into the same priority queue as prior zoned
>>> writes for that zone.
>>
>> I still think this is fundamentally the wrong thing to do.  If you set
>> different priorities, you want I/O to be reordered, so ignoring that
>> is a bad thing.
> 
> Hi Christoph,
> 
> How about not setting the I/O priority of sequential zoned writes as in
> the (untested) patch below?
> 
> Thanks,
> 
> Bart.
> 
> 
> [PATCH] block: Do not set the I/O priority for sequential zoned writes
> 
> ---
>   block/blk-mq.c         |  7 +++++++
>   include/linux/blk-mq.h | 17 +++++++++++++++++
>   2 files changed, 24 insertions(+)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index c11c97afa0bc..668888103a47 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2922,6 +2922,13 @@ static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug,
> 
>   static void bio_set_ioprio(struct bio *bio)
>   {
> +	/*
> +	 * Do not set the I/O priority of sequential zoned write bios because
> +	 * this could lead to reordering and hence to unaligned write errors.
> +	 */
> +	if (blk_bio_is_seq_zoned_write(bio))
> +		return;

That is not acceptable as that will ignore priorities passed for async direct
IOs through aio->aio_reqprio. That one is a perfectly acceptable use case and we
should not ignore it.

> +
>   	/* Nobody set ioprio so far? Initialize it based on task's nice value */
>   	if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE)
>   		bio->bi_ioprio = get_current_ioprio();
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 1ab3081c82ed..e7fa81170b7c 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -1149,6 +1149,18 @@ static inline unsigned int blk_rq_zone_no(struct request *rq)
>   	return disk_zone_no(rq->q->disk, blk_rq_pos(rq));
>   }
> 
> +/**
> + * blk_bio_is_seq_zoned_write() - Check if @bio requires write serialization.
> + * @bio: Bio to examine.
> + *
> + * Note: REQ_OP_ZONE_APPEND bios do not require serialization.
> + */
> +static inline bool blk_bio_is_seq_zoned_write(struct bio *bio)
> +{
> +	return disk_zone_is_seq(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector) &&
> +		op_needs_zoned_write_locking(bio_op(bio));
> +}
> +
>   static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
>   {
>   	return disk_zone_is_seq(rq->q->disk, blk_rq_pos(rq));
> @@ -1196,6 +1208,11 @@ static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
>   	return !blk_req_zone_is_write_locked(rq);
>   }
>   #else /* CONFIG_BLK_DEV_ZONED */
> +static inline bool blk_bio_is_seq_zoned_write(struct bio *bio)
> +{
> +	return false;
> +}
> +
>   static inline bool blk_rq_is_seq_zoned_write(struct request *rq)
>   {
>   	return false;
>
Bart Van Assche Dec. 20, 2023, 12:48 a.m. UTC | #4
On 12/19/23 16:05, Damien Le Moal wrote:
> On 12/20/23 02:42, Bart Van Assche wrote:
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index c11c97afa0bc..668888103a47 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -2922,6 +2922,13 @@ static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug,
>>
>>    static void bio_set_ioprio(struct bio *bio)
>>    {
>> +	/*
>> +	 * Do not set the I/O priority of sequential zoned write bios because
>> +	 * this could lead to reordering and hence to unaligned write errors.
>> +	 */
>> +	if (blk_bio_is_seq_zoned_write(bio))
>> +		return;
> 
> That is not acceptable as that will ignore priorities passed for async direct
> IOs through aio->aio_reqprio. That one is a perfectly acceptable use case and we
> should not ignore it.

Hi Damien,

What you wrote is wrong. bio_set_ioprio() applies the I/O priority set
by ionice or by the blk-ioprio cgroup policy. The above patch does not
affect the priorities set via aio_reqprio: aio_reqprio is still copied
into ki_ioprio, and ki_ioprio is still copied into bi_ioprio by the
direct I/O code.
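
For reference, a minimal sketch of that propagation path (abridged from
fs/aio.c and the block direct I/O code; capability checks and error handling
are omitted, so this is illustrative rather than a literal excerpt):

/* fs/aio.c, aio_prep_rw(): aio_reqprio ends up in ki_ioprio. */
if (iocb->aio_reqprio)
	req->ki_ioprio = iocb->aio_reqprio;
else
	req->ki_ioprio = get_current_ioprio();

/*
 * Direct I/O submission (e.g. block/fops.c, fs/iomap/direct-io.c):
 * ki_ioprio ends up in bi_ioprio before bio_set_ioprio() runs.
 */
bio->bi_ioprio = iocb->ki_ioprio;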

Bart.
Damien Le Moal Dec. 20, 2023, 1:28 a.m. UTC | #5
On 12/20/23 09:48, Bart Van Assche wrote:
> On 12/19/23 16:05, Damien Le Moal wrote:
>> On 12/20/23 02:42, Bart Van Assche wrote:
>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>> index c11c97afa0bc..668888103a47 100644
>>> --- a/block/blk-mq.c
>>> +++ b/block/blk-mq.c
>>> @@ -2922,6 +2922,13 @@ static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug,
>>>
>>>    static void bio_set_ioprio(struct bio *bio)
>>>    {
>>> +	/*
>>> +	 * Do not set the I/O priority of sequential zoned write bios because
>>> +	 * this could lead to reordering and hence to unaligned write errors.
>>> +	 */
>>> +	if (blk_bio_is_seq_zoned_write(bio))
>>> +		return;
>>
>> That is not acceptable as that will ignore priorities passed for async direct
>> IOs through aio->aio_reqprio. That one is a perfectly acceptable use case and we
>> should not ignore it.
> 
> Hi Damien,
> 
> What you wrote is wrong. bio_set_ioprio() applies the I/O priority set
> by ionice or by the blk-ioprio cgroup policy. The above patch does not
> affect the priorities set via aio_reqprio. aio_reqprio is still copied
> in ki_ioprio and ki_ioprio is still copied into bi_ioprio by the direct
> I/O code.

OK. But your patch will still end up with I/O priorities being ignored for
legitimate use cases that do not lead to mixed priorities, e.g. applications
using the raw device directly and writing to a zone without mixing priorities,
whether with AIO, ionice or cgroups.

The issue is when a user mixes different IO priorities for writes to the same
zone, and as I said before, since doing that is nonsensical, getting the IOs to
fail is fine by me. The user will then be aware that this should not be done.

f2fs has a problem with that though, as it leads to write errors and the FS
going read-only (I guess). btrfs will not have this issue because it uses zone
append. Need to check dm-zoned, as there may be an issue there.

So what about what I proposed in an earlier email: introduce a bio flag "ignore
ioprio" that causes bio_set_ioprio() to not set any IO priority and have f2fs
set that flag for any zone write BIO it issues? That will solve your f2fs issue
without messing up good use cases.
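
Roughly something like the below. REQ_NO_IOPRIO is only a placeholder name for
the proposed flag, not an existing REQ_* bit:

static void bio_set_ioprio(struct bio *bio)
{
	/* Hypothetical flag, set by f2fs on the zoned write bios it submits. */
	if (bio->bi_opf & REQ_NO_IOPRIO)
		return;

	/* Nobody set ioprio so far? Initialize it based on task's nice value */
	if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE)
		bio->bi_ioprio = get_current_ioprio();
	blkcg_set_ioprio(bio);
}
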
Christoph Hellwig Dec. 20, 2023, 3:53 a.m. UTC | #6
On Wed, Dec 20, 2023 at 10:28:37AM +0900, Damien Le Moal wrote:
> zone, and as I said before, since doing that is nonsensical, getting the IOs to
> fail is fine by me. The user will then be aware that this should not be done.
> 
> f2fs has a problem with that though, as it leads to write errors and the FS
> going read-only (I guess). btrfs will not have this issue because it uses zone
> append. Need to check dm-zoned, as there may be an issue there.
> 
> So what about what I proposed in an earlier email: introduce a bio flag "ignore
> ioprio" that causes bio_set_ioprio() to not set any IO priority and have f2fs
> set that flag for any zone write BIO it issues? That will solve your f2fs issue
> without messing up good use cases.

How can this even be a problem for f2fs upstream, where f2fs must only
have a single write per zone outstanding?  I really don't want crap
in the block layer to work around a known broken model (multiple
outstanding WRITE commands per zone) that, because it is known to be
broken, isn't even merged upstream.
Damien Le Moal Dec. 20, 2023, 4:40 a.m. UTC | #7
On 12/20/23 12:53, Christoph Hellwig wrote:
> On Wed, Dec 20, 2023 at 10:28:37AM +0900, Damien Le Moal wrote:
>> zone, and as I said before, since doing that is nonsensical, getting the IOs to
>> fail is fine by me. The user will then be aware that this should not be done.
>>
>> f2fs has a problem with that though, as it leads to write errors and the FS
>> going read-only (I guess). btrfs will not have this issue because it uses zone
>> append. Need to check dm-zoned, as there may be an issue there.
>>
>> So what about what I proposed in an earlier email: introduce a bio flag "ignore
>> ioprio" that causes bio_set_ioprio() to not set any IO priority and have f2fs
>> set that flag for any zone write BIO it issues? That will solve your f2fs issue
>> without messing up good use cases.
> 
> How can this even be a problem for f2fs upstream, where f2fs must only
> have a single write per zone outstanding?  I really don't want crap
> in the block layer to work around a known broken model (multiple
> outstanding WRITE commands per zone) that, because it is known to be
> broken, isn't even merged upstream.

The only constraint at the BIO level for writing to a zone is "issue the write
BIOs sequentially". So multiple write BIOs *can* be issued to a zone. The "one
write per zone in flight at any time" rule implemented with zone write locking
is enforced at the request level, in the block IO scheduler, so underneath the
file system. So the issue can indeed happen.

But what you said could actually provide a solution: have the FS issue regular
writes one at a time if the writes have priorities.
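
E.g. something as simple as submitting the write bios for a zone one at a time
and waiting for each one to complete before issuing the next. A minimal sketch,
purely illustrative and not actual f2fs code:

static int submit_zone_writes_serialized(struct bio *bios[], int nr)
{
	int i, ret;

	/* Wait for each write to complete before issuing the next one. */
	for (i = 0; i < nr; i++) {
		ret = submit_bio_wait(bios[i]);
		if (ret)
			return ret;
	}
	return 0;
}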

Patch

diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 27f11320b8d1..3e2fcbdbac65 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -7,6 +7,11 @@  config MQ_IOSCHED_DEADLINE
 	help
 	  MQ version of the deadline IO scheduler.
 
+config MQ_IOSCHED_DEADLINE_TEST
+	tristate "MQ deadline unit tests" if !KUNIT_ALL_TESTS
+	depends on MQ_IOSCHED_DEADLINE && KUNIT
+	default KUNIT_ALL_TESTS
+
 config MQ_IOSCHED_KYBER
 	tristate "Kyber I/O scheduler"
 	default y
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index c0f92cc729ca..d1d54cac4c37 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -46,6 +46,8 @@  enum dd_data_dir {
 enum { DD_DIR_COUNT = 2 };
 
 enum dd_prio {
+	DD_INVALID_PRIO	= -1,
+	DD_PRIO_MIN	= 0,
 	DD_RT_PRIO	= 0,
 	DD_BE_PRIO	= 1,
 	DD_IDLE_PRIO	= 2,
@@ -113,6 +115,12 @@  static const enum dd_prio ioprio_class_to_prio[] = {
 	[IOPRIO_CLASS_IDLE]	= DD_IDLE_PRIO,
 };
 
+static const u8 prio_to_ioprio_class[] = {
+	[DD_RT_PRIO]	= IOPRIO_CLASS_RT,
+	[DD_BE_PRIO]	= IOPRIO_CLASS_BE,
+	[DD_IDLE_PRIO]	= IOPRIO_CLASS_IDLE,
+};
+
 static inline struct rb_root *
 deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq)
 {
@@ -194,18 +202,67 @@  static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio,
 	return deadline_first_rq_past_pos(per_prio, data_dir, pos);
 }
 
+/*
+ * If any sequential write requests are pending for the zone containing @pos,
+ * return the I/O priority for these write requests.
+ */
+static enum dd_prio dd_zone_prio(struct deadline_data *dd,
+				 struct block_device *bdev, sector_t pos)
+{
+#ifdef CONFIG_BLK_DEV_ZONED
+	struct gendisk *disk = bdev->bd_disk;
+	const unsigned int zno = disk_zone_no(disk, pos);
+	enum dd_prio prio;
+
+	pos -= bdev_offset_from_zone_start(bdev, pos);
+	for (prio = DD_PRIO_MIN; prio <= DD_PRIO_MAX; prio++) {
+		struct dd_per_prio *per_prio = &dd->per_prio[prio];
+		struct request *rq;
+
+		rq = deadline_first_rq_past_pos(per_prio, DD_WRITE, pos);
+		while (rq && blk_rq_zone_no(rq) == zno) {
+			struct rb_node *node;
+
+			if (blk_rq_is_seq_zoned_write(rq))
+				return prio;
+			node = rb_next(&rq->rb_node);
+			if (!node)
+				break;
+			rq = rb_entry_rq(node);
+		}
+	}
+#endif
+	return DD_INVALID_PRIO;
+}
+
 /*
  * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a
  * request.
  */
-static enum dd_prio dd_rq_ioprio(struct request *rq)
+static enum dd_prio dd_rq_ioprio(struct deadline_data *dd, struct request *rq)
 {
-	return ioprio_class_to_prio[IOPRIO_PRIO_CLASS(req_get_ioprio(rq))];
+	enum dd_prio prio;
+
+	if (!blk_rq_is_seq_zoned_write(rq) || !rq->bio)
+		return ioprio_class_to_prio[IOPRIO_PRIO_CLASS(
+			req_get_ioprio(rq))];
+	prio = dd_zone_prio(dd, rq->q->disk->part0, blk_rq_pos(rq));
+	if (prio == DD_INVALID_PRIO)
+		return ioprio_class_to_prio[IOPRIO_PRIO_CLASS(
+			req_get_ioprio(rq))];
+	return prio;
 }
 
-static enum dd_prio dd_bio_ioprio(struct bio *bio)
+static enum dd_prio dd_bio_ioprio(struct deadline_data *dd, struct bio *bio)
 {
-	return ioprio_class_to_prio[IOPRIO_PRIO_CLASS(bio->bi_ioprio)];
+	enum dd_prio prio;
+
+	if (!blk_bio_is_seq_zoned_write(bio))
+		return ioprio_class_to_prio[IOPRIO_PRIO_CLASS(bio->bi_ioprio)];
+	prio = dd_zone_prio(dd, bio->bi_bdev, bio->bi_iter.bi_sector);
+	if (prio == DD_INVALID_PRIO)
+		return ioprio_class_to_prio[IOPRIO_PRIO_CLASS(bio->bi_ioprio)];
+	return prio;
 }
 
 static void
@@ -246,7 +303,7 @@  static void dd_request_merged(struct request_queue *q, struct request *req,
 			      enum elv_merge type)
 {
 	struct deadline_data *dd = q->elevator->elevator_data;
-	const enum dd_prio prio = dd_rq_ioprio(req);
+	const enum dd_prio prio = dd_rq_ioprio(dd, req);
 	struct dd_per_prio *per_prio = &dd->per_prio[prio];
 
 	/*
@@ -265,7 +322,7 @@  static void dd_merged_requests(struct request_queue *q, struct request *req,
 			       struct request *next)
 {
 	struct deadline_data *dd = q->elevator->elevator_data;
-	const enum dd_prio prio = dd_rq_ioprio(next);
+	const enum dd_prio prio = dd_rq_ioprio(dd, next);
 
 	lockdep_assert_held(&dd->lock);
 
@@ -560,7 +617,7 @@  static struct request *__dd_dispatch_request(struct deadline_data *dd,
 	dd->batching++;
 	deadline_move_request(dd, per_prio, rq);
 done:
-	prio = dd_rq_ioprio(rq);
+	prio = dd_rq_ioprio(dd, rq);
 	dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq);
 	dd->per_prio[prio].stats.dispatched++;
 	/*
@@ -758,7 +815,7 @@  static int dd_request_merge(struct request_queue *q, struct request **rq,
 			    struct bio *bio)
 {
 	struct deadline_data *dd = q->elevator->elevator_data;
-	const enum dd_prio prio = dd_bio_ioprio(bio);
+	const enum dd_prio prio = dd_bio_ioprio(dd, bio);
 	struct dd_per_prio *per_prio = &dd->per_prio[prio];
 	sector_t sector = bio_end_sector(bio);
 	struct request *__rq;
@@ -822,7 +879,7 @@  static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 	 */
 	blk_req_zone_write_unlock(rq);
 
-	prio = dd_rq_ioprio(rq);
+	prio = dd_rq_ioprio(dd, rq);
 	per_prio = &dd->per_prio[prio];
 	if (!rq->elv.priv[0]) {
 		per_prio->stats.inserted++;
@@ -931,7 +988,7 @@  static void dd_finish_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 	struct deadline_data *dd = q->elevator->elevator_data;
-	const enum dd_prio prio = dd_rq_ioprio(rq);
+	const enum dd_prio prio = dd_rq_ioprio(dd, rq);
 	struct dd_per_prio *per_prio = &dd->per_prio[prio];
 
 	/*
@@ -1282,6 +1339,10 @@  static struct elevator_type mq_deadline = {
 };
 MODULE_ALIAS("mq-deadline-iosched");
 
+#ifdef CONFIG_MQ_IOSCHED_DEADLINE_TEST
+#include "mq-deadline_test.c"
+#endif
+
 static int __init deadline_init(void)
 {
 	return elv_register(&mq_deadline);
diff --git a/block/mq-deadline_test.c b/block/mq-deadline_test.c
new file mode 100644
index 000000000000..72bec6fd5f7a
--- /dev/null
+++ b/block/mq-deadline_test.c
@@ -0,0 +1,175 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Google LLC
+ */
+#include <kunit/test.h>
+#include <linux/cleanup.h>
+
+static void test_ioprio(struct kunit *test)
+{
+	static struct block_device bdev;
+	static struct gendisk disk = { .part0 = &bdev };
+	static struct request_queue queue = { .disk = &disk };
+	static struct blk_mq_hw_ctx hctx = { .queue = &queue };
+	static struct bio bio1 = { .bi_bdev = &bdev,
+				   .bi_opf = REQ_OP_WRITE,
+				   .bi_ioprio = IOPRIO_CLASS_IDLE
+						<< IOPRIO_CLASS_SHIFT };
+	static struct request rq1 = { .q = &queue,
+				      .cmd_flags = REQ_OP_WRITE,
+				      .__sector = 1,
+				      .__data_len = 1,
+				      .bio = &bio1,
+				      .mq_hctx = &hctx,
+				      .ioprio = IOPRIO_CLASS_IDLE
+						<< IOPRIO_CLASS_SHIFT };
+	static struct bio bio2 = { .bi_bdev = &bdev,
+				   .bi_opf = REQ_OP_WRITE,
+				   .bi_ioprio = IOPRIO_CLASS_BE
+						<< IOPRIO_CLASS_SHIFT };
+	static struct request rq2 = { .q = &queue,
+				      .cmd_flags = REQ_OP_WRITE,
+				      .__sector = 3,
+				      .__data_len = 1,
+				      .bio = &bio2,
+				      .mq_hctx = &hctx,
+				      .ioprio = IOPRIO_CLASS_BE
+						<< IOPRIO_CLASS_SHIFT };
+	static struct bio bio3 = { .bi_bdev = &bdev,
+				   .bi_opf = REQ_OP_WRITE,
+				   .bi_ioprio = IOPRIO_CLASS_RT
+						<< IOPRIO_CLASS_SHIFT };
+	static struct request rq3 = { .q = &queue,
+				      .cmd_flags = REQ_OP_WRITE,
+				      .__sector = 5,
+				      .__data_len = 1,
+				      .bio = &bio3,
+				      .mq_hctx = &hctx,
+				      .ioprio = IOPRIO_CLASS_RT
+						<< IOPRIO_CLASS_SHIFT };
+	struct request *rq;
+	static LIST_HEAD(rq_list);
+
+	bdev.bd_disk = &disk;
+	bdev.bd_queue = &queue;
+	disk.queue = &queue;
+
+	dd_init_sched(&queue, &mq_deadline);
+	dd_prepare_request(&rq1);
+	dd_prepare_request(&rq2);
+	dd_prepare_request(&rq3);
+	list_add_tail(&rq1.queuelist, &rq_list);
+	list_add_tail(&rq2.queuelist, &rq_list);
+	list_add_tail(&rq3.queuelist, &rq_list);
+	dd_insert_requests(&hctx, &rq_list, false);
+	rq = dd_dispatch_request(&hctx);
+	KUNIT_EXPECT_PTR_EQ(test, rq, &rq3);
+	dd_finish_request(rq);
+	rq = dd_dispatch_request(&hctx);
+	KUNIT_EXPECT_PTR_EQ(test, rq, &rq2);
+	dd_finish_request(rq);
+	rq = dd_dispatch_request(&hctx);
+	KUNIT_EXPECT_PTR_EQ(test, rq, &rq1);
+	dd_finish_request(rq);
+	dd_exit_sched(queue.elevator);
+}
+
+/*
+ * Test that the write order is preserved if a higher I/O priority is assigned
+ * to higher LBAs. This test fails if dd_zone_prio() always returns
+ * DD_INVALID_PRIO.
+ */
+static void test_zone_prio(struct kunit *test)
+{
+	static struct block_device bdev;
+	static unsigned long seq_zones_wlock[1];
+	static struct gendisk disk = { .conv_zones_bitmap = NULL,
+				       .seq_zones_wlock = seq_zones_wlock,
+				       .part0 = &bdev };
+	static struct request_queue queue = {
+		.disk = &disk,
+		.limits = { .zoned = BLK_ZONED_HM, .chunk_sectors = 16 }
+	};
+	static struct blk_mq_hw_ctx hctx = { .queue = &queue };
+	static struct bio bio1 = { .bi_bdev = &bdev,
+				   .bi_opf = REQ_OP_WRITE,
+				   .bi_ioprio = IOPRIO_CLASS_IDLE
+						<< IOPRIO_CLASS_SHIFT };
+	static struct request rq1 = { .q = &queue,
+				      .cmd_flags = REQ_OP_WRITE,
+				      .__sector = 1,
+				      .__data_len = 1,
+				      .bio = &bio1,
+				      .mq_hctx = &hctx,
+				      .ioprio = IOPRIO_CLASS_IDLE
+						<< IOPRIO_CLASS_SHIFT };
+	static struct bio bio2 = { .bi_bdev = &bdev,
+				   .bi_opf = REQ_OP_WRITE,
+				   .bi_ioprio = IOPRIO_CLASS_BE
+						<< IOPRIO_CLASS_SHIFT };
+	static struct request rq2 = { .q = &queue,
+				      .cmd_flags = REQ_OP_WRITE,
+				      .__sector = 3,
+				      .__data_len = 1,
+				      .bio = &bio2,
+				      .mq_hctx = &hctx,
+				      .ioprio = IOPRIO_CLASS_BE
+						<< IOPRIO_CLASS_SHIFT };
+	static struct bio bio3 = { .bi_bdev = &bdev,
+				   .bi_opf = REQ_OP_WRITE,
+				   .bi_ioprio = IOPRIO_CLASS_RT
+						<< IOPRIO_CLASS_SHIFT };
+	static struct request rq3 = { .q = &queue,
+				      .cmd_flags = REQ_OP_WRITE,
+				      .__sector = 5,
+				      .__data_len = 1,
+				      .bio = &bio3,
+				      .mq_hctx = &hctx,
+				      .ioprio = IOPRIO_CLASS_RT
+						<< IOPRIO_CLASS_SHIFT };
+	struct request *rq;
+	static LIST_HEAD(rq_list);
+
+	bdev.bd_disk = &disk;
+	bdev.bd_queue = &queue;
+	disk.queue = &queue;
+
+	KUNIT_EXPECT_TRUE(test, blk_rq_is_seq_zoned_write(&rq1));
+	KUNIT_EXPECT_TRUE(test, blk_rq_is_seq_zoned_write(&rq2));
+	KUNIT_EXPECT_TRUE(test, blk_rq_is_seq_zoned_write(&rq3));
+
+	dd_init_sched(&queue, &mq_deadline);
+	dd_prepare_request(&rq1);
+	dd_prepare_request(&rq2);
+	dd_prepare_request(&rq3);
+	list_add_tail(&rq1.queuelist, &rq_list);
+	list_add_tail(&rq2.queuelist, &rq_list);
+	list_add_tail(&rq3.queuelist, &rq_list);
+	dd_insert_requests(&hctx, &rq_list, false);
+	rq = dd_dispatch_request(&hctx);
+	KUNIT_EXPECT_PTR_EQ(test, rq, &rq1);
+	dd_finish_request(rq);
+	rq = dd_dispatch_request(&hctx);
+	KUNIT_EXPECT_PTR_EQ(test, rq, &rq2);
+	dd_finish_request(rq);
+	rq = dd_dispatch_request(&hctx);
+	KUNIT_EXPECT_PTR_EQ(test, rq, &rq3);
+	dd_finish_request(rq);
+	dd_exit_sched(queue.elevator);
+}
+
+static struct kunit_case mq_deadline_test_cases[] = {
+	KUNIT_CASE(test_ioprio),
+	KUNIT_CASE(test_zone_prio),
+	{}
+};
+
+static struct kunit_suite mq_deadline_test_suite = {
+	.name = "mq-deadline",
+	.test_cases = mq_deadline_test_cases,
+};
+kunit_test_suite(mq_deadline_test_suite);
+
+MODULE_DESCRIPTION("mq-deadline unit tests");
+MODULE_AUTHOR("Bart Van Assche");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 1ab3081c82ed..e7fa81170b7c 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -1149,6 +1149,18 @@  static inline unsigned int blk_rq_zone_no(struct request *rq)
 	return disk_zone_no(rq->q->disk, blk_rq_pos(rq));
 }
 
+/**
+ * blk_bio_is_seq_zoned_write() - Check if @bio requires write serialization.
+ * @bio: Bio to examine.
+ *
+ * Note: REQ_OP_ZONE_APPEND bios do not require serialization.
+ */
+static inline bool blk_bio_is_seq_zoned_write(struct bio *bio)
+{
+	return disk_zone_is_seq(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector) &&
+		op_needs_zoned_write_locking(bio_op(bio));
+}
+
 static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
 {
 	return disk_zone_is_seq(rq->q->disk, blk_rq_pos(rq));
@@ -1196,6 +1208,11 @@  static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
 	return !blk_req_zone_is_write_locked(rq);
 }
 #else /* CONFIG_BLK_DEV_ZONED */
+static inline bool blk_bio_is_seq_zoned_write(struct bio *bio)
+{
+	return false;
+}
+
 static inline bool blk_rq_is_seq_zoned_write(struct request *rq)
 {
 	return false;