diff mbox series

[3/3] block/mq-deadline: Disable I/O prioritization in certain cases

Message ID 20231205053213.522772-4-bvanassche@acm.org (mailing list archive)
State New, archived
Headers show
Series Improve mq-deadline I/O priority support | expand

Commit Message

Bart Van Assche Dec. 5, 2023, 5:32 a.m. UTC
Fix the following two issues:
- Even with prio_aging_expire set to zero, I/O priorities still affect the
  request order.
- Assigning I/O priorities with the ioprio cgroup policy breaks zoned
  storage support in the mq-deadline scheduler.

This patch fixes both issues by disabling I/O prioritization for these
two cases.

Cc: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/mq-deadline.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

Comments

Damien Le Moal Dec. 6, 2023, 2:42 a.m. UTC | #1
On 12/5/23 14:32, Bart Van Assche wrote:
> Fix the following two issues:
> - Even with prio_aging_expire set to zero, I/O priorities still affect the
>   request order.
> - Assigning I/O priorities with the ioprio cgroup policy breaks zoned
>   storage support in the mq-deadline scheduler.

Can you provide more details ? E.g. an example of a setup that breaks ordering ?

> This patch fixes both issues by disabling I/O prioritization for these
> two cases.

... when prio_aging_expire is set to 0. Right ? Otherwise, the sentence above
reads as if you are disabling IO priority for zoned devices...

Also,

> 
> Cc: Damien Le Moal <dlemoal@kernel.org>
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
>  block/mq-deadline.c | 22 ++++++++++++----------
>  1 file changed, 12 insertions(+), 10 deletions(-)
> 
> diff --git a/block/mq-deadline.c b/block/mq-deadline.c
> index fe5da2ade953..6781cef0109e 100644
> --- a/block/mq-deadline.c
> +++ b/block/mq-deadline.c
> @@ -123,14 +123,16 @@ deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq)
>   * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a
>   * request.
>   */
> -static u8 dd_rq_ioclass(struct request *rq)
> +static u8 dd_rq_ioclass(struct deadline_data *dd, struct request *rq)
>  {
> -	return IOPRIO_PRIO_CLASS(req_get_ioprio(rq));
> +	return dd->prio_aging_expire ? IOPRIO_PRIO_CLASS(req_get_ioprio(rq)) :
> +				       IOPRIO_CLASS_NONE;

I personally would prefer the simpler:

	if (!dd->prio_aging_expire)
		return IOPRIO_CLASS_NONE;
	return IOPRIO_PRIO_CLASS(req_get_ioprio(rq));

>  }
>  
> -static u8 dd_bio_ioclass(struct bio *bio)
> +static u8 dd_bio_ioclass(struct deadline_data *dd, struct bio *bio)
>  {
> -	return IOPRIO_PRIO_CLASS(bio->bi_ioprio);
> +	return dd->prio_aging_expire ? IOPRIO_PRIO_CLASS(bio->bi_ioprio) :
> +				       IOPRIO_CLASS_NONE;

Same comment as above.

>  }
>  
>  /*
> @@ -233,7 +235,7 @@ static void dd_request_merged(struct request_queue *q, struct request *req,
>  			      enum elv_merge type)
>  {
>  	struct deadline_data *dd = q->elevator->elevator_data;
> -	const u8 ioprio_class = dd_rq_ioclass(req);
> +	const u8 ioprio_class = dd_rq_ioclass(dd, req);
>  	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
>  	struct dd_per_prio *per_prio = &dd->per_prio[prio];
>  
> @@ -253,7 +255,7 @@ static void dd_merged_requests(struct request_queue *q, struct request *req,
>  			       struct request *next)
>  {
>  	struct deadline_data *dd = q->elevator->elevator_data;
> -	const u8 ioprio_class = dd_rq_ioclass(next);
> +	const u8 ioprio_class = dd_rq_ioclass(dd, next);
>  	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
>  
>  	lockdep_assert_held(&dd->lock);
> @@ -550,7 +552,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
>  	dd->batching++;
>  	deadline_move_request(dd, per_prio, rq);
>  done:
> -	ioprio_class = dd_rq_ioclass(rq);
> +	ioprio_class = dd_rq_ioclass(dd, rq);
>  	prio = ioprio_class_to_prio[ioprio_class];
>  	dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq);
>  	dd->per_prio[prio].stats.dispatched++;
> @@ -749,7 +751,7 @@ static int dd_request_merge(struct request_queue *q, struct request **rq,
>  			    struct bio *bio)
>  {
>  	struct deadline_data *dd = q->elevator->elevator_data;
> -	const u8 ioprio_class = dd_bio_ioclass(bio);
> +	const u8 ioprio_class = dd_bio_ioclass(dd, bio);
>  	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
>  	struct dd_per_prio *per_prio = &dd->per_prio[prio];
>  	sector_t sector = bio_end_sector(bio);
> @@ -814,7 +816,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
>  	 */
>  	blk_req_zone_write_unlock(rq);
>  
> -	prio = ioprio_class_to_prio[dd_rq_ioclass(rq)];
> +	prio = ioprio_class_to_prio[dd_rq_ioclass(dd, rq)];
>  	per_prio = &dd->per_prio[prio];
>  	if (!rq->elv.priv[0]) {
>  		per_prio->stats.inserted++;
> @@ -923,7 +925,7 @@ static void dd_finish_request(struct request *rq)
>  {
>  	struct request_queue *q = rq->q;
>  	struct deadline_data *dd = q->elevator->elevator_data;
> -	const u8 ioprio_class = dd_rq_ioclass(rq);
> +	const u8 ioprio_class = dd_rq_ioclass(dd, rq);
>  	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
>  	struct dd_per_prio *per_prio = &dd->per_prio[prio];

What about the call to dd_dispatch_prio_aged_requests() in
dd_dispatch_request() ? Shouldn't that call be skipped if prio_aging_expire is 0 ?

>
Bart Van Assche Dec. 6, 2023, 3:24 a.m. UTC | #2
On 12/5/23 16:42, Damien Le Moal wrote:
> On 12/5/23 14:32, Bart Van Assche wrote:
>> Fix the following two issues:
>> - Even with prio_aging_expire set to zero, I/O priorities still affect the
>>    request order.
>> - Assigning I/O priorities with the ioprio cgroup policy breaks zoned
>>    storage support in the mq-deadline scheduler.
> 
> Can you provide more details ? E.g. an example of a setup that breaks ordering ?

Regarding prio_aging_expire that is set to zero: a third party Android
developer reported that request prioritization in
mq-deadline is not compatible with MMC devices
(https://android-review.googlesource.com/c/kernel/common/+/2836235).

Regarding zoned storage and I/O priorities: we are working on modifying
Android such that foreground apps have a higher I/O priority than
background apps. As one can see in
https://android-review.googlesource.com/c/platform/system/core/+/2768906, 
we plan to use the ioprio cgroup policy to implement this. We noticed 
that this CL breaks zoned storage support if the mq-deadline I/O 
scheduler has been selected. I think that this should be solved in the
mq-deadline I/O
scheduler and also that ignoring I/O priorities for zoned writes that
must be submitted in order is a good solution. Respecting the I/O
priority for writes would require to make significant changes in the
layer that submits the zoned writes (F2FS): it would require that that
layer is modified such that it adds writes from foreground apps to the
log before those of background apps.

>> -static u8 dd_rq_ioclass(struct request *rq)
>> +static u8 dd_rq_ioclass(struct deadline_data *dd, struct request *rq)
>>   {
>> -	return IOPRIO_PRIO_CLASS(req_get_ioprio(rq));
>> +	return dd->prio_aging_expire ? IOPRIO_PRIO_CLASS(req_get_ioprio(rq)) :
>> +				       IOPRIO_CLASS_NONE;
> 
> I personally would prefer the simpler:
> 
> 	if (!dd->prio_aging_expire)
> 		return IOPRIO_CLASS_NONE;
> 	return IOPRIO_PRIO_CLASS(req_get_ioprio(rq));
> 
>>   }
>>   
>> -static u8 dd_bio_ioclass(struct bio *bio)
>> +static u8 dd_bio_ioclass(struct deadline_data *dd, struct bio *bio)
>>   {
>> -	return IOPRIO_PRIO_CLASS(bio->bi_ioprio);
>> +	return dd->prio_aging_expire ? IOPRIO_PRIO_CLASS(bio->bi_ioprio) :
>> +				       IOPRIO_CLASS_NONE;
> 
> Same comment as above.

I will make the proposed changes.
> What about the call to dd_dispatch_prio_aged_requests() in
> dd_dispatch_request() ? Shouldn't that call be skipped if prio_aging_expire is 0 ?

That sounds like a useful optimization to me. I will look into this.

Thanks,

Bart.
Bart Van Assche Dec. 8, 2023, 12:03 a.m. UTC | #3
On 12/5/23 16:42, Damien Le Moal wrote:
> ... when prio_aging_expire is set to 0. Right ? Otherwise, the sentence above
> reads as if you are disabling IO priority for zoned devices...

Hi Damien,

I just noticed that I posted an old version of this patch (3/3). In the patch
below I/O priorities are ignored for zoned writes.

Thanks,

Bart.


Subject: [PATCH] block/mq-deadline: Disable I/O prioritization in certain cases

Fix the following two issues:
- Even with prio_aging_expire set to zero, I/O priorities still affect the
   request order.
- Assigning I/O priorities with the ioprio cgroup policy breaks zoned
   storage support in the mq-deadline scheduler.

This patch fixes both issues by disabling I/O prioritization for these
two cases.

Cc: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
  block/mq-deadline.c | 37 ++++++++++++++++++++++++-------------
  1 file changed, 24 insertions(+), 13 deletions(-)

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index fe5da2ade953..310ff133ce20 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -119,18 +119,27 @@ deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq)
  	return &per_prio->sort_list[rq_data_dir(rq)];
  }

+static bool dd_use_io_priority(struct deadline_data *dd, enum req_op op)
+{
+	return dd->prio_aging_expire != 0 && !op_needs_zoned_write_locking(op);
+}
+
  /*
   * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a
   * request.
   */
-static u8 dd_rq_ioclass(struct request *rq)
+static u8 dd_rq_ioclass(struct deadline_data *dd, struct request *rq)
  {
-	return IOPRIO_PRIO_CLASS(req_get_ioprio(rq));
+	if (dd_use_io_priority(dd, req_op(rq)))
+		return IOPRIO_PRIO_CLASS(req_get_ioprio(rq));
+	return IOPRIO_CLASS_NONE;
  }

-static u8 dd_bio_ioclass(struct bio *bio)
+static u8 dd_bio_ioclass(struct deadline_data *dd, struct bio *bio)
  {
-	return IOPRIO_PRIO_CLASS(bio->bi_ioprio);
+	if (dd_use_io_priority(dd, bio_op(bio)))
+		return IOPRIO_PRIO_CLASS(bio->bi_ioprio);
+	return IOPRIO_CLASS_NONE;
  }

  /*
@@ -233,7 +242,7 @@ static void dd_request_merged(struct request_queue *q, struct request *req,
  			      enum elv_merge type)
  {
  	struct deadline_data *dd = q->elevator->elevator_data;
-	const u8 ioprio_class = dd_rq_ioclass(req);
+	const u8 ioprio_class = dd_rq_ioclass(dd, req);
  	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
  	struct dd_per_prio *per_prio = &dd->per_prio[prio];

@@ -253,7 +262,7 @@ static void dd_merged_requests(struct request_queue *q, struct request *req,
  			       struct request *next)
  {
  	struct deadline_data *dd = q->elevator->elevator_data;
-	const u8 ioprio_class = dd_rq_ioclass(next);
+	const u8 ioprio_class = dd_rq_ioclass(dd, next);
  	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];

  	lockdep_assert_held(&dd->lock);
@@ -550,7 +559,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
  	dd->batching++;
  	deadline_move_request(dd, per_prio, rq);
  done:
-	ioprio_class = dd_rq_ioclass(rq);
+	ioprio_class = dd_rq_ioclass(dd, rq);
  	prio = ioprio_class_to_prio[ioprio_class];
  	dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq);
  	dd->per_prio[prio].stats.dispatched++;
@@ -606,9 +615,11 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
  	enum dd_prio prio;

  	spin_lock(&dd->lock);
-	rq = dd_dispatch_prio_aged_requests(dd, now);
-	if (rq)
-		goto unlock;
+	if (dd->prio_aging_expire != 0) {
+		rq = dd_dispatch_prio_aged_requests(dd, now);
+		if (rq)
+			goto unlock;
+	}

  	/*
  	 * Next, dispatch requests in priority order. Ignore lower priority
@@ -749,7 +760,7 @@ static int dd_request_merge(struct request_queue *q, struct request **rq,
  			    struct bio *bio)
  {
  	struct deadline_data *dd = q->elevator->elevator_data;
-	const u8 ioprio_class = dd_bio_ioclass(bio);
+	const u8 ioprio_class = dd_bio_ioclass(dd, bio);
  	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
  	struct dd_per_prio *per_prio = &dd->per_prio[prio];
  	sector_t sector = bio_end_sector(bio);
@@ -814,7 +825,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
  	 */
  	blk_req_zone_write_unlock(rq);

-	prio = ioprio_class_to_prio[dd_rq_ioclass(rq)];
+	prio = ioprio_class_to_prio[dd_rq_ioclass(dd, rq)];
  	per_prio = &dd->per_prio[prio];
  	if (!rq->elv.priv[0]) {
  		per_prio->stats.inserted++;
@@ -923,7 +934,7 @@ static void dd_finish_request(struct request *rq)
  {
  	struct request_queue *q = rq->q;
  	struct deadline_data *dd = q->elevator->elevator_data;
-	const u8 ioprio_class = dd_rq_ioclass(rq);
+	const u8 ioprio_class = dd_rq_ioclass(dd, rq);
  	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
  	struct dd_per_prio *per_prio = &dd->per_prio[prio];
Damien Le Moal Dec. 8, 2023, 3:37 a.m. UTC | #4
On 12/8/23 09:03, Bart Van Assche wrote:
> On 12/5/23 16:42, Damien Le Moal wrote:
>> ... when prio_aging_expire is set to 0. Right ? Otherwise, the sentence above
>> reads as if you are disabling IO priority for zoned devices...
> 
> Hi Damien,
> 
> I just noticed that I posted an old version of this patch (3/3). In the patch
> below I/O priorities are ignored for zoned writes.
> 
> Thanks,
> 
> Bart.
> 
> 
> Subject: [PATCH] block/mq-deadline: Disable I/O prioritization in certain cases
> 
> Fix the following two issues:
> - Even with prio_aging_expire set to zero, I/O priorities still affect the
>    request order.
> - Assigning I/O priorities with the ioprio cgroup policy breaks zoned
>    storage support in the mq-deadline scheduler.
> 
> This patch fixes both issues by disabling I/O prioritization for these
> two cases.
> 
> Cc: Damien Le Moal <dlemoal@kernel.org>
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
>   block/mq-deadline.c | 37 ++++++++++++++++++++++++-------------
>   1 file changed, 24 insertions(+), 13 deletions(-)
> 
> diff --git a/block/mq-deadline.c b/block/mq-deadline.c
> index fe5da2ade953..310ff133ce20 100644
> --- a/block/mq-deadline.c
> +++ b/block/mq-deadline.c
> @@ -119,18 +119,27 @@ deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq)
>   	return &per_prio->sort_list[rq_data_dir(rq)];
>   }
> 
> +static bool dd_use_io_priority(struct deadline_data *dd, enum req_op op)
> +{
> +	return dd->prio_aging_expire != 0 && !op_needs_zoned_write_locking(op);
> +}

Hard NACK on this. The reason is that this will disable IO priority also for
sensible use cases that use libaio/io_uring with direct IOs, with an application
that does the right thing for writes, namely assigning the same priority for all
writes to a zone. There are some use cases like this in production.

I do understand that there is a problem when IO priorities come from cgroups and
the user go through a file system. But that should be handled by the file
system. That is, for f2fs, all writes going to the same zone should have the
same priority. Otherwise, priority inversion issues will lead to non sequential
write patterns.

Ideally, we should indeed have a generic solution for the cgroup case. But it
seems that for now, the simplest thing to do is to not allow priorities through
cgroups for writes to zoned devices, unless cgroups is made more intellignet
about it and manage bio priorities per zone to avoid priority inversion within a
zone.

> +
>   /*
>    * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a
>    * request.
>    */
> -static u8 dd_rq_ioclass(struct request *rq)
> +static u8 dd_rq_ioclass(struct deadline_data *dd, struct request *rq)
>   {
> -	return IOPRIO_PRIO_CLASS(req_get_ioprio(rq));
> +	if (dd_use_io_priority(dd, req_op(rq)))
> +		return IOPRIO_PRIO_CLASS(req_get_ioprio(rq));
> +	return IOPRIO_CLASS_NONE;
>   }
> 
> -static u8 dd_bio_ioclass(struct bio *bio)
> +static u8 dd_bio_ioclass(struct deadline_data *dd, struct bio *bio)
>   {
> -	return IOPRIO_PRIO_CLASS(bio->bi_ioprio);
> +	if (dd_use_io_priority(dd, bio_op(bio)))
> +		return IOPRIO_PRIO_CLASS(bio->bi_ioprio);
> +	return IOPRIO_CLASS_NONE;
>   }
> 
>   /*
> @@ -233,7 +242,7 @@ static void dd_request_merged(struct request_queue *q, struct request *req,
>   			      enum elv_merge type)
>   {
>   	struct deadline_data *dd = q->elevator->elevator_data;
> -	const u8 ioprio_class = dd_rq_ioclass(req);
> +	const u8 ioprio_class = dd_rq_ioclass(dd, req);
>   	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
>   	struct dd_per_prio *per_prio = &dd->per_prio[prio];
> 
> @@ -253,7 +262,7 @@ static void dd_merged_requests(struct request_queue *q, struct request *req,
>   			       struct request *next)
>   {
>   	struct deadline_data *dd = q->elevator->elevator_data;
> -	const u8 ioprio_class = dd_rq_ioclass(next);
> +	const u8 ioprio_class = dd_rq_ioclass(dd, next);
>   	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
> 
>   	lockdep_assert_held(&dd->lock);
> @@ -550,7 +559,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
>   	dd->batching++;
>   	deadline_move_request(dd, per_prio, rq);
>   done:
> -	ioprio_class = dd_rq_ioclass(rq);
> +	ioprio_class = dd_rq_ioclass(dd, rq);
>   	prio = ioprio_class_to_prio[ioprio_class];
>   	dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq);
>   	dd->per_prio[prio].stats.dispatched++;
> @@ -606,9 +615,11 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
>   	enum dd_prio prio;
> 
>   	spin_lock(&dd->lock);
> -	rq = dd_dispatch_prio_aged_requests(dd, now);
> -	if (rq)
> -		goto unlock;
> +	if (dd->prio_aging_expire != 0) {
> +		rq = dd_dispatch_prio_aged_requests(dd, now);
> +		if (rq)
> +			goto unlock;
> +	}
> 
>   	/*
>   	 * Next, dispatch requests in priority order. Ignore lower priority
> @@ -749,7 +760,7 @@ static int dd_request_merge(struct request_queue *q, struct request **rq,
>   			    struct bio *bio)
>   {
>   	struct deadline_data *dd = q->elevator->elevator_data;
> -	const u8 ioprio_class = dd_bio_ioclass(bio);
> +	const u8 ioprio_class = dd_bio_ioclass(dd, bio);
>   	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
>   	struct dd_per_prio *per_prio = &dd->per_prio[prio];
>   	sector_t sector = bio_end_sector(bio);
> @@ -814,7 +825,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
>   	 */
>   	blk_req_zone_write_unlock(rq);
> 
> -	prio = ioprio_class_to_prio[dd_rq_ioclass(rq)];
> +	prio = ioprio_class_to_prio[dd_rq_ioclass(dd, rq)];
>   	per_prio = &dd->per_prio[prio];
>   	if (!rq->elv.priv[0]) {
>   		per_prio->stats.inserted++;
> @@ -923,7 +934,7 @@ static void dd_finish_request(struct request *rq)
>   {
>   	struct request_queue *q = rq->q;
>   	struct deadline_data *dd = q->elevator->elevator_data;
> -	const u8 ioprio_class = dd_rq_ioclass(rq);
> +	const u8 ioprio_class = dd_rq_ioclass(dd, rq);
>   	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
>   	struct dd_per_prio *per_prio = &dd->per_prio[prio];
> 
>
Bart Van Assche Dec. 8, 2023, 6:40 p.m. UTC | #5
On 12/7/23 17:37, Damien Le Moal wrote:
> On 12/8/23 09:03, Bart Van Assche wrote:
>> +static bool dd_use_io_priority(struct deadline_data *dd, enum req_op op)
>> +{
>> +	return dd->prio_aging_expire != 0 && !op_needs_zoned_write_locking(op);
>> +}
> 
> Hard NACK on this. The reason is that this will disable IO priority also for
> sensible use cases that use libaio/io_uring with direct IOs, with an application
> that does the right thing for writes, namely assigning the same priority for all
> writes to a zone. There are some use cases like this in production.
> 
> I do understand that there is a problem when IO priorities come from cgroups and
> the user go through a file system. But that should be handled by the file
> system. That is, for f2fs, all writes going to the same zone should have the
> same priority. Otherwise, priority inversion issues will lead to non sequential
> write patterns.
> 
> Ideally, we should indeed have a generic solution for the cgroup case. But it
> seems that for now, the simplest thing to do is to not allow priorities through
> cgroups for writes to zoned devices, unless cgroups is made more intellignet
> about it and manage bio priorities per zone to avoid priority inversion within a
> zone.

Hi Damien,

My understanding is that blkcg_set_ioprio() is called from inside submit_bio()
and hence that the reported issue cannot be solved by modifying F2FS. How about
modifying the blk-ioprio policy such that it ignores zoned writes?

Thanks,

Bart.
Damien Le Moal Dec. 11, 2023, 7:40 a.m. UTC | #6
On 12/9/23 03:40, Bart Van Assche wrote:
> On 12/7/23 17:37, Damien Le Moal wrote:
>> On 12/8/23 09:03, Bart Van Assche wrote:
>>> +static bool dd_use_io_priority(struct deadline_data *dd, enum req_op op)
>>> +{
>>> +	return dd->prio_aging_expire != 0 && !op_needs_zoned_write_locking(op);
>>> +}
>>
>> Hard NACK on this. The reason is that this will disable IO priority also for
>> sensible use cases that use libaio/io_uring with direct IOs, with an application
>> that does the right thing for writes, namely assigning the same priority for all
>> writes to a zone. There are some use cases like this in production.
>>
>> I do understand that there is a problem when IO priorities come from cgroups and
>> the user go through a file system. But that should be handled by the file
>> system. That is, for f2fs, all writes going to the same zone should have the
>> same priority. Otherwise, priority inversion issues will lead to non sequential
>> write patterns.
>>
>> Ideally, we should indeed have a generic solution for the cgroup case. But it
>> seems that for now, the simplest thing to do is to not allow priorities through
>> cgroups for writes to zoned devices, unless cgroups is made more intellignet
>> about it and manage bio priorities per zone to avoid priority inversion within a
>> zone.
> 
> Hi Damien,
> 
> My understanding is that blkcg_set_ioprio() is called from inside submit_bio()
> and hence that the reported issue cannot be solved by modifying F2FS. How about
> modifying the blk-ioprio policy such that it ignores zoned writes?

I do not see a better solution than that at the moment. So yes, let's do that.
But please add a big comment in the code explaining why we ignore zoned writes.
Christoph Hellwig Dec. 11, 2023, 4:57 p.m. UTC | #7
On Mon, Dec 04, 2023 at 09:32:13PM -0800, Bart Van Assche wrote:
> Fix the following two issues:
> - Even with prio_aging_expire set to zero, I/O priorities still affect the
>   request order.
> - Assigning I/O priorities with the ioprio cgroup policy breaks zoned
>   storage support in the mq-deadline scheduler.

Not it doesn't, how would it?  Or do you mean your f2fs hacks where you
assume there is some order kept?  You really need to get rid of them
and make sure f2fs doesn't care about reordering by writing the
metadata that records the data location only at I/O completion time.
Not only does that make zoned I/O trivially right, it also fixes the
stale data exposures you are almost guaranteed to have even on
conventional devices without that.
Bart Van Assche Dec. 11, 2023, 5:20 p.m. UTC | #8
On 12/11/23 08:57, Christoph Hellwig wrote:
> Or do you mean your f2fs hacks where you assume there is some order
> kept?

Hi Christoph,

It seems like there is a misunderstanding. The issue addressed by this
patch series is not F2FS-specific. The issue addressed by this patch
series can be encountered by any software that submits REQ_OP_WRITE
operations.

Bart.
Damien Le Moal Dec. 11, 2023, 10:40 p.m. UTC | #9
On 12/12/23 01:57, Christoph Hellwig wrote:
> On Mon, Dec 04, 2023 at 09:32:13PM -0800, Bart Van Assche wrote:
>> Fix the following two issues:
>> - Even with prio_aging_expire set to zero, I/O priorities still affect the
>>   request order.
>> - Assigning I/O priorities with the ioprio cgroup policy breaks zoned
>>   storage support in the mq-deadline scheduler.
> 
> Not it doesn't, how would it?  Or do you mean your f2fs hacks where you
> assume there is some order kept?  You really need to get rid of them
> and make sure f2fs doesn't care about reordering by writing the
> metadata that records the data location only at I/O completion time.
> Not only does that make zoned I/O trivially right, it also fixes the
> stale data exposures you are almost guaranteed to have even on
> conventional devices without that.

Priority CGroups can mess things up I think. If you have 2 processes belonging
to different CGs with different priorities and:
1) The processes do raw block device accesses and write to the same zone,
synchronized to get the WP correctly
2) The processes are writing different files and the FS decides to place the
block for the files in the same zone

Case (1) is clearly "the user is doing very stupid things" and for that case,
the user definitely deserve seeing his writes failing. But case (2) is perfectly
legit I think. That is the one that needs to be addressed. The choices I see
are: every file system supporting zone writes need to be priority CG aware when
writing files, or we ignore priority CG when writing.

The latter is I think better than the former as CGs can change without the FS
being aware (as far as I know), and such support would need to be implemented
for all FSes that support zone writing using regular writes (f2fs and zonefs).
Christoph Hellwig Dec. 12, 2023, 3:40 p.m. UTC | #10
On Mon, Dec 11, 2023 at 09:20:00AM -0800, Bart Van Assche wrote:
> It seems like there is a misunderstanding. The issue addressed by this
> patch series is not F2FS-specific. The issue addressed by this patch
> series can be encountered by any software that submits REQ_OP_WRITE
> operations.

How so?
Christoph Hellwig Dec. 12, 2023, 3:41 p.m. UTC | #11
On Tue, Dec 12, 2023 at 07:40:02AM +0900, Damien Le Moal wrote:
> The latter is I think better than the former as CGs can change without the FS
> being aware (as far as I know), and such support would need to be implemented
> for all FSes that support zone writing using regular writes (f2fs and zonefs).

How is cse 2 any kind of problem when using the proper append model?
Except when blk-zoned delays it so much that we really need to close
the zone becaue it's timing out, but that would really be configuration
issue.
Bart Van Assche Dec. 12, 2023, 5:15 p.m. UTC | #12
On 12/12/23 07:41, Christoph Hellwig wrote:
> On Tue, Dec 12, 2023 at 07:40:02AM +0900, Damien Le Moal wrote:
>> The latter is I think better than the former as CGs can change without the FS
>> being aware (as far as I know), and such support would need to be implemented
>> for all FSes that support zone writing using regular writes (f2fs and zonefs).
> 
> How is cse 2 any kind of problem when using the proper append model?
> Except when blk-zoned delays it so much that we really need to close
> the zone becaue it's timing out, but that would really be configuration
> issue.

As mentioned before, UFS devices implement the SCSI command set and hence do not
support write append operations. If anyone else standardizes a write append
command for SCSI we can ask UFS vendors to implement that command. However, as
far as I know nobody in T10 is working on standardizing a write append command.

Bart.
Christoph Hellwig Dec. 12, 2023, 5:18 p.m. UTC | #13
On Tue, Dec 12, 2023 at 09:15:48AM -0800, Bart Van Assche wrote:
> As mentioned before, UFS devices implement the SCSI command set and hence do not
> support write append operations. If anyone else standardizes a write append
> command for SCSI we can ask UFS vendors to implement that command. However, as
> far as I know nobody in T10 is working on standardizing a write append command.

The actual hardware support does not matter, it's the programming model.
Even with the zone append emulation in sd you don't need to care about
reordering due to I/O priorities.
Bart Van Assche Dec. 12, 2023, 5:42 p.m. UTC | #14
On 12/12/23 09:18, Christoph Hellwig wrote:
> On Tue, Dec 12, 2023 at 09:15:48AM -0800, Bart Van Assche wrote:
>> As mentioned before, UFS devices implement the SCSI command set and hence do not
>> support write append operations. If anyone else standardizes a write append
>> command for SCSI we can ask UFS vendors to implement that command. However, as
>> far as I know nobody in T10 is working on standardizing a write append command.
> 
> The actual hardware support does not matter, it's the programming model.
> Even with the zone append emulation in sd you don't need to care about
> reordering due to I/O priorities.

The actual hardware support does matter. Using the zone append emulation from
drivers/scsi/sd_zbc.c (or any other zone append emulation) is not an option for
high performance devices. This is because emulating zone append can only be done
by serializing write operations. Any such serialization negatively affects
performance.

Thanks,

Bart.
Christoph Hellwig Dec. 12, 2023, 5:48 p.m. UTC | #15
On Tue, Dec 12, 2023 at 09:42:24AM -0800, Bart Van Assche wrote:
> drivers/scsi/sd_zbc.c (or any other zone append emulation) is not an option for
> high performance devices. This is because emulating zone append can only be done
> by serializing write operations. Any such serialization negatively affects
> performance.

Seriously Bart, we've been through this a few times.  You keep insisting
on using a broken model despite having a major influence on the standards.
Go and fix them, and I bet you'll actually have plenty of support.  And
stop pushing hacks into the block layer and SCSI code to work around your
broken file system code and lack of interest in actually making the
hardware interface sane.

This isn't going to get you anywhere, while you're wasting a lot of
peoples time.
Bart Van Assche Dec. 12, 2023, 6:09 p.m. UTC | #16
On 12/12/23 09:48, Christoph Hellwig wrote:
> You keep insisting
> on using a broken model despite having a major influence on the standards.
> Go and fix them, and I bet you'll actually have plenty of support.

Hi Christoph,

I do *not* appreciate what you wrote. You shouldn't tell me what I should do
with regard to standardization. If you want standards to be changed, please
change these yourself.

Bart.
Christoph Hellwig Dec. 12, 2023, 6:13 p.m. UTC | #17
On Tue, Dec 12, 2023 at 10:09:49AM -0800, Bart Van Assche wrote:
>
> On 12/12/23 09:48, Christoph Hellwig wrote:
>> You keep insisting
>> on using a broken model despite having a major influence on the standards.
>> Go and fix them, and I bet you'll actually have plenty of support.
>
> Hi Christoph,
>
> I do *not* appreciate what you wrote. You shouldn't tell me what I should do
> with regard to standardization. If you want standards to be changed, please
> change these yourself.

Bart,

I don't appreciate all the work you create by insisting on a fundamentally
broken model either.  If you want zoned devices to work you need something
like zone append, and your insistence on using broken methods is not
helpful.  So if you don't want to change the standard to actually work
for your use case at least don't waste *our* time trying to work around
it badly.


>
> Bart.
---end quoted text---
Bart Van Assche Dec. 12, 2023, 6:19 p.m. UTC | #18
On 12/12/23 10:13, Christoph Hellwig wrote:
> I don't appreciate all the work you create by insisting on a fundamentally
> broken model either.  If you want zoned devices to work you need something
> like zone append, and your insistence on using broken methods is not
> helpful.  So if you don't want to change the standard to actually work
> for your use case at least don't waste *our* time trying to work around
> it badly.

Hi Christoph,

"Fundamentally broken model" is your personal opinion. I don't know anyone
else than you who considers zoned writes as a broken model.

Bart.
Christoph Hellwig Dec. 12, 2023, 6:26 p.m. UTC | #19
On Tue, Dec 12, 2023 at 10:19:31AM -0800, Bart Van Assche wrote:
> "Fundamentally broken model" is your personal opinion. I don't know anyone
> else than you who considers zoned writes as a broken model.

No Bart, it is not.  Talk to Damien, talk to Martin, to Jens.  Or just
look at all the patches you're sending to the list that play a never
ending hac-a-mole trying to bandaid over reordering that should be
perfectly fine.  You're playing a long term losing game by trying to
prevent reordering that you can't win.
Jaegeuk Kim Dec. 12, 2023, 7:03 p.m. UTC | #20
On 12/12, Christoph Hellwig wrote:
> On Tue, Dec 12, 2023 at 10:19:31AM -0800, Bart Van Assche wrote:
> > "Fundamentally broken model" is your personal opinion. I don't know anyone
> > else than you who considers zoned writes as a broken model.
> 
> No Bart, it is not.  Talk to Damien, talk to Martin, to Jens.  Or just
> look at all the patches you're sending to the list that play a never
> ending hac-a-mole trying to bandaid over reordering that should be
> perfectly fine.  You're playing a long term losing game by trying to
> prevent reordering that you can't win.

As one of users of zoned devices, I disagree this is a broken model, but even
better than the zone append model. When considering the filesystem performance,
it is essential to place the data per file to get better bandwidth. And for
NAND-based storage, filesystem is the right place to deal with the more efficient
garbage collecion based on the known data locations. That's why all the flash
storage vendors adopted it in the JEDEC. Agreed that zone append is nice, but
IMO, it's not practical for production.
Bart Van Assche Dec. 12, 2023, 10:44 p.m. UTC | #21
On 12/10/23 23:40, Damien Le Moal wrote:
> On 12/9/23 03:40, Bart Van Assche wrote:
>> My understanding is that blkcg_set_ioprio() is called from inside submit_bio()
>> and hence that the reported issue cannot be solved by modifying F2FS. How about
>> modifying the blk-ioprio policy such that it ignores zoned writes?
> 
> I do not see a better solution than that at the moment. So yes, let's do that.
> But please add a big comment in the code explaining why we ignore zoned writes.

Hi Damien,

We tested a patch for the blk-ioprio cgroup policy that makes it skip zoned writes.
We noticed that such a patch is not sufficient to prevent unaligned write errors
because some tasks have been assigned an I/O priority via the ionice command
(ioprio_set() system call). I think it would be wrong to skip the assignment of an
I/O priority for zoned writes in all code that can set an I/O priority. Since the
root cause of this issue is the inability of the mq-deadline I/O scheduler to
preserve the order for zoned writes with different I/O priorities, I think this
issue should be fixed in the mq-deadline I/O scheduler.

Thanks,

Bart.
Damien Le Moal Dec. 12, 2023, 11:44 p.m. UTC | #22
On 12/13/23 04:03, Jaegeuk Kim wrote:
> On 12/12, Christoph Hellwig wrote:
>> On Tue, Dec 12, 2023 at 10:19:31AM -0800, Bart Van Assche wrote:
>>> "Fundamentally broken model" is your personal opinion. I don't know anyone
>>> else than you who considers zoned writes as a broken model.
>>
>> No Bart, it is not.  Talk to Damien, talk to Martin, to Jens.  Or just
>> look at all the patches you're sending to the list that play a never
>> ending hac-a-mole trying to bandaid over reordering that should be
>> perfectly fine.  You're playing a long term losing game by trying to
>> prevent reordering that you can't win.
> 
> As one of users of zoned devices, I disagree this is a broken model, but even
> better than the zone append model. When considering the filesystem performance,
> it is essential to place the data per file to get better bandwidth. And for
> NAND-based storage, filesystem is the right place to deal with the more efficient
> garbage collecion based on the known data locations. That's why all the flash
> storage vendors adopted it in the JEDEC. Agreed that zone append is nice, but
> IMO, it's not practical for production.

The work on btrfs is a counter argument to this statement. The initial zone
support based on regular writes was going nowhere as trying to maintain ordering
was too complex and/or too invasive. Using zone append for the data path solved
and simplified many things.

I do think that zone append has a narrower use case spectrum for applications
relying on the raw block device directly. But for file systems, it definitely is
an easier to use writing model for zoned storage.
Damien Le Moal Dec. 12, 2023, 11:52 p.m. UTC | #23
On 12/13/23 07:44, Bart Van Assche wrote:
> On 12/10/23 23:40, Damien Le Moal wrote:
>> On 12/9/23 03:40, Bart Van Assche wrote:
>>> My understanding is that blkcg_set_ioprio() is called from inside submit_bio()
>>> and hence that the reported issue cannot be solved by modifying F2FS. How about
>>> modifying the blk-ioprio policy such that it ignores zoned writes?
>>
>> I do not see a better solution than that at the moment. So yes, let's do that.
>> But please add a big comment in the code explaining why we ignore zoned writes.
> 
> Hi Damien,
> 
> We tested a patch for the blk-ioprio cgroup policy that makes it skip zoned writes.
> We noticed that such a patch is not sufficient to prevent unaligned write errors
> because some tasks have been assigned an I/O priority via the ionice command
> (ioprio_set() system call). I think it would be wrong to skip the assignment of an
> I/O priority for zoned writes in all code that can set an I/O priority. Since the
> root cause of this issue is the inability of the mq-deadline I/O scheduler to
> preserve the order for zoned writes with different I/O priorities, I think this
> issue should be fixed in the mq-deadline I/O scheduler.

Not necessarily. When the priority for an IO is set when a BIO is prepared, we
know where that priority come from:
1) The user kiocb through aio_reqprio
2) The process ionice context
3) priority cgroups

We can disable (2) and (3) and leave (1) as is.

Trying to solve this issue in mq-deadline would require keeping track of the io
priority used for a write request that is issued to a zone and use that same
priority for all following write requests for the same zone until there are no
writes pending for that zone. Otherwise, you will get the priority inversion
causing the reordering.

But I think that doing all this without also causing priority inversion for the
user, i.e. a high priority write request ends up waiting for a low priority one,
will be challenging, to say the least.
Bart Van Assche Dec. 13, 2023, 1:02 a.m. UTC | #24
On 12/12/23 13:52, Damien Le Moal wrote:
> Trying to solve this issue in mq-deadline would require keeping track of the io
> priority used for a write request that is issued to a zone and use that same
> priority for all following write requests for the same zone until there are no
> writes pending for that zone. Otherwise, you will get the priority inversion
> causing the reordering.
> 
> But I think that doing all this without also causing priority inversion for the
> user, i.e. a high priority write request ends up waiting for a low priority one,
> will be challenging, to say the least.

Hi Damien,

How about the following algorithm?
- If a zoned write refers to the start of a zone or no other writes for
   the same zone occur in the RB-tree, use the I/O priority of the zoned
   write.
- If another write for the same zone occurs in the RB-tree, use the I/O
   priority from that other write.

While this algorithm does not guarantee that all zoned writes for a 
single zone have the same I/O priority, it guarantees that the 
mq-deadline I/O scheduler won't submit zoned writes in the wrong order 
because of their I/O priority.

Thanks,

Bart.
Damien Le Moal Dec. 13, 2023, 5:29 a.m. UTC | #25
On 12/13/23 10:02, Bart Van Assche wrote:
> On 12/12/23 13:52, Damien Le Moal wrote:
>> Trying to solve this issue in mq-deadline would require keeping track of the io
>> priority used for a write request that is issued to a zone and use that same
>> priority for all following write requests for the same zone until there are no
>> writes pending for that zone. Otherwise, you will get the priority inversion
>> causing the reordering.
>>
>> But I think that doing all this without also causing priority inversion for the
>> user, i.e. a high priority write request ends up waiting for a low priority one,
>> will be challenging, to say the least.
> 
> Hi Damien,
> 
> How about the following algorithm?
> - If a zoned write refers to the start of a zone or no other writes for
>    the same zone occur in the RB-tree, use the I/O priority of the zoned
>    write.
> - If another write for the same zone occurs in the RB-tree, use the I/O
>    priority from that other write.
> 
> While this algorithm does not guarantee that all zoned writes for a 
> single zone have the same I/O priority, it guarantees that the 
> mq-deadline I/O scheduler won't submit zoned writes in the wrong order 
> because of their I/O priority.

I guess this should work.

> 
> Thanks,
> 
> Bart.
>
Christoph Hellwig Dec. 13, 2023, 3:56 p.m. UTC | #26
On Tue, Dec 12, 2023 at 11:03:06AM -0800, Jaegeuk Kim wrote:
> As one of users of zoned devices, I disagree this is a broken model,

So you think that chasing potential for reordering all over the I/O
stack in perpetualality, including obscure error handling paths and
disabling features intentended to throttle and delay I/O (like
ioprio and cgroups) is not a broken model?

> it is essential to place the data per file to get better bandwidth. And for
> NAND-based storage, filesystem is the right place to deal with the more efficient
> garbage collecion based on the known data locations.

And that works perfectly fine match for zone append.

> That's why all the flash
> storage vendors adopted it in the JEDEC.

Everyone sucking up to Google to place their product in Android, yes..


> Agreed that zone append is nice, but
> IMO, it's not practical for production.

You've delivered exactly zero arguments for that.
Jaegeuk Kim Dec. 13, 2023, 4:41 p.m. UTC | #27
On 12/13, Christoph Hellwig wrote:
> On Tue, Dec 12, 2023 at 11:03:06AM -0800, Jaegeuk Kim wrote:
> > As one of users of zoned devices, I disagree this is a broken model,
> 
> So you think that chasing potential for reordering all over the I/O
> stack in perpetualality, including obscure error handling paths and
> disabling features intentended to throttle and delay I/O (like
> ioprio and cgroups) is not a broken model?

As of now, we don't see any reordering issue except this. I don't have any
concern to keep the same ioprio on writes, since handheld devices are mostly
sensitive to reads. So, if you have other use-cases using zoned writes which
require different ioprio on writes, I think you can suggest a knob to control
it by users.

> 
> > it is essential to place the data per file to get better bandwidth. And for
> > NAND-based storage, filesystem is the right place to deal with the more efficient
> > garbage collecion based on the known data locations.
> 
> And that works perfectly fine match for zone append.

How that works, if the device gives random LBAs back to the adjacent data in
a file? And, how to make the LBAs into the sequential ones back?

> 
> > That's why all the flash
> > storage vendors adopted it in the JEDEC.
> 
> Everyone sucking up to Google to place their product in Android, yes..

Sorry, I needed to stop reading here, as you're totally biased. This is not
the case in JEDEC, as Bart spent multiple years to synchronize the technical
benefitcs that we've seen across UFS vendors as well as OEMs.

> 
> 
> > Agreed that zone append is nice, but
> > IMO, it's not practical for production.
> 
> You've delivered exactly zero arguments for that.
Jaegeuk Kim Dec. 13, 2023, 4:49 p.m. UTC | #28
On 12/13, Damien Le Moal wrote:
> On 12/13/23 04:03, Jaegeuk Kim wrote:
> > On 12/12, Christoph Hellwig wrote:
> >> On Tue, Dec 12, 2023 at 10:19:31AM -0800, Bart Van Assche wrote:
> >>> "Fundamentally broken model" is your personal opinion. I don't know anyone
> >>> else than you who considers zoned writes as a broken model.
> >>
> >> No Bart, it is not.  Talk to Damien, talk to Martin, to Jens.  Or just
> >> look at all the patches you're sending to the list that play a never
> >> ending hac-a-mole trying to bandaid over reordering that should be
> >> perfectly fine.  You're playing a long term losing game by trying to
> >> prevent reordering that you can't win.
> > 
> > As one of users of zoned devices, I disagree this is a broken model, but even
> > better than the zone append model. When considering the filesystem performance,
> > it is essential to place the data per file to get better bandwidth. And for
> > NAND-based storage, filesystem is the right place to deal with the more efficient
> > garbage collecion based on the known data locations. That's why all the flash
> > storage vendors adopted it in the JEDEC. Agreed that zone append is nice, but
> > IMO, it's not practical for production.
> 
> The work on btrfs is a counter argument to this statement. The initial zone
> support based on regular writes was going nowhere as trying to maintain ordering
> was too complex and/or too invasive. Using zone append for the data path solved
> and simplified many things.

We're in supporting zoned writes, and we don't see huge problem of reordering
issues like you mention. I do agree there're pros and cons between the two, but
I believe using which one depends on user behaviors. If there's a user, why it
should be blocked? IOWs, why not just trying to support both?

> 
> I do think that zone append has a narrower use case spectrum for applications
> relying on the raw block device directly. But for file systems, it definitely is
> an easier to use writing model for zoned storage.
> 
> -- 
> Damien Le Moal
> Western Digital Research
Damien Le Moal Dec. 13, 2023, 10:55 p.m. UTC | #29
On 12/14/23 01:49, Jaegeuk Kim wrote:
> On 12/13, Damien Le Moal wrote:
>> On 12/13/23 04:03, Jaegeuk Kim wrote:
>>> On 12/12, Christoph Hellwig wrote:
>>>> On Tue, Dec 12, 2023 at 10:19:31AM -0800, Bart Van Assche wrote:
>>>>> "Fundamentally broken model" is your personal opinion. I don't know anyone
>>>>> else than you who considers zoned writes as a broken model.
>>>>
>>>> No Bart, it is not.  Talk to Damien, talk to Martin, to Jens.  Or just
>>>> look at all the patches you're sending to the list that play a never
>>>> ending hac-a-mole trying to bandaid over reordering that should be
>>>> perfectly fine.  You're playing a long term losing game by trying to
>>>> prevent reordering that you can't win.
>>>
>>> As one of users of zoned devices, I disagree this is a broken model, but even
>>> better than the zone append model. When considering the filesystem performance,
>>> it is essential to place the data per file to get better bandwidth. And for
>>> NAND-based storage, filesystem is the right place to deal with the more efficient
>>> garbage collecion based on the known data locations. That's why all the flash
>>> storage vendors adopted it in the JEDEC. Agreed that zone append is nice, but
>>> IMO, it's not practical for production.
>>
>> The work on btrfs is a counter argument to this statement. The initial zone
>> support based on regular writes was going nowhere as trying to maintain ordering
>> was too complex and/or too invasive. Using zone append for the data path solved
>> and simplified many things.
> 
> We're in supporting zoned writes, and we don't see huge problem of reordering
> issues like you mention. I do agree there're pros and cons between the two, but
> I believe using which one depends on user behaviors. If there's a user, why it
> should be blocked? IOWs, why not just trying to support both?

We do support both... But:
1) regular writes to zones is a user (= application) facing API. An application
using a block device directly without an FS can directly drive the issuing of
sequential writes to a zone. If there is an FS between the application and the
device, the FS decides what to do (regular writes or zone append, and to which zone)
2) Zone append cannot be directly issued by applications to block devices. I am
working on restoring zone append writes in zonefs as an alternative to this
limitation.

Now, in the context of IO priorities, issuing sequential writes to the same zone
with different priorities really is a silly thing to do. Even if done in the
proper order, that would essentially mean that whoever does that (FS or
application) is creating priority inversion issues for himself and thus negating
any benefit one can achieve with IO priorities (that is, most of the time,
lowering tail latency for a class of IOs).

As I mentioned before, for applications that use the zoned block device
directly, I think we should just leave things as is, that is, let the writes
fail if they are reordered due to a nonsensical IO priority setup. That is a
nice way to warn the user that he/she is doing something silly.

For the FS case, it is a little more difficult given that the user may have a
sensible IO priority setup, e.g. assigning different IO priorities (cgroups,
ioprio_set or ionice) to different processes accessing different files. For that
case, if the FS decides to issue writes to these files to the same zone, then
the problem occur. But back to the previous point: this is a silly thing to do
when writes have to be sequential. That is priority inversion right there.

The difficulty for an FS is, I think, that the FS cannot easily know the IO
priority until the BIO for the write is issued... So that is the problem that
needs fixing.

Bart's proposed fix will, I think, address your issue. However, it will also
hide IO priority setup problems to users accessing the block device directly.
That I do not like. As I stated above, I think it is better to let writes fail
in that case to signal the priority inversion. There are *a lot* of IO priority
SMR HDD users out there. Literally millions of drives running with that, and not
just for read operations. So please understand my concerns.

A better solution may be to introduce a BIO flags that says "ignore IO
priorities". f2fs can use that to avoid reordering writes to the same zone due
to different IO priorities (again, *that* is the issue to fix in the first place
I think, because that is simply silly to do, even with a regular HDD or SSD
since that will break sequential write streams and thus impact performace,
increase device-level GC/WAF etc).
Bart Van Assche Dec. 14, 2023, 12:08 a.m. UTC | #30
On 12/12/23 10:13, Christoph Hellwig wrote:
> If you want zoned devices to work you need something
> like zone append [ ... ]

If F2FS would submit REQ_OP_ZONE_APPEND operations, the only realistic
approach in the short term would be that sd_zbc.c translates these
operations into WRITE commands. Would it be acceptable to optimize
sd_zbc.c such that it does not restrict the queue depth to one per zone
if the storage device (UFS) preserves the command order per hardware
queue?

Thanks,

Bart.
Damien Le Moal Dec. 14, 2023, 12:37 a.m. UTC | #31
On 12/14/23 09:08, Bart Van Assche wrote:
> On 12/12/23 10:13, Christoph Hellwig wrote:
>> If you want zoned devices to work you need something
>> like zone append [ ... ]
> 
> If F2FS would submit REQ_OP_ZONE_APPEND operations, the only realistic
> approach in the short term would be that sd_zbc.c translates these
> operations into WRITE commands. Would it be acceptable to optimize
> sd_zbc.c such that it does not restrict the queue depth to one per zone
> if the storage device (UFS) preserves the command order per hardware
> queue?

Yes, that can be trivially done with the sd_zbc.c zone append emulation. If you
check the code, you'll see that sd_zbc_prepare_zone_append() returns
BLK_STS_ZONE_RESOURCE if the target zone is already locked. That causes a
requeue and the zone append to be resubmitted later. All you need to do for UFS
devices is tweak that to not requeue the zone append if the write command used
to emulate it can be issued. The completion path will also, of course, need some
tweaks to not attempt to unlock the target zone if it was not locked.

> 
> Thanks,
> 
> Bart.
>
Christoph Hellwig Dec. 14, 2023, 8:51 a.m. UTC | #32
On Thu, Dec 14, 2023 at 09:37:21AM +0900, Damien Le Moal wrote:
> Yes, that can be trivially done with the sd_zbc.c zone append emulation. If you
> check the code, you'll see that sd_zbc_prepare_zone_append() returns
> BLK_STS_ZONE_RESOURCE if the target zone is already locked. That causes a
> requeue and the zone append to be resubmitted later. All you need to do for UFS
> devices is tweak that to not requeue the zone append if the write command used
> to emulate it can be issued. The completion path will also, of course, need some
> tweaks to not attempt to unlock the target zone if it was not locked.

On the Linux side yes.  I still don't see how the hardware can actually
make this scheme work withou the potential of deadlocks.  But compared
to the problems with having a completely in-order I/O stacks that's
peanuts as they say in Germany.
Christoph Hellwig Dec. 14, 2023, 8:57 a.m. UTC | #33
On Wed, Dec 13, 2023 at 08:41:32AM -0800, Jaegeuk Kim wrote:
> I don't have any
> concern to keep the same ioprio on writes, since handheld devices are mostly
> sensitive to reads. So, if you have other use-cases using zoned writes which
> require different ioprio on writes, I think you can suggest a knob to control
> it by users.

Get out of your little handheld world.  In Linux we need a generally usable
I/O stack, and any feature exposed by the kernel and will be used quite
differently than you imagine.

Just like people will add reordering to the I/O stack that's not there
right now (in addition to the ones your testing doesn't hit).  That
doensn't mean we should avoid them - you genereally get better performance
by not reordering without a good reason (like thotting), but especially
in error handling paths or resource constrained environment they will
hapen all over.  We've had this whole discussion with the I/O barriers
that did not work for exactly the same reasons.

> 
> > 
> > > it is essential to place the data per file to get better bandwidth. And for
> > > NAND-based storage, filesystem is the right place to deal with the more efficient
> > > garbage collecion based on the known data locations.
> > 
> > And that works perfectly fine match for zone append.
> 
> How that works, if the device gives random LBAs back to the adjacent data in
> a file? And, how to make the LBAs into the sequential ones back?

Why would your device pick random LBAs?  If you send a zone append to
zone it will be written at the write pointer, which is absolutely not
random.  All I/O written in a single write is going to be sequential,
so just like for all other devices doing large sequential writes is
important.  Multiple writes can get reordered, but if you havily hit
the same zone you'd get the same effect in the file system allocator
too.

> Sorry, I needed to stop reading here, as you're totally biased. This is not
> the case in JEDEC, as Bart spent multiple years to synchronize the technical
> benefitcs that we've seen across UFS vendors as well as OEMs.

*lol*  There is no more fucked up corporate pressure standard committee
than the storage standards in JEDEC.  That's why not one actually takes
them seriously.
Jaegeuk Kim Dec. 14, 2023, 5:22 p.m. UTC | #34
On 12/14, Christoph Hellwig wrote:
> On Wed, Dec 13, 2023 at 08:41:32AM -0800, Jaegeuk Kim wrote:
> > I don't have any
> > concern to keep the same ioprio on writes, since handheld devices are mostly
> > sensitive to reads. So, if you have other use-cases using zoned writes which
> > require different ioprio on writes, I think you can suggest a knob to control
> > it by users.
> 
> Get out of your little handheld world.  In Linux we need a generally usable
> I/O stack, and any feature exposed by the kernel and will be used quite
> differently than you imagine.
> 
> Just like people will add reordering to the I/O stack that's not there
> right now (in addition to the ones your testing doesn't hit).  That
> doensn't mean we should avoid them - you genereally get better performance
> by not reordering without a good reason (like thotting), but especially
> in error handling paths or resource constrained environment they will
> hapen all over.  We've had this whole discussion with the I/O barriers
> that did not work for exactly the same reasons.
> 
> > 
> > > 
> > > > it is essential to place the data per file to get better bandwidth. And for
> > > > NAND-based storage, filesystem is the right place to deal with the more efficient
> > > > garbage collecion based on the known data locations.
> > > 
> > > And that works perfectly fine match for zone append.
> > 
> > How that works, if the device gives random LBAs back to the adjacent data in
> > a file? And, how to make the LBAs into the sequential ones back?
> 
> Why would your device pick random LBAs?  If you send a zone append to
> zone it will be written at the write pointer, which is absolutely not
> random.  All I/O written in a single write is going to be sequential,
> so just like for all other devices doing large sequential writes is
> important.  Multiple writes can get reordered, but if you havily hit
> the same zone you'd get the same effect in the file system allocator
> too.

How can you guarantee the device does not give any random LBAs? What'd
be the selling point of zone append to end users? Are you sure this can
give the better write trhought forever? Have you considered how to
implement this in device side such as FTL mapping overhead and garbage
collection leading to tail latencies?

My takeaway on the two approaches would be:
                  zone_append        zone_write
		  -----------        ----------
LBA               from FTL           from filesystem
FTL mapping       Page-map           Zone-map
SRAM/DRAM needs   Large              Small
FTL GC            Required           Not required
Tail latencies    Exist              Not exisit
GC Efficience     Worse              Better
Longevity         As-is              Longer
Discard cmd       Required           Not required
Block complexity  Small              Large
Failure cases     Less exist         Exist
Fsck              Don't know         F2FS-TOOLS support
Filesystem        BTRFS support(?)   F2FS support

Given this, I took zone_write, especially for mobile devices, since we can
recover the unaligned writes in the corner cases by fsck. And, most benefit
would be getting rid of FTL mapping overhead which improves random read IOPs
significantly due to the lack of SRAM in low-end storages. And, longer lifetime
by mitigating garbage collection overhead is more important in mobile world.

If there's any flag or knob that we can set, IMO, that'd be enough.

> 
> > Sorry, I needed to stop reading here, as you're totally biased. This is not
> > the case in JEDEC, as Bart spent multiple years to synchronize the technical
> > benefitcs that we've seen across UFS vendors as well as OEMs.
> 
> *lol*  There is no more fucked up corporate pressure standard committee
> than the storage standards in JEDEC.  That's why not one actually takes
> them seriously.
Bart Van Assche Dec. 14, 2023, 7:32 p.m. UTC | #35
On 12/14/23 00:57, Christoph Hellwig wrote:
> On Wed, Dec 13, 2023 at 08:41:32AM -0800, Jaegeuk Kim wrote:
>> How that works, if the device gives random LBAs back to the adjacent data in
>> a file? And, how to make the LBAs into the sequential ones back?
> 
> Why would your device pick random LBAs?  If you send a zone append to
> zone it will be written at the write pointer, which is absolutely not
> random.  All I/O written in a single write is going to be sequential,
> so just like for all other devices doing large sequential writes is
> important.
Bio splitting is one potential cause of write reordering that can be
triggered even if the filesystem submits large writes. The maximum segment
size in the UFS driver is rather small (512 KiB) to restrict the latency
impact of writing data on reads.

Bart.
Damien Le Moal Dec. 15, 2023, 1:12 a.m. UTC | #36
On 12/15/23 02:22, Jaegeuk Kim wrote:
> On 12/14, Christoph Hellwig wrote:
>> On Wed, Dec 13, 2023 at 08:41:32AM -0800, Jaegeuk Kim wrote:
>>> I don't have any
>>> concern to keep the same ioprio on writes, since handheld devices are mostly
>>> sensitive to reads. So, if you have other use-cases using zoned writes which
>>> require different ioprio on writes, I think you can suggest a knob to control
>>> it by users.
>>
>> Get out of your little handheld world.  In Linux we need a generally usable
>> I/O stack, and any feature exposed by the kernel and will be used quite
>> differently than you imagine.
>>
>> Just like people will add reordering to the I/O stack that's not there
>> right now (in addition to the ones your testing doesn't hit).  That
>> doensn't mean we should avoid them - you genereally get better performance
>> by not reordering without a good reason (like thotting), but especially
>> in error handling paths or resource constrained environment they will
>> hapen all over.  We've had this whole discussion with the I/O barriers
>> that did not work for exactly the same reasons.
>>
>>>
>>>>
>>>>> it is essential to place the data per file to get better bandwidth. And for
>>>>> NAND-based storage, filesystem is the right place to deal with the more efficient
>>>>> garbage collecion based on the known data locations.
>>>>
>>>> And that works perfectly fine match for zone append.
>>>
>>> How that works, if the device gives random LBAs back to the adjacent data in
>>> a file? And, how to make the LBAs into the sequential ones back?
>>
>> Why would your device pick random LBAs?  If you send a zone append to
>> zone it will be written at the write pointer, which is absolutely not
>> random.  All I/O written in a single write is going to be sequential,
>> so just like for all other devices doing large sequential writes is
>> important.  Multiple writes can get reordered, but if you havily hit
>> the same zone you'd get the same effect in the file system allocator
>> too.
> 
> How can you guarantee the device does not give any random LBAs? What'd> be the selling point of zone append to end users? Are you sure this can
> give the better write trhought forever? Have you considered how to
> implement this in device side such as FTL mapping overhead and garbage
> collection leading to tail latencies?

Answers to all your questions, in order:

1) Asking this is to me similar to asking how can one guarantee that the device
gives back the data that was written... You are asking for guarantees that the
device is not buggy. By definition, zone append will return the writen start LBA
within the zone that the zone append command specified. And that start LBA will
always be equal to the zone write pointer value when the device started
executing the zone append command.

2) When there is an FS, the user cannot know if the FS is using zone append or
not, so the user should not care at all. If by "user" you mean "the file
system", then it is a design decision. We already pointed out that generally
speaking, zone append will be easier to use because it does not have ordering
constraints.

3) Yes, because the writes are always sequential, which is the least expensive
pattern for the device internal as that only triggers minimal internal activity
on the FTL, GC, weir leveling etc, at least for a decently designed device.

4) See above. If the device interface forces the device user to always write
sequentially, as mandated by a zoned device, then FTL, GC and weir leveling is
minimized. The actual device internal GC that may or may not happen completely
depend on how the device maps zones to flash super blocks. If the mapping is
1:1, then GC will be nearly non-existent. If the mapping is not 1:1, then GC
overhead may exist. The discussion should then be about the design choices of
the device. The fact that the host chose zone append will not in anyway make
things worse for the device. Even with regular writes the host must write
sequentially, same as what zone append achieves (potentially a lot more easily).

> My takeaway on the two approaches would be:
>                   zone_append        zone_write
> 		  -----------        ----------
> LBA               from FTL           from filesystem
> FTL mapping       Page-map           Zone-map

Not sure what you mean here. zone append always returns an LBA from within the
zone specified by the LBA in the command CDB. So mapping is still per zone. zone
append is *NOT* a random write command. Zone append automatically implements
sequential writing within a zone for the user. In the case of regular writes,
the user must fully control sentimentality. In both cases the write pattern *is*
sequential.

> SRAM/DRAM needs   Large              Small

There are no differences in this area because the FTL is the same for both. No
changes, nothing special for zone append.

> FTL GC            Required           Not required

Incorrect. See above. That depends on the device mapping of zones to flash
superblocks. And GC requirements are the same for both because the write pattern
is identical: it is sequential within each zone being written. The user still
controls which zone it wants to write. Zone append is not a magic command that
chooses a target zone automatically.

> Tail latencies    Exist              Not exisit

Incorrect. They are the same and because of the lack of ordering requirement
with zone append, if anything, zone append can give better latency.

> GC Efficience     Worse              Better

Nope. See above. Same.

> Longevity         As-is              Longer
> Discard cmd       Required           Not required

There is no discard with zone devices. Only zone reset. So both are "not
required" here.

> Block complexity  Small              Large
> Failure cases     Less exist         Exist
> Fsck              Don't know         F2FS-TOOLS support
> Filesystem        BTRFS support(?)   F2FS support

Yes, btrfs data path uses zone append.

> 
> Given this, I took zone_write, especially for mobile devices, since we can
> recover the unaligned writes in the corner cases by fsck. And, most benefit
> would be getting rid of FTL mapping overhead which improves random read IOPs
> significantly due to the lack of SRAM in low-end storages. And, longer lifetime
> by mitigating garbage collection overhead is more important in mobile world.

As mentioned, GC is not an issue, or rather, GC depends on how the device is
designed, not on which type of write command the host uses. Writes are always
sequential for both types !
Jaegeuk Kim Dec. 15, 2023, 2:03 a.m. UTC | #37
On 12/15, Damien Le Moal wrote:
> On 12/15/23 02:22, Jaegeuk Kim wrote:
> > On 12/14, Christoph Hellwig wrote:
> >> On Wed, Dec 13, 2023 at 08:41:32AM -0800, Jaegeuk Kim wrote:
> >>> I don't have any
> >>> concern to keep the same ioprio on writes, since handheld devices are mostly
> >>> sensitive to reads. So, if you have other use-cases using zoned writes which
> >>> require different ioprio on writes, I think you can suggest a knob to control
> >>> it by users.
> >>
> >> Get out of your little handheld world.  In Linux we need a generally usable
> >> I/O stack, and any feature exposed by the kernel and will be used quite
> >> differently than you imagine.
> >>
> >> Just like people will add reordering to the I/O stack that's not there
> >> right now (in addition to the ones your testing doesn't hit).  That
> >> doensn't mean we should avoid them - you genereally get better performance
> >> by not reordering without a good reason (like thotting), but especially
> >> in error handling paths or resource constrained environment they will
> >> hapen all over.  We've had this whole discussion with the I/O barriers
> >> that did not work for exactly the same reasons.
> >>
> >>>
> >>>>
> >>>>> it is essential to place the data per file to get better bandwidth. And for
> >>>>> NAND-based storage, filesystem is the right place to deal with the more efficient
> >>>>> garbage collecion based on the known data locations.
> >>>>
> >>>> And that works perfectly fine match for zone append.
> >>>
> >>> How that works, if the device gives random LBAs back to the adjacent data in
> >>> a file? And, how to make the LBAs into the sequential ones back?
> >>
> >> Why would your device pick random LBAs?  If you send a zone append to
> >> zone it will be written at the write pointer, which is absolutely not
> >> random.  All I/O written in a single write is going to be sequential,
> >> so just like for all other devices doing large sequential writes is
> >> important.  Multiple writes can get reordered, but if you havily hit
> >> the same zone you'd get the same effect in the file system allocator
> >> too.
> > 
> > How can you guarantee the device does not give any random LBAs? What'd> be the selling point of zone append to end users? Are you sure this can
> > give the better write trhought forever? Have you considered how to
> > implement this in device side such as FTL mapping overhead and garbage
> > collection leading to tail latencies?
> 
> Answers to all your questions, in order:
> 
> 1) Asking this is to me similar to asking how can one guarantee that the device
> gives back the data that was written... You are asking for guarantees that the
> device is not buggy. By definition, zone append will return the writen start LBA
> within the zone that the zone append command specified. And that start LBA will
> always be equal to the zone write pointer value when the device started
> executing the zone append command.
> 
> 2) When there is an FS, the user cannot know if the FS is using zone append or
> not, so the user should not care at all. If by "user" you mean "the file
> system", then it is a design decision. We already pointed out that generally
> speaking, zone append will be easier to use because it does not have ordering
> constraints.
> 
> 3) Yes, because the writes are always sequential, which is the least expensive
> pattern for the device internal as that only triggers minimal internal activity
> on the FTL, GC, weir leveling etc, at least for a decently designed device.
> 
> 4) See above. If the device interface forces the device user to always write
> sequentially, as mandated by a zoned device, then FTL, GC and weir leveling is
> minimized. The actual device internal GC that may or may not happen completely
> depend on how the device maps zones to flash super blocks. If the mapping is
> 1:1, then GC will be nearly non-existent. If the mapping is not 1:1, then GC
> overhead may exist. The discussion should then be about the design choices of
> the device. The fact that the host chose zone append will not in anyway make
> things worse for the device. Even with regular writes the host must write
> sequentially, same as what zone append achieves (potentially a lot more easily).
> 
> > My takeaway on the two approaches would be:
> >                   zone_append        zone_write
> > 		  -----------        ----------
> > LBA               from FTL           from filesystem
> > FTL mapping       Page-map           Zone-map
> 
> Not sure what you mean here. zone append always returns an LBA from within the
> zone specified by the LBA in the command CDB. So mapping is still per zone. zone
> append is *NOT* a random write command. Zone append automatically implements
> sequential writing within a zone for the user. In the case of regular writes,
> the user must fully control sentimentality. In both cases the write pattern *is*
> sequential.

Okay, it seems there's first disconnect here, which fails to explain all the
below gaps. Do you think the device supporting zone_append keeps LBAs inline
with PBAs within a zone? E.g., LBA#n guarantees to map to PBA#n in a zone.
If LBA order is exactly matching to the PBA order all the time, the mapping
granularity is zone. Otherwise, it should be page.

> 
> > SRAM/DRAM needs   Large              Small
> 
> There are no differences in this area because the FTL is the same for both. No
> changes, nothing special for zone append.
> 
> > FTL GC            Required           Not required
> 
> Incorrect. See above. That depends on the device mapping of zones to flash
> superblocks. And GC requirements are the same for both because the write pattern
> is identical: it is sequential within each zone being written. The user still
> controls which zone it wants to write. Zone append is not a magic command that
> chooses a target zone automatically.
> 
> > Tail latencies    Exist              Not exisit
> 
> Incorrect. They are the same and because of the lack of ordering requirement
> with zone append, if anything, zone append can give better latency.
> 
> > GC Efficience     Worse              Better
> 
> Nope. See above. Same.
> 
> > Longevity         As-is              Longer
> > Discard cmd       Required           Not required
> 
> There is no discard with zone devices. Only zone reset. So both are "not
> required" here.
> 
> > Block complexity  Small              Large
> > Failure cases     Less exist         Exist
> > Fsck              Don't know         F2FS-TOOLS support
> > Filesystem        BTRFS support(?)   F2FS support
> 
> Yes, btrfs data path uses zone append.
> 
> > 
> > Given this, I took zone_write, especially for mobile devices, since we can
> > recover the unaligned writes in the corner cases by fsck. And, most benefit
> > would be getting rid of FTL mapping overhead which improves random read IOPs
> > significantly due to the lack of SRAM in low-end storages. And, longer lifetime
> > by mitigating garbage collection overhead is more important in mobile world.
> 
> As mentioned, GC is not an issue, or rather, GC depends on how the device is
> designed, not on which type of write command the host uses. Writes are always
> sequential for both types !
> 
> 
> -- 
> Damien Le Moal
> Western Digital Research
Keith Busch Dec. 15, 2023, 2:20 a.m. UTC | #38
On Thu, Dec 14, 2023 at 06:03:11PM -0800, Jaegeuk Kim wrote:
> 
> Okay, it seems there's first disconnect here, which fails to explain all the
> below gaps. Do you think the device supporting zone_append keeps LBAs inline
> with PBAs within a zone? E.g., LBA#n guarantees to map to PBA#n in a zone.
> If LBA order is exactly matching to the PBA order all the time, the mapping
> granularity is zone. Otherwise, it should be page.

Yah, you're describing how 'zone append' works.
Christoph Hellwig Dec. 15, 2023, 4:49 a.m. UTC | #39
On Thu, Dec 14, 2023 at 06:20:16PM -0800, Keith Busch wrote:
> On Thu, Dec 14, 2023 at 06:03:11PM -0800, Jaegeuk Kim wrote:
> > 
> > Okay, it seems there's first disconnect here, which fails to explain all the
> > below gaps. Do you think the device supporting zone_append keeps LBAs inline
> > with PBAs within a zone? E.g., LBA#n guarantees to map to PBA#n in a zone.
> > If LBA order is exactly matching to the PBA order all the time, the mapping
> > granularity is zone. Otherwise, it should be page.
> 
> Yah, you're describing how 'zone append' works.

Haha.  I'm glad I read through all the answers as Damien and you already
did the explaining.

Note that in a complex SSD of course there can still be some remapping
due to wear leveling, worn out blocks or similar policy decisions, but
that usuall happens below the superblock level and the SSDs can be much
smarter about the algorithms used for that as it knows all data will
be written sequentially.
diff mbox series

Patch

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index fe5da2ade953..6781cef0109e 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -123,14 +123,16 @@  deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq)
  * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a
  * request.
  */
-static u8 dd_rq_ioclass(struct request *rq)
+static u8 dd_rq_ioclass(struct deadline_data *dd, struct request *rq)
 {
-	return IOPRIO_PRIO_CLASS(req_get_ioprio(rq));
+	return dd->prio_aging_expire ? IOPRIO_PRIO_CLASS(req_get_ioprio(rq)) :
+				       IOPRIO_CLASS_NONE;
 }
 
-static u8 dd_bio_ioclass(struct bio *bio)
+static u8 dd_bio_ioclass(struct deadline_data *dd, struct bio *bio)
 {
-	return IOPRIO_PRIO_CLASS(bio->bi_ioprio);
+	return dd->prio_aging_expire ? IOPRIO_PRIO_CLASS(bio->bi_ioprio) :
+				       IOPRIO_CLASS_NONE;
 }
 
 /*
@@ -233,7 +235,7 @@  static void dd_request_merged(struct request_queue *q, struct request *req,
 			      enum elv_merge type)
 {
 	struct deadline_data *dd = q->elevator->elevator_data;
-	const u8 ioprio_class = dd_rq_ioclass(req);
+	const u8 ioprio_class = dd_rq_ioclass(dd, req);
 	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
 	struct dd_per_prio *per_prio = &dd->per_prio[prio];
 
@@ -253,7 +255,7 @@  static void dd_merged_requests(struct request_queue *q, struct request *req,
 			       struct request *next)
 {
 	struct deadline_data *dd = q->elevator->elevator_data;
-	const u8 ioprio_class = dd_rq_ioclass(next);
+	const u8 ioprio_class = dd_rq_ioclass(dd, next);
 	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
 
 	lockdep_assert_held(&dd->lock);
@@ -550,7 +552,7 @@  static struct request *__dd_dispatch_request(struct deadline_data *dd,
 	dd->batching++;
 	deadline_move_request(dd, per_prio, rq);
 done:
-	ioprio_class = dd_rq_ioclass(rq);
+	ioprio_class = dd_rq_ioclass(dd, rq);
 	prio = ioprio_class_to_prio[ioprio_class];
 	dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq);
 	dd->per_prio[prio].stats.dispatched++;
@@ -749,7 +751,7 @@  static int dd_request_merge(struct request_queue *q, struct request **rq,
 			    struct bio *bio)
 {
 	struct deadline_data *dd = q->elevator->elevator_data;
-	const u8 ioprio_class = dd_bio_ioclass(bio);
+	const u8 ioprio_class = dd_bio_ioclass(dd, bio);
 	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
 	struct dd_per_prio *per_prio = &dd->per_prio[prio];
 	sector_t sector = bio_end_sector(bio);
@@ -814,7 +816,7 @@  static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 	 */
 	blk_req_zone_write_unlock(rq);
 
-	prio = ioprio_class_to_prio[dd_rq_ioclass(rq)];
+	prio = ioprio_class_to_prio[dd_rq_ioclass(dd, rq)];
 	per_prio = &dd->per_prio[prio];
 	if (!rq->elv.priv[0]) {
 		per_prio->stats.inserted++;
@@ -923,7 +925,7 @@  static void dd_finish_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 	struct deadline_data *dd = q->elevator->elevator_data;
-	const u8 ioprio_class = dd_rq_ioclass(rq);
+	const u8 ioprio_class = dd_rq_ioclass(dd, rq);
 	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
 	struct dd_per_prio *per_prio = &dd->per_prio[prio];