
[5/5] block/mq-deadline: Remove zone locking

Message ID 20220614174943.611369-6-bvanassche@acm.org
State New, archived
Series Improve zoned storage write performance

Commit Message

Bart Van Assche June 14, 2022, 5:49 p.m. UTC
Measurements have shown that limiting the queue depth to one has a
significant negative performance impact on zoned UFS devices. Hence this
patch, which removes zone locking from the mq-deadline scheduler. This
patch is based on the following assumptions:
- Applications submit write requests to sequential write required zones
  in order.
- If such write requests get reordered by the software or hardware queue
  mechanism, nr_hw_queues * nr_requests - 1 retries are sufficient to
  reorder the write requests.
- It happens infrequently that zoned write requests are reordered by the
  block layer.
- Either no I/O scheduler is used or an I/O scheduler is used that
  submits write requests per zone in LBA order.
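
As a concrete illustration of the retry bound in the second assumption:
with, say, nr_hw_queues = 8 and nr_requests = 32 (illustrative values,
not measured on any particular device), at most 8 * 32 - 1 = 255 other
requests can hold a tag at the same time as a given write, which is
where the nr_hw_queues * nr_requests - 1 bound comes from.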

DD_BE_PRIO is selected for sequential writes to preserve the LBA order.
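
In the dd_insert_request() hunk of the diff below, that selection boils
down to the following (seq_write being blk_rq_is_seq_write(rq)):

	prio = seq_write ? DD_BE_PRIO :
		ioprio_class_to_prio[ioprio_class];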

See also commit 5700f69178e9 ("mq-deadline: Introduce zone locking
support").

Cc: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/mq-deadline.c | 74 ++++-----------------------------------------
 1 file changed, 6 insertions(+), 68 deletions(-)

Comments

Damien Le Moal June 14, 2022, 11:40 p.m. UTC | #1
On 6/15/22 02:49, Bart Van Assche wrote:
> Measurements have shown that limiting the queue depth to one has a
> significant negative performance impact on zoned UFS devices. Hence this
> patch, which removes zone locking from the mq-deadline scheduler. This
> patch is based on the following assumptions:
> - Applications submit write requests to sequential write required zones
>   in order.
> - If such write requests get reordered by the software or hardware queue
>   mechanism, nr_hw_queues * nr_requests - 1 retries are sufficient to
>   reorder the write requests.

As mentioned in my previous reply to patch 2, AHCI will not behave like
that at all. Retrying will be useless most of the time because the
adapter sends commands to the drive randomly from the set of commands
that are marked ready in the ready register. So no, I am opposed to
unconditionally removing zone write locking.

If the UFS LLD can deliver commands in order, then use a queue flag that
says "zone write locking not needed" to disable it for that device
class. There are probably some SAS HBAs that could benefit from this
too, but I have seen so many reordering bugs with these (e.g. a requeue
at the tail of a write that got a TSF) that I would not want to remove
zone write locking for them.
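
For illustration, a minimal sketch of such an opt-out, assuming a
hypothetical QUEUE_FLAG_NO_ZONE_WRITE_LOCK flag that an LLD would set
when it preserves command ordering (neither the flag nor the helper
exists today):

	/*
	 * Hypothetical helper: keep zone write locking unless the LLD
	 * has declared that it preserves the submission order of
	 * commands (illustrative flag name, not an existing API).
	 */
	static inline bool dd_use_zone_write_locking(struct request_queue *q)
	{
		return blk_queue_is_zoned(q) &&
		       !test_bit(QUEUE_FLAG_NO_ZONE_WRITE_LOCK, &q->queue_flags);
	}

Something along those lines would keep zone write locking for AHCI and
SAS while letting UFS-class devices opt out.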

And I also do not want to start getting 10 calls a day from customers
complaining about very bad write performance due to all these retries,
which in the end are likely to be slower than writing at QD=1 per zone.

> - It happens infrequently that zoned write requests are reordered by the
>   block layer.

What makes you say that? It only takes two contexts trying to dispatch
commands to different queues, or a write process being rescheduled to a
different CPU/queue.

> - Either no I/O scheduler is used or an I/O scheduler is used that
>   submits write requests per zone in LBA order.

And unfortunately, as the AHCI example shows, having the scheduler
dispatch requests in LBA order is not enough.

> 
> DD_BE_PRIO is selected for sequential writes to preserve the LBA order.

So if the application wanted the writes to have the RT I/O priority
class so that these commands get the high-priority bit set on SATA
disks, that will not be honored. No to that too.


Patch

diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 6ed602b2f80a..e168fc9a980a 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -104,7 +104,6 @@  struct deadline_data {
 	int prio_aging_expire;
 
 	spinlock_t lock;
-	spinlock_t zone_lock;
 };
 
 /* Maps an I/O priority class to a deadline scheduler priority. */
@@ -285,30 +284,10 @@  static struct request *
 deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
 		      enum dd_data_dir data_dir)
 {
-	struct request *rq;
-	unsigned long flags;
-
 	if (list_empty(&per_prio->fifo_list[data_dir]))
 		return NULL;
 
-	rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);
-	if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
-		return rq;
-
-	/*
-	 * Look for a write request that can be dispatched, that is one with
-	 * an unlocked target zone.
-	 */
-	spin_lock_irqsave(&dd->zone_lock, flags);
-	list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) {
-		if (blk_req_can_dispatch_to_zone(rq))
-			goto out;
-	}
-	rq = NULL;
-out:
-	spin_unlock_irqrestore(&dd->zone_lock, flags);
-
-	return rq;
+	return rq_entry_fifo(per_prio->fifo_list[data_dir].next);
 }
 
 /*
@@ -319,29 +298,7 @@  static struct request *
 deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
 		      enum dd_data_dir data_dir)
 {
-	struct request *rq;
-	unsigned long flags;
-
-	rq = per_prio->next_rq[data_dir];
-	if (!rq)
-		return NULL;
-
-	if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
-		return rq;
-
-	/*
-	 * Look for a write request that can be dispatched, that is one with
-	 * an unlocked target zone.
-	 */
-	spin_lock_irqsave(&dd->zone_lock, flags);
-	while (rq) {
-		if (blk_req_can_dispatch_to_zone(rq))
-			break;
-		rq = deadline_latter_request(rq);
-	}
-	spin_unlock_irqrestore(&dd->zone_lock, flags);
-
-	return rq;
+	return per_prio->next_rq[data_dir];
 }
 
 /*
@@ -467,10 +424,6 @@  static struct request *__dd_dispatch_request(struct deadline_data *dd,
 	ioprio_class = dd_rq_ioclass(rq);
 	prio = ioprio_class_to_prio[ioprio_class];
 	dd->per_prio[prio].stats.dispatched++;
-	/*
-	 * If the request needs its target zone locked, do it.
-	 */
-	blk_req_zone_write_lock(rq);
 	rq->rq_flags |= RQF_STARTED;
 	return rq;
 }
@@ -640,7 +593,6 @@  static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
 	dd->fifo_batch = fifo_batch;
 	dd->prio_aging_expire = prio_aging_expire;
 	spin_lock_init(&dd->lock);
-	spin_lock_init(&dd->zone_lock);
 
 	q->elevator = eq;
 	return 0;
@@ -716,17 +668,13 @@  static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 	u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio);
 	struct dd_per_prio *per_prio;
 	enum dd_prio prio;
+	bool seq_write = blk_rq_is_seq_write(rq);
 	LIST_HEAD(free);
 
 	lockdep_assert_held(&dd->lock);
 
-	/*
-	 * This may be a requeue of a write request that has locked its
-	 * target zone. If it is the case, this releases the zone lock.
-	 */
-	blk_req_zone_write_unlock(rq);
-
-	prio = ioprio_class_to_prio[ioprio_class];
+	prio = seq_write ? DD_BE_PRIO :
+		ioprio_class_to_prio[ioprio_class];
 	per_prio = &dd->per_prio[prio];
 	if (!rq->elv.priv[0]) {
 		per_prio->stats.inserted++;
@@ -740,7 +688,7 @@  static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 
 	trace_block_rq_insert(rq);
 
-	if (at_head) {
+	if (at_head && !seq_write) {
 		list_add(&rq->queuelist, &per_prio->dispatch);
 		rq->fifo_time = jiffies;
 	} else {
@@ -819,16 +767,6 @@  static void dd_finish_request(struct request *rq)
 		return;
 
 	atomic_inc(&per_prio->stats.completed);
-
-	if (blk_queue_is_zoned(q)) {
-		unsigned long flags;
-
-		spin_lock_irqsave(&dd->zone_lock, flags);
-		blk_req_zone_write_unlock(rq);
-		if (!list_empty(&per_prio->fifo_list[DD_WRITE]))
-			blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
-		spin_unlock_irqrestore(&dd->zone_lock, flags);
-	}
 }
 
 static bool dd_has_work_for_prio(struct dd_per_prio *per_prio)