[v4,4/7] block: One requeue list per hctx

Message ID 20230621201237.796902-5-bvanassche@acm.org (mailing list archive)
State New, archived
Series Submit zoned writes in order

Commit Message

Bart Van Assche June 21, 2023, 8:12 p.m. UTC
Prepare for processing the requeue list from inside __blk_mq_run_hw_queue().

Cc: Christoph Hellwig <hch@lst.de>
Cc: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/blk-flush.c      | 24 ++++++++--------
 block/blk-mq-debugfs.c | 64 +++++++++++++++++++++---------------------
 block/blk-mq.c         | 53 ++++++++++++++++++++--------------
 include/linux/blk-mq.h |  6 ++++
 include/linux/blkdev.h |  5 ----
 5 files changed, 83 insertions(+), 69 deletions(-)

Comments

Christoph Hellwig June 23, 2023, 5:48 a.m. UTC | #1
>  void blk_mq_kick_requeue_list(struct request_queue *q)
>  {
> -	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
> +	struct blk_mq_hw_ctx *hctx;
> +	unsigned long i;
> +
> +	queue_for_each_hw_ctx(q, hctx, i)
> +		kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND,
> +					    &hctx->requeue_work, 0);
>  }
>  EXPORT_SYMBOL(blk_mq_kick_requeue_list);
>  
>  void blk_mq_delay_kick_requeue_list(struct request_queue *q,
>  				    unsigned long msecs)
>  {
> -	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
> -				    msecs_to_jiffies(msecs));
> +	struct blk_mq_hw_ctx *hctx;
> +	unsigned long i;
> +
> +	queue_for_each_hw_ctx(q, hctx, i)
> +		kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND,
> +					    &hctx->requeue_work,
> +					    msecs_to_jiffies(msecs));
>  }

I'd implement blk_mq_kick_requeue_list as a wrapper around
blk_mq_delay_kick_requeue_list that passes a 0 msec argument.
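
A minimal sketch of that suggestion, assuming the per-hctx
blk_mq_delay_kick_requeue_list() as introduced in the diff below:

void blk_mq_kick_requeue_list(struct request_queue *q)
{
	/* A zero delay schedules the requeue work on every hctx right away. */
	blk_mq_delay_kick_requeue_list(q, 0);
}
EXPORT_SYMBOL(blk_mq_kick_requeue_list);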

Otherwise looks fine:

Reviewed-by: Christoph Hellwig <hch@lst.de>
Ming Lei June 26, 2023, 8:09 a.m. UTC | #2
On Wed, Jun 21, 2023 at 01:12:31PM -0700, Bart Van Assche wrote:
> Prepare for processing the requeue list from inside __blk_mq_run_hw_queue().

Please add more details about the motivation for this kind of change.

IMO requeue is supposed to run in the slow code path; I don't see a reason
why it has to be moved to the hw queue level.

Thanks,
Ming
Bart Van Assche June 26, 2023, 4:54 p.m. UTC | #3
On 6/26/23 01:09, Ming Lei wrote:
> On Wed, Jun 21, 2023 at 01:12:31PM -0700, Bart Van Assche wrote:
>> Prepare for processing the requeue list from inside __blk_mq_run_hw_queue().
> 
> Please add more details about the motivation for this kind of change.
> 
> IMO requeue is supposed to run in the slow code path; I don't see a reason
> why it has to be moved to the hw queue level.

As Damien explained, the code for sending requeued requests to the I/O
scheduler should be dropped because this is not necessary in order to
prevent reordering of requeued zoned writes.

The remaining advantages of this patch series are:
* Simplification of the block layer API since the
   blk_mq_{,delay_}kick_requeue_list() functions are removed.
* Faster requeuing. Although I don't have any use case that needs
   faster requeuing, it is interesting to see that this patch series
   makes the blktests test that exercises requeuing run twice as fast.

Is there agreement to proceed with this patch series if all review
comments are addressed?

Thanks,

Bart.
Ming Lei June 27, 2023, 1:06 a.m. UTC | #4
On Mon, Jun 26, 2023 at 09:54:01AM -0700, Bart Van Assche wrote:
> On 6/26/23 01:09, Ming Lei wrote:
> > On Wed, Jun 21, 2023 at 01:12:31PM -0700, Bart Van Assche wrote:
> > > Prepare for processing the requeue list from inside __blk_mq_run_hw_queue().
> > 
> > Please add more details about the motivation for this kind of change.
> > 
> > IMO requeue is supposed to run in the slow code path; I don't see a reason
> > why it has to be moved to the hw queue level.
> 
> As Damien explained, the code for sending requeued requests to the I/O
> scheduler should be dropped because this is not necessary in order to
> prevent reordering of requeued zoned writes.
> 
> The remaining advantages of this patch series are:
> * Simplification of the block layer API since the
>   blk_mq_{,delay_}kick_requeue_list() functions are removed.
> * Faster requeuing. Although I don't have any use case that needs
>   faster requeuing, it is interesting to see that this patch series
>   makes the blktests test that exercises requeuing run twice as fast.

Not sure whether that is a good thing, since requeue is supposed to be slow.

> 
> Is there agreement to proceed with this patch series if all review
> comments are addressed?

One concern is that you added the requeue fields to the hctx, which may
add one extra L1 cache fetch in the fast path.


Thanks,
Ming

Patch

diff --git a/block/blk-flush.c b/block/blk-flush.c
index dba392cf22be..4bfb92f58aa9 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -91,7 +91,7 @@  enum {
 	FLUSH_PENDING_TIMEOUT	= 5 * HZ,
 };
 
-static void blk_kick_flush(struct request_queue *q,
+static void blk_kick_flush(struct blk_mq_hw_ctx *hctx,
 			   struct blk_flush_queue *fq, blk_opf_t flags);
 
 static inline struct blk_flush_queue *
@@ -165,6 +165,7 @@  static void blk_flush_complete_seq(struct request *rq,
 				   unsigned int seq, blk_status_t error)
 {
 	struct request_queue *q = rq->q;
+	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 	struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
 	blk_opf_t cmd_flags;
 
@@ -188,9 +189,9 @@  static void blk_flush_complete_seq(struct request *rq,
 
 	case REQ_FSEQ_DATA:
 		list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
-		spin_lock(&q->requeue_lock);
-		list_add_tail(&rq->queuelist, &q->flush_list);
-		spin_unlock(&q->requeue_lock);
+		spin_lock(&hctx->requeue_lock);
+		list_add_tail(&rq->queuelist, &hctx->flush_list);
+		spin_unlock(&hctx->requeue_lock);
 		blk_mq_kick_requeue_list(q);
 		break;
 
@@ -210,7 +211,7 @@  static void blk_flush_complete_seq(struct request *rq,
 		BUG();
 	}
 
-	blk_kick_flush(q, fq, cmd_flags);
+	blk_kick_flush(hctx, fq, cmd_flags);
 }
 
 static enum rq_end_io_ret flush_end_io(struct request *flush_rq,
@@ -275,7 +276,7 @@  bool is_flush_rq(struct request *rq)
 
 /**
  * blk_kick_flush - consider issuing flush request
- * @q: request_queue being kicked
+ * @hctx: hwq being kicked
  * @fq: flush queue
  * @flags: cmd_flags of the original request
  *
@@ -286,9 +287,10 @@  bool is_flush_rq(struct request *rq)
  * spin_lock_irq(fq->mq_flush_lock)
  *
  */
-static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
-			   blk_opf_t flags)
+static void blk_kick_flush(struct blk_mq_hw_ctx *hctx,
+			   struct blk_flush_queue *fq, blk_opf_t flags)
 {
+	struct request_queue *q = hctx->queue;
 	struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
 	struct request *first_rq =
 		list_first_entry(pending, struct request, flush.list);
@@ -348,9 +350,9 @@  static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
 	smp_wmb();
 	req_ref_set(flush_rq, 1);
 
-	spin_lock(&q->requeue_lock);
-	list_add_tail(&flush_rq->queuelist, &q->flush_list);
-	spin_unlock(&q->requeue_lock);
+	spin_lock(&hctx->requeue_lock);
+	list_add_tail(&flush_rq->queuelist, &hctx->flush_list);
+	spin_unlock(&hctx->requeue_lock);
 
 	blk_mq_kick_requeue_list(q);
 }
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index c3b5930106b2..787bdff3cc64 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -18,37 +18,6 @@  static int queue_poll_stat_show(void *data, struct seq_file *m)
 	return 0;
 }
 
-static void *queue_requeue_list_start(struct seq_file *m, loff_t *pos)
-	__acquires(&q->requeue_lock)
-{
-	struct request_queue *q = m->private;
-
-	spin_lock_irq(&q->requeue_lock);
-	return seq_list_start(&q->requeue_list, *pos);
-}
-
-static void *queue_requeue_list_next(struct seq_file *m, void *v, loff_t *pos)
-{
-	struct request_queue *q = m->private;
-
-	return seq_list_next(v, &q->requeue_list, pos);
-}
-
-static void queue_requeue_list_stop(struct seq_file *m, void *v)
-	__releases(&q->requeue_lock)
-{
-	struct request_queue *q = m->private;
-
-	spin_unlock_irq(&q->requeue_lock);
-}
-
-static const struct seq_operations queue_requeue_list_seq_ops = {
-	.start	= queue_requeue_list_start,
-	.next	= queue_requeue_list_next,
-	.stop	= queue_requeue_list_stop,
-	.show	= blk_mq_debugfs_rq_show,
-};
-
 static int blk_flags_show(struct seq_file *m, const unsigned long flags,
 			  const char *const *flag_name, int flag_name_count)
 {
@@ -157,7 +126,6 @@  static ssize_t queue_state_write(void *data, const char __user *buf,
 
 static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
 	{ "poll_stat", 0400, queue_poll_stat_show },
-	{ "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops },
 	{ "pm_only", 0600, queue_pm_only_show, NULL },
 	{ "state", 0600, queue_state_show, queue_state_write },
 	{ "zone_wlock", 0400, queue_zone_wlock_show, NULL },
@@ -513,6 +481,37 @@  static int hctx_dispatch_busy_show(void *data, struct seq_file *m)
 	return 0;
 }
 
+static void *hctx_requeue_list_start(struct seq_file *m, loff_t *pos)
+	__acquires(&hctx->requeue_lock)
+{
+	struct blk_mq_hw_ctx *hctx = m->private;
+
+	spin_lock_irq(&hctx->requeue_lock);
+	return seq_list_start(&hctx->requeue_list, *pos);
+}
+
+static void *hctx_requeue_list_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct blk_mq_hw_ctx *hctx = m->private;
+
+	return seq_list_next(v, &hctx->requeue_list, pos);
+}
+
+static void hctx_requeue_list_stop(struct seq_file *m, void *v)
+	__releases(&hctx->requeue_lock)
+{
+	struct blk_mq_hw_ctx *hctx = m->private;
+
+	spin_unlock_irq(&hctx->requeue_lock);
+}
+
+static const struct seq_operations hctx_requeue_list_seq_ops = {
+	.start = hctx_requeue_list_start,
+	.next = hctx_requeue_list_next,
+	.stop = hctx_requeue_list_stop,
+	.show = blk_mq_debugfs_rq_show,
+};
+
 #define CTX_RQ_SEQ_OPS(name, type)					\
 static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t *pos) \
 	__acquires(&ctx->lock)						\
@@ -628,6 +627,7 @@  static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
 	{"run", 0600, hctx_run_show, hctx_run_write},
 	{"active", 0400, hctx_active_show},
 	{"dispatch_busy", 0400, hctx_dispatch_busy_show},
+	{"requeue_list", 0400, .seq_ops = &hctx_requeue_list_seq_ops},
 	{"type", 0400, hctx_type_show},
 	{},
 };
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 453a90767f7a..c359a28d9b25 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1421,6 +1421,7 @@  static void __blk_mq_requeue_request(struct request *rq)
 void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
 {
 	struct request_queue *q = rq->q;
+	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 	unsigned long flags;
 
 	__blk_mq_requeue_request(rq);
@@ -1428,9 +1429,9 @@  void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
 	/* this request will be re-inserted to io scheduler queue */
 	blk_mq_sched_requeue_request(rq);
 
-	spin_lock_irqsave(&q->requeue_lock, flags);
-	list_add_tail(&rq->queuelist, &q->requeue_list);
-	spin_unlock_irqrestore(&q->requeue_lock, flags);
+	spin_lock_irqsave(&hctx->requeue_lock, flags);
+	list_add_tail(&rq->queuelist, &hctx->requeue_list);
+	spin_unlock_irqrestore(&hctx->requeue_lock, flags);
 
 	if (kick_requeue_list)
 		blk_mq_kick_requeue_list(q);
@@ -1439,16 +1440,16 @@  EXPORT_SYMBOL(blk_mq_requeue_request);
 
 static void blk_mq_requeue_work(struct work_struct *work)
 {
-	struct request_queue *q =
-		container_of(work, struct request_queue, requeue_work.work);
+	struct blk_mq_hw_ctx *hctx =
+		container_of(work, struct blk_mq_hw_ctx, requeue_work.work);
 	LIST_HEAD(requeue_list);
 	LIST_HEAD(flush_list);
 	struct request *rq;
 
-	spin_lock_irq(&q->requeue_lock);
-	list_splice_init(&q->requeue_list, &requeue_list);
-	list_splice_init(&q->flush_list, &flush_list);
-	spin_unlock_irq(&q->requeue_lock);
+	spin_lock_irq(&hctx->requeue_lock);
+	list_splice_init(&hctx->requeue_list, &requeue_list);
+	list_splice_init(&hctx->flush_list, &flush_list);
+	spin_unlock_irq(&hctx->requeue_lock);
 
 	while (!list_empty(&requeue_list)) {
 		rq = list_entry(requeue_list.next, struct request, queuelist);
@@ -1471,20 +1472,30 @@  static void blk_mq_requeue_work(struct work_struct *work)
 		blk_mq_insert_request(rq, 0);
 	}
 
-	blk_mq_run_hw_queues(q, false);
+	blk_mq_run_hw_queue(hctx, false);
 }
 
 void blk_mq_kick_requeue_list(struct request_queue *q)
 {
-	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
+	struct blk_mq_hw_ctx *hctx;
+	unsigned long i;
+
+	queue_for_each_hw_ctx(q, hctx, i)
+		kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND,
+					    &hctx->requeue_work, 0);
 }
 EXPORT_SYMBOL(blk_mq_kick_requeue_list);
 
 void blk_mq_delay_kick_requeue_list(struct request_queue *q,
 				    unsigned long msecs)
 {
-	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
-				    msecs_to_jiffies(msecs));
+	struct blk_mq_hw_ctx *hctx;
+	unsigned long i;
+
+	queue_for_each_hw_ctx(q, hctx, i)
+		kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND,
+					    &hctx->requeue_work,
+					    msecs_to_jiffies(msecs));
 }
 EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
 
@@ -3614,6 +3625,11 @@  static int blk_mq_init_hctx(struct request_queue *q,
 		struct blk_mq_tag_set *set,
 		struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
 {
+	INIT_DELAYED_WORK(&hctx->requeue_work, blk_mq_requeue_work);
+	INIT_LIST_HEAD(&hctx->flush_list);
+	INIT_LIST_HEAD(&hctx->requeue_list);
+	spin_lock_init(&hctx->requeue_lock);
+
 	hctx->queue_num = hctx_idx;
 
 	if (!(hctx->flags & BLK_MQ_F_STACKING))
@@ -4229,11 +4245,6 @@  int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
 	blk_mq_update_poll_flag(q);
 
-	INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
-	INIT_LIST_HEAD(&q->flush_list);
-	INIT_LIST_HEAD(&q->requeue_list);
-	spin_lock_init(&q->requeue_lock);
-
 	q->nr_requests = set->queue_depth;
 
 	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
@@ -4782,10 +4793,10 @@  void blk_mq_cancel_work_sync(struct request_queue *q)
 	struct blk_mq_hw_ctx *hctx;
 	unsigned long i;
 
-	cancel_delayed_work_sync(&q->requeue_work);
-
-	queue_for_each_hw_ctx(q, hctx, i)
+	queue_for_each_hw_ctx(q, hctx, i) {
+		cancel_delayed_work_sync(&hctx->requeue_work);
 		cancel_delayed_work_sync(&hctx->run_work);
+	}
 }
 
 static int __init blk_mq_init(void)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2610b299ec77..672e8880f9e2 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -308,6 +308,12 @@  struct blk_mq_hw_ctx {
 		unsigned long		state;
 	} ____cacheline_aligned_in_smp;
 
+	struct list_head	flush_list;
+
+	struct list_head	requeue_list;
+	spinlock_t		requeue_lock;
+	struct delayed_work	requeue_work;
+
 	/**
 	 * @run_work: Used for scheduling a hardware queue run at a later time.
 	 */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ed44a997f629..ed4f89657f1f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -479,11 +479,6 @@  struct request_queue {
 	 * for flush operations
 	 */
 	struct blk_flush_queue	*fq;
-	struct list_head	flush_list;
-
-	struct list_head	requeue_list;
-	spinlock_t		requeue_lock;
-	struct delayed_work	requeue_work;
 
 	struct mutex		sysfs_lock;
 	struct mutex		sysfs_dir_lock;