[v2,02/12] block: Send flush requests to the I/O scheduler

Message ID 20230407235822.1672286-3-bvanassche@acm.org (mailing list archive)
State New, archived
Series Submit zoned writes in order

Commit Message

Bart Van Assche April 7, 2023, 11:58 p.m. UTC
Prevent zoned writes with the FUA flag set from being reordered against
each other or against other zoned writes. Separate the I/O scheduler members
from the flush members in struct request since with this patch applied a
request may pass through both an I/O scheduler and the flush machinery.

Cc: Christoph Hellwig <hch@lst.de>
Cc: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/blk-flush.c      |  3 ++-
 block/blk-mq.c         | 11 ++++-------
 block/mq-deadline.c    |  2 +-
 include/linux/blk-mq.h | 27 +++++++++++----------------
 4 files changed, 18 insertions(+), 25 deletions(-)

Comments

Damien Le Moal April 10, 2023, 7:46 a.m. UTC | #1
On 4/8/23 08:58, Bart Van Assche wrote:
> Prevent zoned writes with the FUA flag set from being reordered against
> each other or against other zoned writes. Separate the I/O scheduler members
> from the flush members in struct request since with this patch applied a
> request may pass through both an I/O scheduler and the flush machinery.
> 
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Damien Le Moal <damien.lemoal@opensource.wdc.com>
> Cc: Ming Lei <ming.lei@redhat.com>
> Cc: Mike Snitzer <snitzer@kernel.org>
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
>  block/blk-flush.c      |  3 ++-
>  block/blk-mq.c         | 11 ++++-------
>  block/mq-deadline.c    |  2 +-
>  include/linux/blk-mq.h | 27 +++++++++++----------------
>  4 files changed, 18 insertions(+), 25 deletions(-)
> 
> diff --git a/block/blk-flush.c b/block/blk-flush.c
> index 53202eff545e..e0cf153388d8 100644
> --- a/block/blk-flush.c
> +++ b/block/blk-flush.c
> @@ -432,7 +432,8 @@ void blk_insert_flush(struct request *rq)
>  	 */
>  	if ((policy & REQ_FSEQ_DATA) &&
>  	    !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
> -		blk_mq_request_bypass_insert(rq, false, true);
> +		blk_mq_sched_insert_request(rq, /*at_head=*/false,
> +					    /*run_queue=*/true, /*async=*/true);
>  		return;
>  	}
>  
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index fefc9a728e0e..250556546bbf 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -390,8 +390,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
>  		INIT_HLIST_NODE(&rq->hash);
>  		RB_CLEAR_NODE(&rq->rb_node);
>  
> -		if (!op_is_flush(data->cmd_flags) &&
> -		    e->type->ops.prepare_request) {
> +		if (e->type->ops.prepare_request) {
>  			e->type->ops.prepare_request(rq);
>  			rq->rq_flags |= RQF_ELVPRIV;
>  		}
> @@ -452,13 +451,11 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
>  		data->rq_flags |= RQF_ELV;
>  
>  		/*
> -		 * Flush/passthrough requests are special and go directly to the
> -		 * dispatch list. Don't include reserved tags in the
> -		 * limiting, as it isn't useful.
> +		 * Do not limit the depth for passthrough requests nor for
> +		 * requests with a reserved tag.
>  		 */
> -		if (!op_is_flush(data->cmd_flags) &&
> +		if (e->type->ops.limit_depth &&
>  		    !blk_op_is_passthrough(data->cmd_flags) &&
> -		    e->type->ops.limit_depth &&
>  		    !(data->flags & BLK_MQ_REQ_RESERVED))
>  			e->type->ops.limit_depth(data->cmd_flags, data);
>  	}
> diff --git a/block/mq-deadline.c b/block/mq-deadline.c
> index f10c2a0d18d4..d885ccf49170 100644
> --- a/block/mq-deadline.c
> +++ b/block/mq-deadline.c
> @@ -789,7 +789,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
>  
>  	prio = ioprio_class_to_prio[ioprio_class];
>  	per_prio = &dd->per_prio[prio];
> -	if (!rq->elv.priv[0]) {
> +	if (!rq->elv.priv[0] && !(rq->rq_flags & RQF_FLUSH_SEQ)) {
>  		per_prio->stats.inserted++;
>  		rq->elv.priv[0] = (void *)(uintptr_t)1;
>  	}

Given that this change is not specific to zoned devices, you could rewrite
the commit message to mention that. And are bfq and kyber OK with this change
as well?

Also, to be consistent with this change, shouldn't blk_mq_sched_bypass_insert()
be updated as well? That function is called from blk_mq_sched_insert_request().
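
For context, blk_mq_sched_bypass_insert() is consulted near the top of
blk_mq_sched_insert_request(); the sketch below is a simplified paraphrase of
the v6.3-era call path (locking and the no-elevator branch omitted), not a
verbatim copy:

void blk_mq_sched_insert_request(struct request *rq, bool at_head,
				 bool run_queue, bool async)
{
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
	struct elevator_queue *e = rq->q->elevator;

	if (blk_mq_sched_bypass_insert(rq)) {
		/*
		 * Flush-sequence and passthrough requests skip the elevator
		 * and go straight to the hctx dispatch list.
		 */
		blk_mq_request_bypass_insert(rq, at_head, false);
	} else if (e) {
		LIST_HEAD(list);

		list_add(&rq->queuelist, &list);
		e->type->ops.insert_requests(hctx, &list, at_head);
	}

	if (run_queue)
		blk_mq_run_hw_queue(hctx, async);
}

Whether a request inserted through this path reaches the elevator is therefore
decided by that bypass test, which is what the question above is about.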

> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 06caacd77ed6..5e6c79ad83d2 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -169,25 +169,20 @@ struct request {
>  		void *completion_data;
>  	};
>  
> -
>  	/*
>  	 * Three pointers are available for the IO schedulers, if they need
> -	 * more they have to dynamically allocate it.  Flush requests are
> -	 * never put on the IO scheduler. So let the flush fields share
> -	 * space with the elevator data.
> +	 * more they have to dynamically allocate it.
>  	 */
> -	union {
> -		struct {
> -			struct io_cq		*icq;
> -			void			*priv[2];
> -		} elv;
> -
> -		struct {
> -			unsigned int		seq;
> -			struct list_head	list;
> -			rq_end_io_fn		*saved_end_io;
> -		} flush;
> -	};
> +	struct {
> +		struct io_cq		*icq;
> +		void			*priv[2];
> +	} elv;
> +
> +	struct {
> +		unsigned int		seq;
> +		struct list_head	list;
> +		rq_end_io_fn		*saved_end_io;
> +	} flush;
>  
>  	union {
>  		struct __call_single_data csd;

Bart Van Assche April 11, 2023, 12:15 a.m. UTC | #2
On 4/10/23 00:46, Damien Le Moal wrote:
> Given that this change is not specific to zoned devices, you could rewrite
> the commit message to mention that. And are bfq and kyber OK with this change
> as well?
> 
> Also, to be consistent with this change, shouldn't blk_mq_sched_bypass_insert()
> be updated as well? That function is called from blk_mq_sched_insert_request().

Hi Damien,

I'm considering replacing patch 02/12 of this series with the patch below:


Subject: [PATCH] block: Send flush requests to the I/O scheduler

Send flush requests to the I/O scheduler such that I/O scheduler policies
are applied to writes with the FUA flag set. Separate the I/O scheduler
members from the flush members in struct request since with this patch
applied a request may pass through both an I/O scheduler and the flush
machinery.

This change affects the statistics of I/O schedulers that track I/O
statistics (BFQ and mq-deadline).

Cc: Christoph Hellwig <hch@lst.de>
Cc: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Cc: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
  block/blk-flush.c      |  8 ++++++--
  block/blk-mq-sched.c   | 19 +++++++++++++++----
  block/blk-mq-sched.h   |  1 +
  block/blk-mq.c         | 22 +++++-----------------
  include/linux/blk-mq.h | 27 +++++++++++----------------
  5 files changed, 38 insertions(+), 39 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 53202eff545e..cf0afb75fafd 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -336,8 +336,11 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
  		 * account of this driver tag
  		 */
  		flush_rq->rq_flags |= RQF_MQ_INFLIGHT;
-	} else
+	} else {
  		flush_rq->internal_tag = first_rq->internal_tag;
+		flush_rq->rq_flags |= RQF_ELV;
+		blk_mq_sched_prepare(flush_rq);
+	}

  	flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
  	flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK);
@@ -432,7 +435,8 @@ void blk_insert_flush(struct request *rq)
  	 */
  	if ((policy & REQ_FSEQ_DATA) &&
  	    !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
-		blk_mq_request_bypass_insert(rq, false, true);
+		blk_mq_sched_insert_request(rq, /*at_head=*/false,
+					    /*run_queue=*/true, /*async=*/true);
  		return;
  	}

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 62c8c1ba1321..9938c35aa7ed 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -18,6 +18,20 @@
  #include "blk-mq-tag.h"
  #include "blk-wbt.h"

+/* Prepare a request for insertion into an I/O scheduler. */
+void blk_mq_sched_prepare(struct request *rq)
+{
+	struct elevator_queue *e = rq->q->elevator;
+
+	INIT_HLIST_NODE(&rq->hash);
+	RB_CLEAR_NODE(&rq->rb_node);
+
+	if (e->type->ops.prepare_request) {
+		e->type->ops.prepare_request(rq);
+		rq->rq_flags |= RQF_ELVPRIV;
+	}
+}
+
  /*
   * Mark a hardware queue as needing a restart.
   */
@@ -397,10 +411,7 @@ static bool blk_mq_sched_bypass_insert(struct request *rq)
  	 * passthrough request is added to scheduler queue, there isn't any
  	 * chance to dispatch it given we prioritize requests in hctx->dispatch.
  	 */
-	if ((rq->rq_flags & RQF_FLUSH_SEQ) || blk_rq_is_passthrough(rq))
-		return true;
-
-	return false;
+	return req_op(rq) == REQ_OP_FLUSH || blk_rq_is_passthrough(rq);
  }

  void blk_mq_sched_insert_request(struct request *rq, bool at_head,
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 025013972453..6337c5a66af6 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -8,6 +8,7 @@

  #define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ)

+void blk_mq_sched_prepare(struct request *rq);
  bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
  		unsigned int nr_segs, struct request **merged_request);
  bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index db93b1a71157..2731466b1952 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -384,18 +384,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
  	WRITE_ONCE(rq->deadline, 0);
  	req_ref_set(rq, 1);

-	if (rq->rq_flags & RQF_ELV) {
-		struct elevator_queue *e = data->q->elevator;
-
-		INIT_HLIST_NODE(&rq->hash);
-		RB_CLEAR_NODE(&rq->rb_node);
-
-		if (!op_is_flush(data->cmd_flags) &&
-		    e->type->ops.prepare_request) {
-			e->type->ops.prepare_request(rq);
-			rq->rq_flags |= RQF_ELVPRIV;
-		}
-	}
+	if (rq->rq_flags & RQF_ELV)
+		blk_mq_sched_prepare(rq);

  	return rq;
  }
@@ -452,13 +442,11 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
  		data->rq_flags |= RQF_ELV;

  		/*
-		 * Flush/passthrough requests are special and go directly to the
-		 * dispatch list. Don't include reserved tags in the
-		 * limiting, as it isn't useful.
+		 * Do not limit the depth for passthrough requests nor for
+		 * requests with a reserved tag.
  		 */
-		if (!op_is_flush(data->cmd_flags) &&
+		if (e->type->ops.limit_depth &&
  		    !blk_op_is_passthrough(data->cmd_flags) &&
-		    e->type->ops.limit_depth &&
  		    !(data->flags & BLK_MQ_REQ_RESERVED))
  			e->type->ops.limit_depth(data->cmd_flags, data);
  	}
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 06caacd77ed6..5e6c79ad83d2 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -169,25 +169,20 @@ struct request {
  		void *completion_data;
  	};

-
  	/*
  	 * Three pointers are available for the IO schedulers, if they need
-	 * more they have to dynamically allocate it.  Flush requests are
-	 * never put on the IO scheduler. So let the flush fields share
-	 * space with the elevator data.
+	 * more they have to dynamically allocate it.
  	 */
-	union {
-		struct {
-			struct io_cq		*icq;
-			void			*priv[2];
-		} elv;
-
-		struct {
-			unsigned int		seq;
-			struct list_head	list;
-			rq_end_io_fn		*saved_end_io;
-		} flush;
-	};
+	struct {
+		struct io_cq		*icq;
+		void			*priv[2];
+	} elv;
+
+	struct {
+		unsigned int		seq;
+		struct list_head	list;
+		rq_end_io_fn		*saved_end_io;
+	} flush;

  	union {
  		struct __call_single_data csd;

Christoph Hellwig April 11, 2023, 6:38 a.m. UTC | #3
On Mon, Apr 10, 2023 at 05:15:44PM -0700, Bart Van Assche wrote:
> Subject: [PATCH] block: Send flush requests to the I/O scheduler
>
> Send flush requests to the I/O scheduler such that I/O scheduler policies
> are applied to writes with the FUA flag set. Separate the I/O scheduler
> members from the flush members in struct request since with this patch
> applied a request may pass through both an I/O scheduler and the flush
> machinery.
>
> This change affects the statistics of I/O schedulers that track I/O
> statistics (BFQ and mq-deadline).

This looks reasonable to me, as these special cases are nasty.
But we'll need very careful testing, including performance testing
to ensure this doesn't regress.

> +		blk_mq_sched_insert_request(rq, /*at_head=*/false,
> +					    /*run_queue=*/true, /*async=*/true);

And please drop these silly comments.  If you want to do something about
this rather suboptimal interface, convert the three booleans to a flags
argument with properly named flags.
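
A flags-based variant along those lines could look as follows; the flag names
below are purely illustrative and do not come from the kernel tree:

/* Illustrative flag names only, not taken from the kernel source. */
#define BLK_MQ_INSERT_AT_HEAD	(1U << 0)
#define BLK_MQ_INSERT_RUN_QUEUE	(1U << 1)
#define BLK_MQ_INSERT_ASYNC	(1U << 2)

void blk_mq_sched_insert_request(struct request *rq, unsigned int flags);

/* The call site in blk_insert_flush() would then document itself: */
blk_mq_sched_insert_request(rq, BLK_MQ_INSERT_RUN_QUEUE | BLK_MQ_INSERT_ASYNC);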

> -	if ((rq->rq_flags & RQF_FLUSH_SEQ) || blk_rq_is_passthrough(rq))
> -		return true;
> -
> -	return false;
> +	return req_op(rq) == REQ_OP_FLUSH || blk_rq_is_passthrough(rq);

This just seems like an arbitrary reformatting.  While I also prefer
your new version, I don't think it belongs in this patch.

Bart Van Assche April 11, 2023, 5:13 p.m. UTC | #4
On 4/10/23 23:38, Christoph Hellwig wrote:
> On Mon, Apr 10, 2023 at 05:15:44PM -0700, Bart Van Assche wrote:
>> +		blk_mq_sched_insert_request(rq, /*at_head=*/false,
>> +					    /*run_queue=*/true, /*async=*/true);
> 
> And please drop these silly comments.  If you want to do something about
> this rather suboptimal interface, convert the three booleans to a flags
> argument with properly named flags.

I will remove these comments, but it is not clear to me what is silly about
them. There are even tools that can check the correctness of such argument
comments. See also
https://clang.llvm.org/extra/clang-tidy/checks/bugprone/argument-comment.html
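
That clang-tidy check compares each /*param=*/ comment against the parameter
name in the callee's declaration; a small illustration with a hypothetical
function (not kernel code):

void insert_request(bool at_head, bool run_queue);

insert_request(/*at_head=*/false, /*run_queue=*/true);  /* accepted */
insert_request(/*at_head=*/false, /*run=*/true);        /* flagged: "run" != "run_queue" */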

>> -	if ((rq->rq_flags & RQF_FLUSH_SEQ) || blk_rq_is_passthrough(rq))
>> -		return true;
>> -
>> -	return false;
>> +	return req_op(rq) == REQ_OP_FLUSH || blk_rq_is_passthrough(rq);
> 
> This just seems like an arbitrary reformatting.  While I also prefer
> your new version, I don't think it belongs in this patch.

I will move this change into a patch of its own.

Thanks,

Bart.

Patch

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 53202eff545e..e0cf153388d8 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -432,7 +432,8 @@  void blk_insert_flush(struct request *rq)
 	 */
 	if ((policy & REQ_FSEQ_DATA) &&
 	    !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
-		blk_mq_request_bypass_insert(rq, false, true);
+		blk_mq_sched_insert_request(rq, /*at_head=*/false,
+					    /*run_queue=*/true, /*async=*/true);
 		return;
 	}
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index fefc9a728e0e..250556546bbf 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -390,8 +390,7 @@  static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 		INIT_HLIST_NODE(&rq->hash);
 		RB_CLEAR_NODE(&rq->rb_node);
 
-		if (!op_is_flush(data->cmd_flags) &&
-		    e->type->ops.prepare_request) {
+		if (e->type->ops.prepare_request) {
 			e->type->ops.prepare_request(rq);
 			rq->rq_flags |= RQF_ELVPRIV;
 		}
@@ -452,13 +451,11 @@  static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 		data->rq_flags |= RQF_ELV;
 
 		/*
-		 * Flush/passthrough requests are special and go directly to the
-		 * dispatch list. Don't include reserved tags in the
-		 * limiting, as it isn't useful.
+		 * Do not limit the depth for passthrough requests nor for
+		 * requests with a reserved tag.
 		 */
-		if (!op_is_flush(data->cmd_flags) &&
+		if (e->type->ops.limit_depth &&
 		    !blk_op_is_passthrough(data->cmd_flags) &&
-		    e->type->ops.limit_depth &&
 		    !(data->flags & BLK_MQ_REQ_RESERVED))
 			e->type->ops.limit_depth(data->cmd_flags, data);
 	}
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index f10c2a0d18d4..d885ccf49170 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -789,7 +789,7 @@  static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 
 	prio = ioprio_class_to_prio[ioprio_class];
 	per_prio = &dd->per_prio[prio];
-	if (!rq->elv.priv[0]) {
+	if (!rq->elv.priv[0] && !(rq->rq_flags & RQF_FLUSH_SEQ)) {
 		per_prio->stats.inserted++;
 		rq->elv.priv[0] = (void *)(uintptr_t)1;
 	}
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 06caacd77ed6..5e6c79ad83d2 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -169,25 +169,20 @@  struct request {
 		void *completion_data;
 	};
 
-
 	/*
 	 * Three pointers are available for the IO schedulers, if they need
-	 * more they have to dynamically allocate it.  Flush requests are
-	 * never put on the IO scheduler. So let the flush fields share
-	 * space with the elevator data.
+	 * more they have to dynamically allocate it.
 	 */
-	union {
-		struct {
-			struct io_cq		*icq;
-			void			*priv[2];
-		} elv;
-
-		struct {
-			unsigned int		seq;
-			struct list_head	list;
-			rq_end_io_fn		*saved_end_io;
-		} flush;
-	};
+	struct {
+		struct io_cq		*icq;
+		void			*priv[2];
+	} elv;
+
+	struct {
+		unsigned int		seq;
+		struct list_head	list;
+		rq_end_io_fn		*saved_end_io;
+	} flush;
 
 	union {
 		struct __call_single_data csd;