
[v3,1/3] blk-mq: always use __blk_mq_alloc_requests() to alloc and init rq

Message ID 20230628124546.1056698-2-chengming.zhou@linux.dev (mailing list archive)
State New, archived
Series: blk-mq: fix start_time_ns and alloc_time_ns for pre-allocated rq

Commit Message

Chengming Zhou June 28, 2023, 12:45 p.m. UTC
From: Chengming Zhou <zhouchengming@bytedance.com>

This patch is preparation for the next patch, which calls ktime_get_ns() only
once when setting start_time_ns for batched pre-allocated requests.

1. data->flags is input to blk_mq_rq_ctx_init(), so the data->rq_flags
   setup derived from it shouldn't be redone on every blk_mq_rq_ctx_init()
   call during batched request allocation. Move that initialization into
   the caller.

2. Make blk_mq_alloc_request_hctx() reuse __blk_mq_alloc_requests()
   instead of calling blk_mq_rq_ctx_init() directly, which avoids
   duplicating the same data->rq_flags initialization there.

After these cleanups, __blk_mq_alloc_requests() is the only entry point
to allocate and initialize a request.
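
For reference, with this change the flag setup ends up in
__blk_mq_alloc_requests() and looks roughly as below (condensed from the
diff that follows):

	if (data->flags & BLK_MQ_REQ_RESERVED)
		data->rq_flags |= RQF_RESV;

	if (data->flags & BLK_MQ_REQ_PM)
		data->rq_flags |= RQF_PM;

	if (blk_queue_io_stat(q))
		data->rq_flags |= RQF_IO_STAT;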

Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
---
 block/blk-mq.c | 46 ++++++++++++++++++----------------------------
 1 file changed, 18 insertions(+), 28 deletions(-)

Comments

Christoph Hellwig June 29, 2023, 5:28 a.m. UTC | #1
On Wed, Jun 28, 2023 at 08:45:44PM +0800, chengming.zhou@linux.dev wrote:
> After these cleanup, __blk_mq_alloc_requests() is the only entry to
> alloc and init rq.

I find the code a little hard to follow now, due to the optional
setting of the ctx.  We also introduce really odd behavior here
if the caller for a hctx-specific allocation doesn't have free
tags, as we'll now run into the normal retry path.

Is this really needed for your timestamp changes?  If not I'd prefer
to skip it.
Chengming Zhou June 29, 2023, 7:40 a.m. UTC | #2
On 2023/6/29 13:28, Christoph Hellwig wrote:
> On Wed, Jun 28, 2023 at 08:45:44PM +0800, chengming.zhou@linux.dev wrote:
>> After these cleanup, __blk_mq_alloc_requests() is the only entry to
>> alloc and init rq.
> 
> I find the code a little hard to follow now, due to the optional
> setting of the ctx.  We also introduce really odd behavior here
> if the caller for a hctx-specific allocation doesn't have free
> tags, as we'll now run into the normal retry path.
> 
> Is this really needed for your timestamp changes?  If not I'd prefer
> to skip it.
> 

Thanks for your review!

Since the hctx-specific allocation path always has the BLK_MQ_REQ_NOWAIT flag
set, it won't hit the retry path.
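
For context, the guard near the top of blk_mq_alloc_request_hctx()
enforces this; from memory (paraphrased, not copied from a specific tree)
it looks roughly like:

```
	/*
	 * A hctx-specific allocation must not block, since the tag
	 * allocator could sleep and hand us a tag for a different hctx,
	 * so both NOWAIT and a reserved tag are required.
	 */
	if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) ||
	    WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED)))
		return ERR_PTR(-EINVAL);
```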

But I agree, this makes the general __blk_mq_alloc_requests() more complex.

The reason is that blk_mq_rq_ctx_init() does some data->rq_flags initialization:

```
if (data->flags & BLK_MQ_REQ_PM)
	data->rq_flags |= RQF_PM;
if (blk_queue_io_stat(q))
	data->rq_flags |= RQF_IO_STAT;
rq->rq_flags = data->rq_flags;
```

Because we need data->rq_flags to tell whether start_time_ns is needed,
this initialization has to move into the callers of blk_mq_rq_ctx_init().
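
As a reminder, blk_mq_need_time_stamp() decides based on rq->rq_flags,
which is why the rq_flags setup has to happen first. Roughly (flag names
from memory for this era of the tree):

```
static inline bool blk_mq_need_time_stamp(struct request *rq)
{
	return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_USE_SCHED));
}
```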

Now we basically have two callers: the first is the general __blk_mq_alloc_requests(),
and the second is the special blk_mq_alloc_request_hctx(). So I changed the second
caller to reuse __blk_mq_alloc_requests().

Or should we put this data->rq_flags initialization in blk_mq_alloc_request_hctx() too?
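
That hypothetical alternative would duplicate the same checks in
blk_mq_alloc_request_hctx(), e.g. (sketch only, not part of this patch):

```
	/* hypothetical duplication, mirroring __blk_mq_alloc_requests() */
	if (flags & BLK_MQ_REQ_PM)
		data.rq_flags |= RQF_PM;
	if (blk_queue_io_stat(q))
		data.rq_flags |= RQF_IO_STAT;
```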

Thanks.
Christoph Hellwig July 10, 2023, 7:36 a.m. UTC | #3
On Thu, Jun 29, 2023 at 03:40:03PM +0800, Chengming Zhou wrote:
> Thanks for your review!
> 
> Since hctx-specific allocation path always has BLK_MQ_REQ_NOWAIT flag,
> it won't retry.
> 
> But I agree, this makes the general __blk_mq_alloc_requests() more complex.

And also very confusing as it pretends to share some code, while almost
nothing of __blk_mq_alloc_requests is actually used.

> The reason is blk_mq_rq_ctx_init() has some data->rq_flags initialization:
> 
> ```
> if (data->flags & BLK_MQ_REQ_PM)
> 	data->rq_flags |= RQF_PM;
> if (blk_queue_io_stat(q))
> 	data->rq_flags |= RQF_IO_STAT;
> rq->rq_flags = data->rq_flags;
> ```
> 
> Because we need this data->rq_flags to tell if we need start_time_ns,
> we need to put these initialization in the callers of blk_mq_rq_ctx_init().

Why can't we just always initialize the time stamps after
blk_mq_rq_ctx_init()? Something like this (untested) variant of your
patch 2 from the latest iteration:

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5504719b970d59..55bf1009f3e32a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -328,8 +328,26 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 }
 EXPORT_SYMBOL(blk_rq_init);
 
+/* Set alloc and start time when pre-allocated rq is actually used */
+static inline void blk_mq_rq_time_init(struct request *rq, bool set_alloc_time)
+{
+	if (blk_mq_need_time_stamp(rq)) {
+		u64 now = ktime_get_ns();
+
+#ifdef CONFIG_BLK_RQ_ALLOC_TIME
+		/*
+		 * The alloc time is only used by iocost for now,
+		 * only possible when blk_mq_need_time_stamp().
+		 */
+		if (set_alloc_time)
+			rq->alloc_time_ns = now;
+#endif
+		rq->start_time_ns = now;
+	}
+}
+
 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
-		struct blk_mq_tags *tags, unsigned int tag, u64 alloc_time_ns)
+		struct blk_mq_tags *tags, unsigned int tag)
 {
 	struct blk_mq_ctx *ctx = data->ctx;
 	struct blk_mq_hw_ctx *hctx = data->hctx;
@@ -356,14 +374,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	}
 	rq->timeout = 0;
 
-	if (blk_mq_need_time_stamp(rq))
-		rq->start_time_ns = ktime_get_ns();
-	else
-		rq->start_time_ns = 0;
 	rq->part = NULL;
-#ifdef CONFIG_BLK_RQ_ALLOC_TIME
-	rq->alloc_time_ns = alloc_time_ns;
-#endif
 	rq->io_start_time_ns = 0;
 	rq->stats_sectors = 0;
 	rq->nr_phys_segments = 0;
@@ -393,8 +404,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 }
 
 static inline struct request *
-__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data,
-		u64 alloc_time_ns)
+__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
 {
 	unsigned int tag, tag_offset;
 	struct blk_mq_tags *tags;
@@ -413,7 +423,7 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data,
 		tag = tag_offset + i;
 		prefetch(tags->static_rqs[tag]);
 		tag_mask &= ~(1UL << i);
-		rq = blk_mq_rq_ctx_init(data, tags, tag, alloc_time_ns);
+		rq = blk_mq_rq_ctx_init(data, tags, tag);
 		rq_list_add(data->cached_rq, rq);
 		nr++;
 	}
@@ -427,12 +437,13 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data,
 static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 {
 	struct request_queue *q = data->q;
+	bool set_alloc_time = blk_queue_rq_alloc_time(q);
 	u64 alloc_time_ns = 0;
 	struct request *rq;
 	unsigned int tag;
 
 	/* alloc_time includes depth and tag waits */
-	if (blk_queue_rq_alloc_time(q))
+	if (set_alloc_time)
 		alloc_time_ns = ktime_get_ns();
 
 	if (data->cmd_flags & REQ_NOWAIT)
@@ -474,9 +485,11 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 	 * Try batched alloc if we want more than 1 tag.
 	 */
 	if (data->nr_tags > 1) {
-		rq = __blk_mq_alloc_requests_batch(data, alloc_time_ns);
-		if (rq)
+		rq = __blk_mq_alloc_requests_batch(data);
+		if (rq) {
+			blk_mq_rq_time_init(rq, true);
 			return rq;
+		}
 		data->nr_tags = 1;
 	}
 
@@ -499,8 +512,10 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 		goto retry;
 	}
 
-	return blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag,
-					alloc_time_ns);
+	rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag);
+	if (rq)
+		blk_mq_rq_time_init(rq, set_alloc_time);
+	return rq;
 }
 
 static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
@@ -555,6 +570,7 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
 			return NULL;
 
 		plug->cached_rq = rq_list_next(rq);
+		blk_mq_rq_time_init(rq, blk_queue_rq_alloc_time(rq->q));
 	}
 
 	rq->cmd_flags = opf;
@@ -656,8 +672,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 	tag = blk_mq_get_tag(&data);
 	if (tag == BLK_MQ_NO_TAG)
 		goto out_queue_exit;
-	rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag,
-					alloc_time_ns);
+	rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
+	blk_mq_rq_time_init(rq, blk_queue_rq_alloc_time(rq->q));
 	rq->__data_len = 0;
 	rq->__sector = (sector_t) -1;
 	rq->bio = rq->biotail = NULL;
@@ -2896,6 +2912,7 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q,
 	plug->cached_rq = rq_list_next(rq);
 	rq_qos_throttle(q, *bio);
 
+	blk_mq_rq_time_init(rq, blk_queue_rq_alloc_time(rq->q));
 	rq->cmd_flags = (*bio)->bi_opf;
 	INIT_LIST_HEAD(&rq->queuelist);
 	return rq;
Chengming Zhou July 10, 2023, 11:07 a.m. UTC | #4
On 2023/7/10 15:36, Christoph Hellwig wrote:
> On Thu, Jun 29, 2023 at 03:40:03PM +0800, Chengming Zhou wrote:
>> Thanks for your review!
>>
>> Since hctx-specific allocation path always has BLK_MQ_REQ_NOWAIT flag,
>> it won't retry.
>>
>> But I agree, this makes the general __blk_mq_alloc_requests() more complex.
> 
> And also very confusing as it pretends to share some code, while almost
> nothing of __blk_mq_alloc_requests is actually used.

You are right.

I will not mess with reusing __blk_mq_alloc_requests() in the next version.

> 
>> The reason is blk_mq_rq_ctx_init() has some data->rq_flags initialization:
>>
>> ```
>> if (data->flags & BLK_MQ_REQ_PM)
>> 	data->rq_flags |= RQF_PM;
>> if (blk_queue_io_stat(q))
>> 	data->rq_flags |= RQF_IO_STAT;
>> rq->rq_flags = data->rq_flags;
>> ```
>>
>> Because we need this data->rq_flags to tell if we need start_time_ns,
>> we need to put these initialization in the callers of blk_mq_rq_ctx_init().
> 
> Why can't we just always initialize the time stampts after
> blk_mq_rq_ctx_init? Something like this (untested) variant of your
> patch 2 from the latest iteration:

I get what you mean: always initialize the two timestamps after blk_mq_rq_ctx_init(),
so that blk_mq_rq_time_init() knows whether the timestamps are needed.

It does seem better and clearer; I will change it as you suggest.

> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 5504719b970d59..55bf1009f3e32a 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -328,8 +328,26 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
>  }
>  EXPORT_SYMBOL(blk_rq_init);
>  
> +/* Set alloc and start time when pre-allocated rq is actually used */
> +static inline void blk_mq_rq_time_init(struct request *rq, bool set_alloc_time)

We need to pass "u64 alloc_time_ns" here, which by definition includes depth and tag wait time.

So:
1. for a non-batched request that needs alloc_time_ns: the passed alloc_time_ns != 0
2. for a batched request that needs alloc_time_ns: the passed alloc_time_ns == 0, and it will be set to start_time_ns (see the sketch below)

I have just updated the patch:
https://lore.kernel.org/all/20230710105516.2053478-1-chengming.zhou@linux.dev/
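
The updated helper in that version ends up looking roughly like this
(a sketch reconstructed from the description above; the linked v4 is
authoritative):

```
/* Set start and alloc time when the allocated request is actually used */
static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns)
{
	if (blk_mq_need_time_stamp(rq))
		rq->start_time_ns = ktime_get_ns();
	else
		rq->start_time_ns = 0;

#ifdef CONFIG_BLK_RQ_ALLOC_TIME
	/* a passed alloc_time_ns of 0 means batched: fall back to start_time_ns */
	if (blk_queue_rq_alloc_time(rq->q))
		rq->alloc_time_ns = alloc_time_ns ?: rq->start_time_ns;
	else
		rq->alloc_time_ns = 0;
#endif
}
```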

Thanks!

Patch

diff --git a/block/blk-mq.c b/block/blk-mq.c
index decb6ab2d508..c50ef953759f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -349,11 +349,6 @@  static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	rq->mq_ctx = ctx;
 	rq->mq_hctx = hctx;
 	rq->cmd_flags = data->cmd_flags;
-
-	if (data->flags & BLK_MQ_REQ_PM)
-		data->rq_flags |= RQF_PM;
-	if (blk_queue_io_stat(q))
-		data->rq_flags |= RQF_IO_STAT;
 	rq->rq_flags = data->rq_flags;
 
 	if (data->rq_flags & RQF_SCHED_TAGS) {
@@ -447,6 +442,15 @@  static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 	if (data->cmd_flags & REQ_NOWAIT)
 		data->flags |= BLK_MQ_REQ_NOWAIT;
 
+	if (data->flags & BLK_MQ_REQ_RESERVED)
+		data->rq_flags |= RQF_RESV;
+
+	if (data->flags & BLK_MQ_REQ_PM)
+		data->rq_flags |= RQF_PM;
+
+	if (blk_queue_io_stat(q))
+		data->rq_flags |= RQF_IO_STAT;
+
 	if (q->elevator) {
 		/*
 		 * All requests use scheduler tags when an I/O scheduler is
@@ -471,14 +475,15 @@  static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 	}
 
 retry:
-	data->ctx = blk_mq_get_ctx(q);
-	data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
+	/* See blk_mq_alloc_request_hctx() for details */
+	if (!data->ctx) {
+		data->ctx = blk_mq_get_ctx(q);
+		data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
+	}
+
 	if (!(data->rq_flags & RQF_SCHED_TAGS))
 		blk_mq_tag_busy(data->hctx);
 
-	if (data->flags & BLK_MQ_REQ_RESERVED)
-		data->rq_flags |= RQF_RESV;
-
 	/*
 	 * Try batched alloc if we want more than 1 tag.
 	 */
@@ -505,6 +510,7 @@  static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 		 * is going away.
 		 */
 		msleep(3);
+		data->ctx = NULL;
 		goto retry;
 	}
 
@@ -613,16 +619,10 @@  struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 		.cmd_flags	= opf,
 		.nr_tags	= 1,
 	};
-	u64 alloc_time_ns = 0;
 	struct request *rq;
 	unsigned int cpu;
-	unsigned int tag;
 	int ret;
 
-	/* alloc_time includes depth and tag waits */
-	if (blk_queue_rq_alloc_time(q))
-		alloc_time_ns = ktime_get_ns();
-
 	/*
 	 * If the tag allocator sleeps we could get an allocation for a
 	 * different hardware context.  No need to complicate the low level
@@ -653,20 +653,10 @@  struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 		goto out_queue_exit;
 	data.ctx = __blk_mq_get_ctx(q, cpu);
 
-	if (q->elevator)
-		data.rq_flags |= RQF_SCHED_TAGS;
-	else
-		blk_mq_tag_busy(data.hctx);
-
-	if (flags & BLK_MQ_REQ_RESERVED)
-		data.rq_flags |= RQF_RESV;
-
 	ret = -EWOULDBLOCK;
-	tag = blk_mq_get_tag(&data);
-	if (tag == BLK_MQ_NO_TAG)
+	rq = __blk_mq_alloc_requests(&data);
+	if (!rq)
 		goto out_queue_exit;
-	rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag,
-					alloc_time_ns);
 	rq->__data_len = 0;
 	rq->__sector = (sector_t) -1;
 	rq->bio = rq->biotail = NULL;