
blk-mq: aggregate ktime_get per request batch

Message ID 20221004153004.2058994-1-kbusch@meta.com (mailing list archive)
State New, archived

Commit Message

Keith Busch Oct. 4, 2022, 3:30 p.m. UTC
From: Keith Busch <kbusch@kernel.org>

The request statistics fields don't need the precision of a timestamp
taken at the exact point where each one is set. We can read the time
just once per batch to reduce the overhead of this repeated call. This
only really helps when requests are batched, but there's no harm when
sending them one at a time.

Using fio's t/io_uring benchmark with default settings, time spent in
ktime_get() is reduced from ~6% to <1%, which is good for a ~3% IOPS
boost when stats are enabled.
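
In other words, the change hoists the clock read out of the per-request
path, roughly like this (simplified sketch of the batched queue_rqs()
path, not the literal code in the diff below):

	u64 now = 0;

	if (blk_queue_stat(q))
		now = ktime_get_ns();	/* one clock read per batch */

	rq_list_for_each(rqlist, rq)
		rq->io_start_time_ns = now;	/* reused for each request */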

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/blk-mq.c             | 88 ++++++++++++++++++++++++--------------
 drivers/block/virtio_blk.c |  3 +-
 drivers/nvme/host/pci.c    |  3 +-
 include/linux/blk-mq.h     |  2 +-
 include/linux/blkdev.h     |  7 +++
 5 files changed, 69 insertions(+), 34 deletions(-)

Comments

Christoph Hellwig Oct. 10, 2022, 7:30 a.m. UTC | #1
>  	rq->timeout = 0;
>  
> -	if (blk_mq_need_time_stamp(rq))
> -		rq->start_time_ns = ktime_get_ns();
> -	else
> -		rq->start_time_ns = 0;
> +	rq->start_time_ns = now;
>  	rq->part = NULL;
>  #ifdef CONFIG_BLK_RQ_ALLOC_TIME
> -	rq->alloc_time_ns = alloc_time_ns;
> +	rq->alloc_time_ns = now;
>  #endif

Given that rq->start_time_ns and rq->alloc_time_ns are now always
the same (except for the odd flush case), do we even need both fields?

>  static inline struct request *
>  __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data,
> -		u64 alloc_time_ns)
> +		u64 now)

Nit: this fits into the previous line now.

>  	return blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag,
> -					alloc_time_ns);
> +					now);

Same here.

>  static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
> -					    struct request *rq, bool last)
> +					    struct request *rq, bool last, u64 now)

Overly long line.  FYI, this becomes a lot more readable anyway with
the two tab indent:

static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
		struct request *rq, bool last, u64 now)

> @@ -2521,6 +2528,7 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
>  	 * Any other error (busy), just add it to our list as we
>  	 * previously would have done.
>  	 */
> +	rq->io_start_time_ns = now;

Looking through this more I hate all these extra assignments, and
it will change the accounting a bit from what we had.  Why can't we
pass the time to blk_mq_start_request and just keep it where it was?
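
Something like this, maybe (just a sketch; the fallback clock read is
only needed if a zero timestamp can still reach this point):

	void blk_mq_start_request(struct request *rq, u64 now)
	{
		...
		if (blk_queue_stat(rq->q)) {
			rq->io_start_time_ns = now ?: ktime_get_ns();
			rq->stats_sectors = blk_rq_sectors(rq);
			rq->rq_flags |= RQF_STATS;
			rq_qos_issue(rq->q, rq);
		}
		...
	}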

> +	u64 now = 0;
> +
> +	if (blk_queue_stat(rq->q))
> +		now = ktime_get_ns();

This code pattern is duplicated a lot.  Can't we have a nice
helper and shortcut it to:

	u64 now = blk_ktime_get_ns_for_stats(rq->q);

and use the opportunity to document the logic in the helper.
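
Something like (just a sketch, and the condition probably needs to
cover more than the stats flag):

	/*
	 * Only read the clock if something will actually consume the
	 * timestamp, i.e. when blk-stats are enabled for the queue.
	 */
	static inline u64 blk_ktime_get_ns_for_stats(struct request_queue *q)
	{
		if (blk_queue_stat(q))
			return ktime_get_ns();
		return 0;
	}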

> -static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
> +static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last, u64 now)

Overly long line here.  But I really wonder why we even need this helper.

Patch

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8070b6c10e8d..c759fb36b684 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -339,7 +339,7 @@  void blk_rq_init(struct request_queue *q, struct request *rq)
 EXPORT_SYMBOL(blk_rq_init);
 
 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
-		struct blk_mq_tags *tags, unsigned int tag, u64 alloc_time_ns)
+		struct blk_mq_tags *tags, unsigned int tag, u64 now)
 {
 	struct blk_mq_ctx *ctx = data->ctx;
 	struct blk_mq_hw_ctx *hctx = data->hctx;
@@ -366,15 +366,11 @@  static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	}
 	rq->timeout = 0;
 
-	if (blk_mq_need_time_stamp(rq))
-		rq->start_time_ns = ktime_get_ns();
-	else
-		rq->start_time_ns = 0;
+	rq->start_time_ns = now;
 	rq->part = NULL;
 #ifdef CONFIG_BLK_RQ_ALLOC_TIME
-	rq->alloc_time_ns = alloc_time_ns;
+	rq->alloc_time_ns = now;
 #endif
-	rq->io_start_time_ns = 0;
 	rq->stats_sectors = 0;
 	rq->nr_phys_segments = 0;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
@@ -407,7 +403,7 @@  static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 
 static inline struct request *
 __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data,
-		u64 alloc_time_ns)
+		u64 now)
 {
 	unsigned int tag, tag_offset;
 	struct blk_mq_tags *tags;
@@ -426,7 +422,7 @@  __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data,
 		tag = tag_offset + i;
 		prefetch(tags->static_rqs[tag]);
 		tag_mask &= ~(1UL << i);
-		rq = blk_mq_rq_ctx_init(data, tags, tag, alloc_time_ns);
+		rq = blk_mq_rq_ctx_init(data, tags, tag, now);
 		rq_list_add(data->cached_rq, rq);
 		nr++;
 	}
@@ -440,13 +436,13 @@  __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data,
 static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 {
 	struct request_queue *q = data->q;
-	u64 alloc_time_ns = 0;
 	struct request *rq;
 	unsigned int tag;
+	u64 now = 0;
 
 	/* alloc_time includes depth and tag waits */
-	if (blk_queue_rq_alloc_time(q))
-		alloc_time_ns = ktime_get_ns();
+	if (blk_queue_need_ts(q))
+		now = ktime_get_ns();
 
 	if (data->cmd_flags & REQ_NOWAIT)
 		data->flags |= BLK_MQ_REQ_NOWAIT;
@@ -481,7 +477,7 @@  static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 	 * Try batched alloc if we want more than 1 tag.
 	 */
 	if (data->nr_tags > 1) {
-		rq = __blk_mq_alloc_requests_batch(data, alloc_time_ns);
+		rq = __blk_mq_alloc_requests_batch(data, now);
 		if (rq)
 			return rq;
 		data->nr_tags = 1;
@@ -507,7 +503,7 @@  static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 	}
 
 	return blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag,
-					alloc_time_ns);
+					now);
 }
 
 static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
@@ -610,14 +606,14 @@  struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 		.cmd_flags	= opf,
 		.nr_tags	= 1,
 	};
-	u64 alloc_time_ns = 0;
 	unsigned int cpu;
 	unsigned int tag;
+	u64 now = 0;
 	int ret;
 
 	/* alloc_time includes depth and tag waits */
-	if (blk_queue_rq_alloc_time(q))
-		alloc_time_ns = ktime_get_ns();
+	if (blk_queue_need_ts(q))
+		now = ktime_get_ns();
 
 	/*
 	 * If the tag allocator sleeps we could get an allocation for a
@@ -661,7 +657,7 @@  struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 	if (tag == BLK_MQ_NO_TAG)
 		goto out_queue_exit;
 	return blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag,
-					alloc_time_ns);
+					now);
 
 out_queue_exit:
 	blk_queue_exit(q);
@@ -1214,8 +1210,13 @@  void blk_mq_start_request(struct request *rq)
 
 	trace_block_rq_issue(rq);
 
-	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
-		rq->io_start_time_ns = ktime_get_ns();
+	if (blk_queue_stat(q)) {
+		/*
+		 * io_start_time_ns may not have been set earlier if the stat
+		 * attribute was being changed.
+		 */
+		if (!rq->io_start_time_ns)
+			rq->io_start_time_ns = ktime_get_ns();
 		rq->stats_sectors = blk_rq_sectors(rq);
 		rq->rq_flags |= RQF_STATS;
 		rq_qos_issue(q, rq);
@@ -1944,10 +1945,14 @@  bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
 	blk_status_t ret = BLK_STS_OK;
 	LIST_HEAD(zone_list);
 	bool needs_resource = false;
+	u64 now = 0;
 
 	if (list_empty(list))
 		return false;
 
+	if (blk_queue_stat(q))
+		now = ktime_get_ns();
+
 	/*
 	 * Now process all the entries, sending them to the driver.
 	 */
@@ -1983,6 +1988,8 @@  bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
 		 */
 		if (nr_budgets)
 			nr_budgets--;
+
+		rq->io_start_time_ns = now;
 		ret = q->mq_ops->queue_rq(hctx, &bd);
 		switch (ret) {
 		case BLK_STS_OK:
@@ -2507,7 +2514,7 @@  static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
 }
 
 static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
-					    struct request *rq, bool last)
+					    struct request *rq, bool last, u64 now)
 {
 	struct request_queue *q = rq->q;
 	struct blk_mq_queue_data bd = {
@@ -2521,6 +2528,7 @@  static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
 	 * Any other error (busy), just add it to our list as we
 	 * previously would have done.
 	 */
+	rq->io_start_time_ns = now;
 	ret = q->mq_ops->queue_rq(hctx, &bd);
 	switch (ret) {
 	case BLK_STS_OK:
@@ -2541,7 +2549,7 @@  static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
 
 static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 						struct request *rq,
-						bool bypass_insert, bool last)
+						bool bypass_insert, bool last, u64 now)
 {
 	struct request_queue *q = rq->q;
 	bool run_queue = true;
@@ -2574,7 +2582,7 @@  static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 		goto insert;
 	}
 
-	return __blk_mq_issue_directly(hctx, rq, last);
+	return __blk_mq_issue_directly(hctx, rq, last, now);
 insert:
 	if (bypass_insert)
 		return BLK_STS_RESOURCE;
@@ -2597,8 +2605,13 @@  static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 		struct request *rq)
 {
-	blk_status_t ret =
-		__blk_mq_try_issue_directly(hctx, rq, false, true);
+	blk_status_t ret;
+	u64 now = 0;
+
+	if (blk_queue_stat(rq->q))
+		now = ktime_get_ns();
+
+	ret = __blk_mq_try_issue_directly(hctx, rq, false, true, now);
 
 	if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
 		blk_mq_request_bypass_insert(rq, false, true);
@@ -2606,14 +2619,15 @@  static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 		blk_mq_end_request(rq, ret);
 }
 
-static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
+static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last, u64 now)
 {
-	return __blk_mq_try_issue_directly(rq->mq_hctx, rq, true, last);
+	return __blk_mq_try_issue_directly(rq->mq_hctx, rq, true, last, now);
 }
 
 static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule)
 {
 	struct blk_mq_hw_ctx *hctx = NULL;
+	u64 now = ktime_get_ns();
 	struct request *rq;
 	int queued = 0;
 	int errors = 0;
@@ -2628,7 +2642,7 @@  static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule)
 			hctx = rq->mq_hctx;
 		}
 
-		ret = blk_mq_request_issue_directly(rq, last);
+		ret = blk_mq_request_issue_directly(rq, last, now);
 		switch (ret) {
 		case BLK_STS_OK:
 			queued++;
@@ -2656,9 +2670,14 @@  static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule)
 static void __blk_mq_flush_plug_list(struct request_queue *q,
 				     struct blk_plug *plug)
 {
+	u64 now = 0;
+
 	if (blk_queue_quiesced(q))
 		return;
-	q->mq_ops->queue_rqs(&plug->mq_list);
+	if (blk_queue_stat(q))
+		now = ktime_get_ns();
+
+	q->mq_ops->queue_rqs(&plug->mq_list, now);
 }
 
 static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
@@ -2736,6 +2755,10 @@  void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
 {
 	int queued = 0;
 	int errors = 0;
+	u64 now = 0;
+
+	if (blk_queue_stat(hctx->queue))
+		now = ktime_get_ns();
 
 	while (!list_empty(list)) {
 		blk_status_t ret;
@@ -2743,7 +2766,7 @@  void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
 				queuelist);
 
 		list_del_init(&rq->queuelist);
-		ret = blk_mq_request_issue_directly(rq, list_empty(list));
+		ret = blk_mq_request_issue_directly(rq, list_empty(list), now);
 		if (ret != BLK_STS_OK) {
 			errors++;
 			if (ret == BLK_STS_RESOURCE ||
@@ -2938,6 +2961,7 @@  blk_status_t blk_insert_cloned_request(struct request *rq)
 	struct request_queue *q = rq->q;
 	unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
 	blk_status_t ret;
+	u64 now = 0;
 
 	if (blk_rq_sectors(rq) > max_sectors) {
 		/*
@@ -2975,6 +2999,8 @@  blk_status_t blk_insert_cloned_request(struct request *rq)
 	if (blk_crypto_insert_cloned_request(rq))
 		return BLK_STS_IOERR;
 
+	if (blk_do_io_stat(rq))
+		now = ktime_get_ns();
 	blk_account_io_start(rq);
 
 	/*
@@ -2983,7 +3009,7 @@  blk_status_t blk_insert_cloned_request(struct request *rq)
 	 * insert.
 	 */
 	blk_mq_run_dispatch_ops(q,
-			ret = blk_mq_request_issue_directly(rq, true));
+			ret = blk_mq_request_issue_directly(rq, true, now));
 	if (ret)
 		blk_account_io_done(rq, ktime_get_ns());
 	return ret;
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 3f4739d52268..b3b391c56d8b 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -417,7 +417,7 @@  static bool virtblk_add_req_batch(struct virtio_blk_vq *vq,
 	return kick;
 }
 
-static void virtio_queue_rqs(struct request **rqlist)
+static void virtio_queue_rqs(struct request **rqlist, u64 now)
 {
 	struct request *req, *next, *prev = NULL;
 	struct request *requeue_list = NULL;
@@ -426,6 +426,7 @@  static void virtio_queue_rqs(struct request **rqlist)
 		struct virtio_blk_vq *vq = get_virtio_blk_vq(req->mq_hctx);
 		bool kick;
 
+		req->io_start_time_ns = now;
 		if (!virtblk_prep_rq_batch(req)) {
 			rq_list_move(rqlist, &requeue_list, req, prev);
 			req = prev;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 5b796efa325b..6fc14a3f2980 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -981,7 +981,7 @@  static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req)
 	return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK;
 }
 
-static void nvme_queue_rqs(struct request **rqlist)
+static void nvme_queue_rqs(struct request **rqlist, u64 now)
 {
 	struct request *req, *next, *prev = NULL;
 	struct request *requeue_list = NULL;
@@ -989,6 +989,7 @@  static void nvme_queue_rqs(struct request **rqlist)
 	rq_list_for_each_safe(rqlist, req, next) {
 		struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
 
+		req->io_start_time_ns = now;
 		if (!nvme_prep_rq_batch(nvmeq, req)) {
 			/* detach 'req' and add to remainder list */
 			rq_list_move(rqlist, &requeue_list, req, prev);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ba18e9bdb799..071ebd7fd6c9 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -562,7 +562,7 @@  struct blk_mq_ops {
 	 * empty the @rqlist completely, then the rest will be queued
 	 * individually by the block layer upon return.
 	 */
-	void (*queue_rqs)(struct request **rqlist);
+	void (*queue_rqs)(struct request **rqlist, u64 now);
 
 	/**
 	 * @get_budget: Reserve budget before queue request, once .queue_rq is
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 49373d002631..6bda1414cca0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -599,6 +599,7 @@  bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
 #define blk_queue_stable_writes(q) \
 	test_bit(QUEUE_FLAG_STABLE_WRITES, &(q)->queue_flags)
 #define blk_queue_io_stat(q)	test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
+#define blk_queue_stat(q)	test_bit(QUEUE_FLAG_STATS, &(q)->queue_flags)
 #define blk_queue_add_random(q)	test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
 #define blk_queue_zone_resetall(q)	\
 	test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags)
@@ -612,6 +613,12 @@  bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
 #define blk_queue_rq_alloc_time(q)	false
 #endif
 
+#define blk_queue_need_ts(q)		\
+	(blk_queue_rq_alloc_time(q) ||	\
+	 blk_queue_stat(q) || 		\
+	 blk_queue_io_stat(q) || 	\
+	 q->elevator)
+
 #define blk_noretry_request(rq) \
 	((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
 			     REQ_FAILFAST_DRIVER))