[4/4] block: rearrange a few request fields for better cache layout

Message ID 1515544167-10751-5-git-send-email-axboe@kernel.dk (mailing list archive)
State New, archived

Commit Message

Jens Axboe Jan. 10, 2018, 12:29 a.m. UTC
Move completion related items (like the call single data) near the
end of the struct, instead of mixing them in with the initial
queueing related fields.

Move queuelist below the bio structures. Then we have all
queueing related bits in the first cache line.

This yields a 1.5-2% increase in IOPS for a null_blk test, both for
sync and for high thread count access. Sync test goes from 975K to
992K, 32-thread case from 20.8M to 21.2M IOPS.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         | 19 ++++++++++---------
 include/linux/blkdev.h | 28 +++++++++++++++-------------
 2 files changed, 25 insertions(+), 22 deletions(-)
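
An aside on the pattern: this is a hot/cold field split. Everything the
queueing path touches is packed into the first 64-byte cache line, and
completion-only state is pushed to the end so it does not share a line
with the hot fields. Below is a minimal standalone sketch of keeping such
a split honest at build time; struct example_rq and its fields are made
up for illustration and assume 64-byte cache lines, this is not the real
struct request.

#include <stddef.h>

struct example_rq {
	/* hot: touched on every queue operation, first cache line */
	void *q;
	void *mq_ctx;
	unsigned int cmd_flags;
	unsigned int rq_flags;
	int tag;
	int internal_tag;
	void *bio;
	void *biotail;

	/* cold: only touched at completion time */
	char csd[32];		/* stand-in for call_single_data */
	void (*end_io)(struct example_rq *, int);
};

/* fail the build if the hot fields spill past the first cache line */
_Static_assert(offsetof(struct example_rq, csd) <= 64,
	       "queueing fields must fit in the first 64-byte cache line");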

Comments

Bart Van Assche Jan. 10, 2018, 6:34 p.m. UTC | #1
On Tue, 2018-01-09 at 17:29 -0700, Jens Axboe wrote:
> Move completion related items (like the call single data) near the
> end of the struct, instead of mixing them in with the initial
> queueing related fields.
> 
> Move queuelist below the bio structures. Then we have all
> queueing related bits in the first cache line.
> 
> This yields a 1.5-2% increase in IOPS for a null_blk test, both for
> sync and for high thread count access. Sync test goes from 975K to
> 992K, 32-thread case from 20.8M to 21.2M IOPS.

That's a nice result!

Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
Omar Sandoval Jan. 10, 2018, 6:43 p.m. UTC | #2
On Tue, Jan 09, 2018 at 05:29:27PM -0700, Jens Axboe wrote:
> Move completion related items (like the call single data) near the
> end of the struct, instead of mixing them in with the initial
> queueing related fields.
> 
> Move queuelist below the bio structures. Then we have all
> queueing related bits in the first cache line.
> 
> This yields a 1.5-2% increase in IOPS for a null_blk test, both for
> sync and for high thread count access. Sync test goes from 975K to
> 992K, 32-thread case from 20.8M to 21.2M IOPS.

One nit below, otherwise

Reviewed-by: Omar Sandoval <osandov@fb.com>

> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  block/blk-mq.c         | 19 ++++++++++---------
>  include/linux/blkdev.h | 28 +++++++++++++++-------------
>  2 files changed, 25 insertions(+), 22 deletions(-)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 7248ee043651..ec128001ea8b 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -270,8 +270,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
>  	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
>  	struct request *rq = tags->static_rqs[tag];
>  
> -	rq->rq_flags = 0;
> -
>  	if (data->flags & BLK_MQ_REQ_INTERNAL) {
>  		rq->tag = -1;
>  		rq->internal_tag = tag;
> @@ -285,26 +283,23 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
>  		data->hctx->tags->rqs[rq->tag] = rq;
>  	}
>  
> -	INIT_LIST_HEAD(&rq->queuelist);
>  	/* csd/requeue_work/fifo_time is initialized before use */
>  	rq->q = data->q;
>  	rq->mq_ctx = data->ctx;
> +	rq->rq_flags = 0;
> +	rq->cpu = -1;
>  	rq->cmd_flags = op;
>  	if (data->flags & BLK_MQ_REQ_PREEMPT)
>  		rq->rq_flags |= RQF_PREEMPT;
>  	if (blk_queue_io_stat(data->q))
>  		rq->rq_flags |= RQF_IO_STAT;
> -	rq->cpu = -1;
> +	/* do not touch atomic flags, it needs atomic ops against the timer */

This comment was just removed in a previous patch but it snuck back in.
Jens Axboe Jan. 10, 2018, 6:45 p.m. UTC | #3
On 1/10/18 11:43 AM, Omar Sandoval wrote:
>> -	INIT_LIST_HEAD(&rq->queuelist);
>>  	/* csd/requeue_work/fifo_time is initialized before use */
>>  	rq->q = data->q;
>>  	rq->mq_ctx = data->ctx;
>> +	rq->rq_flags = 0;
>> +	rq->cpu = -1;
>>  	rq->cmd_flags = op;
>>  	if (data->flags & BLK_MQ_REQ_PREEMPT)
>>  		rq->rq_flags |= RQF_PREEMPT;
>>  	if (blk_queue_io_stat(data->q))
>>  		rq->rq_flags |= RQF_IO_STAT;
>> -	rq->cpu = -1;
>> +	/* do not touch atomic flags, it needs atomic ops against the timer */
> 
> This comment was just removed in a previous patch but it snuck back in.

Eagle eyes - thanks, I will kill it.
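
For anyone wanting to verify where the fields actually land after a
reshuffle like this, pahole on the built kernel reports the per-cache-line
layout of struct request directly. A rough userspace approximation, using
a made-up toy_request rather than the real struct, just prints which
64-byte line each field starts in:

#include <stdio.h>
#include <stddef.h>

struct toy_request {
	/* queueing side, intended to stay in cache line 0 */
	void *q, *mq_ctx;
	unsigned int cmd_flags, rq_flags;
	int tag, internal_tag;
	void *bio, *biotail;
	struct { void *next, *prev; } queuelist;
	/* completion side, pushed into later cache lines */
	char completion_data[64];	/* stand-in for csd/end_io/etc. */
};

#define SHOW(field)							\
	printf("%-16s offset %3zu  cache line %zu\n", #field,		\
	       offsetof(struct toy_request, field),			\
	       offsetof(struct toy_request, field) / 64)

int main(void)
{
	SHOW(q);
	SHOW(biotail);
	SHOW(queuelist);
	SHOW(completion_data);
	return 0;
}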

Patch

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 7248ee043651..ec128001ea8b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -270,8 +270,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
 	struct request *rq = tags->static_rqs[tag];
 
-	rq->rq_flags = 0;
-
 	if (data->flags & BLK_MQ_REQ_INTERNAL) {
 		rq->tag = -1;
 		rq->internal_tag = tag;
@@ -285,26 +283,23 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 		data->hctx->tags->rqs[rq->tag] = rq;
 	}
 
-	INIT_LIST_HEAD(&rq->queuelist);
 	/* csd/requeue_work/fifo_time is initialized before use */
 	rq->q = data->q;
 	rq->mq_ctx = data->ctx;
+	rq->rq_flags = 0;
+	rq->cpu = -1;
 	rq->cmd_flags = op;
 	if (data->flags & BLK_MQ_REQ_PREEMPT)
 		rq->rq_flags |= RQF_PREEMPT;
 	if (blk_queue_io_stat(data->q))
 		rq->rq_flags |= RQF_IO_STAT;
-	rq->cpu = -1;
+	/* do not touch atomic flags, it needs atomic ops against the timer */
+	INIT_LIST_HEAD(&rq->queuelist);
 	INIT_HLIST_NODE(&rq->hash);
 	RB_CLEAR_NODE(&rq->rb_node);
 	rq->rq_disk = NULL;
 	rq->part = NULL;
 	rq->start_time = jiffies;
-#ifdef CONFIG_BLK_CGROUP
-	rq->rl = NULL;
-	set_start_time_ns(rq);
-	rq->io_start_time_ns = 0;
-#endif
 	rq->nr_phys_segments = 0;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 	rq->nr_integrity_segments = 0;
@@ -321,6 +316,12 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	rq->end_io_data = NULL;
 	rq->next_rq = NULL;
 
+#ifdef CONFIG_BLK_CGROUP
+	rq->rl = NULL;
+	set_start_time_ns(rq);
+	rq->io_start_time_ns = 0;
+#endif
+
 	data->ctx->rq_dispatched[op_is_sync(op)]++;
 	return rq;
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d4b2f7bb18d6..71a9371c8182 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -141,12 +141,6 @@ typedef __u32 __bitwise req_flags_t;
  * especially blk_mq_rq_ctx_init() to take care of the added fields.
  */
 struct request {
-	struct list_head queuelist;
-	union {
-		call_single_data_t csd;
-		u64 fifo_time;
-	};
-
 	struct request_queue *q;
 	struct blk_mq_ctx *mq_ctx;
 
@@ -164,6 +158,8 @@ struct request {
 	struct bio *bio;
 	struct bio *biotail;
 
+	struct list_head queuelist;
+
 	/*
 	 * The hash is used inside the scheduler, and killed once the
 	 * request reaches the dispatch list. The ipi_list is only used
@@ -211,19 +207,16 @@ struct request {
 	struct hd_struct *part;
 	unsigned long start_time;
 	struct blk_issue_stat issue_stat;
-#ifdef CONFIG_BLK_CGROUP
-	struct request_list *rl;		/* rl this rq is alloced from */
-	unsigned long long start_time_ns;
-	unsigned long long io_start_time_ns;    /* when passed to hardware */
-#endif
 	/* Number of scatter-gather DMA addr+len pairs after
 	 * physical address coalescing is performed.
 	 */
 	unsigned short nr_phys_segments;
+
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 	unsigned short nr_integrity_segments;
 #endif
 
+	unsigned short write_hint;
 	unsigned short ioprio;
 
 	unsigned int timeout;
@@ -232,8 +225,6 @@ struct request {
 
 	unsigned int extra_len;	/* length of alignment and padding */
 
-	unsigned short write_hint;
-
 	/*
 	 * On blk-mq, the lower bits of ->gstate (generation number and
 	 * state) carry the MQ_RQ_* state value and the upper bits the
@@ -260,6 +251,11 @@ struct request {
 
 	struct list_head timeout_list;
 
+	union {
+		call_single_data_t csd;
+		u64 fifo_time;
+	};
+
 	/*
 	 * completion callback.
 	 */
@@ -268,6 +264,12 @@ struct request {
 
 	/* for bidi */
 	struct request *next_rq;
+
+#ifdef CONFIG_BLK_CGROUP
+	struct request_list *rl;		/* rl this rq is alloced from */
+	unsigned long long start_time_ns;
+	unsigned long long io_start_time_ns;    /* when passed to hardware */
+#endif
 };
 
 static inline bool blk_rq_is_scsi(struct request *rq)