diff mbox series

scsi: avoid to fetch scsi host template instance in IO path

Message ID 20200228093346.31213-1-ming.lei@redhat.com (mailing list archive)
State Changes Requested
Headers show
Series scsi: avoid to fetch scsi host template instance in IO path | expand

Commit Message

Ming Lei Feb. 28, 2020, 9:33 a.m. UTC
scsi host template struct is quite big, and the following three
fields are needed in SCSI IO path:

- queuecommand
- commit_rqs
- cmd_size

Cache them into scsi host intance, so that we can avoid to fetch
big scsi host template instance in IO path.

40% IOPS boost can be observed in my scsi_debug performance test after
applying this change.

Cc: Sumanesh Samanta <sumanesh.samanta@broadcom.com>
Cc: Ewan D . Milne <emilne@redhat.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Bart Van Assche <bart.vanassche@wdc.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/scsi/hosts.c     |  4 ++++
 drivers/scsi/scsi_lib.c  | 10 +++++-----
 include/scsi/scsi_host.h | 11 ++++++++++-
 3 files changed, 19 insertions(+), 6 deletions(-)

Comments

Bart Van Assche March 2, 2020, 4:32 a.m. UTC | #1
On 2020-02-28 01:33, Ming Lei wrote:
> scsi host template struct is quite big, and the following three
> fields are needed in SCSI IO path:
> 
> - queuecommand
> - commit_rqs
> - cmd_size
> 
> Cache them into scsi host intance, so that we can avoid to fetch
> big scsi host template instance in IO path.
> 
> 40% IOPS boost can be observed in my scsi_debug performance test after
> applying this change.
> 
> Cc: Sumanesh Samanta <sumanesh.samanta@broadcom.com>
> Cc: Ewan D . Milne <emilne@redhat.com>
> Cc: Hannes Reinecke <hare@suse.de>
> Cc: Bart Van Assche <bart.vanassche@wdc.com>
> Cc: Christoph Hellwig <hch@lst.de>

Please use my current email address instead of the above old and no
longer valid email address. See also the .mailmap file. I think I have
asked this before.

> @@ -1148,7 +1148,7 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
>  	in_flight = test_bit(SCMD_STATE_INFLIGHT, &cmd->state);
>  	/* zero out the cmd, except for the embedded scsi_request */
>  	memset((char *)cmd + sizeof(cmd->req), 0,
> -		sizeof(*cmd) - sizeof(cmd->req) + dev->host->hostt->cmd_size);
> +		sizeof(*cmd) - sizeof(cmd->req) + dev->host->cmd_size);
>  
>  	cmd->device = dev;
>  	cmd->sense_buffer = buf;

This patch does not apply on Martin's 5.7/scsi-queue branch. Please rebase.

Thanks,

Bart.
John Garry March 2, 2020, 10:58 a.m. UTC | #2
On 28/02/2020 09:33, Ming Lei wrote:
> scsi host template struct is quite big, and the following three
> fields are needed in SCSI IO path:
> 
> - queuecommand
> - commit_rqs
> - cmd_size

Would it have been nearly as good to reorganise Scsi host template 
structure to ensure that these are adjacent?

I say nearly, as it avoids the shost->hostt read.

> 
> Cache them into scsi host intance, so that we can avoid to fetch
> big scsi host template instance in IO path.
> 
> 40% IOPS boost can be observed in my scsi_debug performance test after
> applying this change.
> 
> Cc: Sumanesh Samanta <sumanesh.samanta@broadcom.com>
> Cc: Ewan D . Milne <emilne@redhat.com>
> Cc: Hannes Reinecke <hare@suse.de>
> Cc: Bart Van Assche <bart.vanassche@wdc.com>
> Cc: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>   drivers/scsi/hosts.c     |  4 ++++
>   drivers/scsi/scsi_lib.c  | 10 +++++-----
>   include/scsi/scsi_host.h | 11 ++++++++++-
>   3 files changed, 19 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
> index 1d669e47b692..8012c1db092e 100644
> --- a/drivers/scsi/hosts.c
> +++ b/drivers/scsi/hosts.c
> @@ -466,6 +466,10 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
>   	if (sht->virt_boundary_mask)
>   		shost->virt_boundary_mask = sht->virt_boundary_mask;
>   
> +	shost->cmd_size = sht->cmd_size;
> +	shost->queuecommand = sht->queuecommand;
> +	shost->commit_rqs = sht->commit_rqs;
> +
>   	device_initialize(&shost->shost_gendev);
>   	dev_set_name(&shost->shost_gendev, "host%d", shost->host_no);
>   	shost->shost_gendev.bus = &scsi_bus_type;
> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index e7fbf3a9a6aa..f4243ae1d4a9 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -1148,7 +1148,7 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
>   	in_flight = test_bit(SCMD_STATE_INFLIGHT, &cmd->state);
>   	/* zero out the cmd, except for the embedded scsi_request */
>   	memset((char *)cmd + sizeof(cmd->req), 0,
> -		sizeof(*cmd) - sizeof(cmd->req) + dev->host->hostt->cmd_size);
> +		sizeof(*cmd) - sizeof(cmd->req) + dev->host->cmd_size);
>   
>   	cmd->device = dev;
>   	cmd->sense_buffer = buf;
> @@ -1547,7 +1547,7 @@ static int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
>   	}
>   
>   	trace_scsi_dispatch_cmd_start(cmd);
> -	rtn = host->hostt->queuecommand(host, cmd);
> +	rtn = host->queuecommand(host, cmd);
>   	if (rtn) {
>   		trace_scsi_dispatch_cmd_error(cmd, rtn);
>   		if (rtn != SCSI_MLQUEUE_DEVICE_BUSY &&
> @@ -1584,7 +1584,7 @@ static blk_status_t scsi_mq_prep_fn(struct request *req)
>   	cmd->tag = req->tag;
>   	cmd->prot_op = SCSI_PROT_NORMAL;
>   
> -	sg = (void *)cmd + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size;
> +	sg = (void *)cmd + sizeof(struct scsi_cmnd) + shost->cmd_size;
>   	cmd->sdb.table.sgl = sg;
>   
>   	if (scsi_host_get_prot(shost)) {
> @@ -1752,7 +1752,7 @@ static int scsi_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
>   
>   	if (scsi_host_get_prot(shost)) {
>   		sg = (void *)cmd + sizeof(struct scsi_cmnd) +
> -			shost->hostt->cmd_size;
> +			shost->cmd_size;
>   		cmd->prot_sdb = (void *)sg + scsi_mq_inline_sgl_size(shost);
>   	}
>   
> @@ -1844,7 +1844,7 @@ static void scsi_commit_rqs(struct blk_mq_hw_ctx *hctx)
>   	struct scsi_device *sdev = q->queuedata;
>   	struct Scsi_Host *shost = sdev->host;
>   
> -	shost->hostt->commit_rqs(shost, hctx->queue_num);
> +	shost->commit_rqs(shost, hctx->queue_num);
>   }
>   
>   static const struct blk_mq_ops scsi_mq_ops = {
> diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
> index f577647bf5f2..ccd5b9a5de2a 100644
> --- a/include/scsi/scsi_host.h
> +++ b/include/scsi/scsi_host.h
> @@ -522,7 +522,16 @@ struct Scsi_Host {
>   	 */
>   	struct list_head	__devices;
>   	struct list_head	__targets;
> -	
> +
> +	/*
> +	 * cacahe the three fields from scsi_host_template, so that
> +	 * the big host template  instance needn't to be fetched in
> +	 * IO path. Big IOPS boost can be observed by this way.
> +	 */
> +	unsigned int cmd_size;

any reason not to add cacheline alignment? And I think function pointers 
will be at least same size as unsigned int, so it may make more sense to 
put cmd_size last.

> +	int (*queuecommand)(struct Scsi_Host *, struct scsi_cmnd *);
> +	void (*commit_rqs)(struct Scsi_Host *, u16);
> +
>   	struct list_head	starved_list;
>   
>   	spinlock_t		default_lock;
>
Christoph Hellwig March 2, 2020, 1:15 p.m. UTC | #3
On Mon, Mar 02, 2020 at 10:58:55AM +0000, John Garry wrote:
> On 28/02/2020 09:33, Ming Lei wrote:
>> scsi host template struct is quite big, and the following three
>> fields are needed in SCSI IO path:
>>
>> - queuecommand
>> - commit_rqs
>> - cmd_size
>
> Would it have been nearly as good to reorganise Scsi host template 
> structure to ensure that these are adjacent?
>
> I say nearly, as it avoids the shost->hostt read.

That would be worth trying.  Replicating function pointers out of
read-only data structures generally isn't a very good idea.
Ming Lei March 2, 2020, 10:30 p.m. UTC | #4
On Mon, Mar 02, 2020 at 02:15:41PM +0100, Christoph Hellwig wrote:
> On Mon, Mar 02, 2020 at 10:58:55AM +0000, John Garry wrote:
> > On 28/02/2020 09:33, Ming Lei wrote:
> >> scsi host template struct is quite big, and the following three
> >> fields are needed in SCSI IO path:
> >>
> >> - queuecommand
> >> - commit_rqs
> >> - cmd_size
> >
> > Would it have been nearly as good to reorganise Scsi host template 
> > structure to ensure that these are adjacent?
> >
> > I say nearly, as it avoids the shost->hostt read.
> 
> That would be worth trying.  Replicating function pointers out of
> read-only data structures generally isn't a very good idea.
>

OK, I will try to re-organize host template and see if it can reach
same performance, but it still introduces one extra fetch in IO path.

BTW, we replicate function pointer in blk-mq too, such as q->mq_ops.

Thanks,
Ming
diff mbox series

Patch

diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 1d669e47b692..8012c1db092e 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -466,6 +466,10 @@  struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
 	if (sht->virt_boundary_mask)
 		shost->virt_boundary_mask = sht->virt_boundary_mask;
 
+	shost->cmd_size = sht->cmd_size;
+	shost->queuecommand = sht->queuecommand;
+	shost->commit_rqs = sht->commit_rqs;
+
 	device_initialize(&shost->shost_gendev);
 	dev_set_name(&shost->shost_gendev, "host%d", shost->host_no);
 	shost->shost_gendev.bus = &scsi_bus_type;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index e7fbf3a9a6aa..f4243ae1d4a9 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1148,7 +1148,7 @@  void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
 	in_flight = test_bit(SCMD_STATE_INFLIGHT, &cmd->state);
 	/* zero out the cmd, except for the embedded scsi_request */
 	memset((char *)cmd + sizeof(cmd->req), 0,
-		sizeof(*cmd) - sizeof(cmd->req) + dev->host->hostt->cmd_size);
+		sizeof(*cmd) - sizeof(cmd->req) + dev->host->cmd_size);
 
 	cmd->device = dev;
 	cmd->sense_buffer = buf;
@@ -1547,7 +1547,7 @@  static int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
 	}
 
 	trace_scsi_dispatch_cmd_start(cmd);
-	rtn = host->hostt->queuecommand(host, cmd);
+	rtn = host->queuecommand(host, cmd);
 	if (rtn) {
 		trace_scsi_dispatch_cmd_error(cmd, rtn);
 		if (rtn != SCSI_MLQUEUE_DEVICE_BUSY &&
@@ -1584,7 +1584,7 @@  static blk_status_t scsi_mq_prep_fn(struct request *req)
 	cmd->tag = req->tag;
 	cmd->prot_op = SCSI_PROT_NORMAL;
 
-	sg = (void *)cmd + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size;
+	sg = (void *)cmd + sizeof(struct scsi_cmnd) + shost->cmd_size;
 	cmd->sdb.table.sgl = sg;
 
 	if (scsi_host_get_prot(shost)) {
@@ -1752,7 +1752,7 @@  static int scsi_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
 
 	if (scsi_host_get_prot(shost)) {
 		sg = (void *)cmd + sizeof(struct scsi_cmnd) +
-			shost->hostt->cmd_size;
+			shost->cmd_size;
 		cmd->prot_sdb = (void *)sg + scsi_mq_inline_sgl_size(shost);
 	}
 
@@ -1844,7 +1844,7 @@  static void scsi_commit_rqs(struct blk_mq_hw_ctx *hctx)
 	struct scsi_device *sdev = q->queuedata;
 	struct Scsi_Host *shost = sdev->host;
 
-	shost->hostt->commit_rqs(shost, hctx->queue_num);
+	shost->commit_rqs(shost, hctx->queue_num);
 }
 
 static const struct blk_mq_ops scsi_mq_ops = {
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index f577647bf5f2..ccd5b9a5de2a 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -522,7 +522,16 @@  struct Scsi_Host {
 	 */
 	struct list_head	__devices;
 	struct list_head	__targets;
-	
+
+	/*
+	 * cacahe the three fields from scsi_host_template, so that
+	 * the big host template  instance needn't to be fetched in
+	 * IO path. Big IOPS boost can be observed by this way.
+	 */
+	unsigned int cmd_size;
+	int (*queuecommand)(struct Scsi_Host *, struct scsi_cmnd *);
+	void (*commit_rqs)(struct Scsi_Host *, u16);
+
 	struct list_head	starved_list;
 
 	spinlock_t		default_lock;