[8/9] scsi: megaraid: convert private reply queue to blk-mq hw queue

Message ID 20190531022801.10003-9-ming.lei@redhat.com (mailing list archive)
State New, archived
Series blk-mq/scsi: convert private reply queue into blk_mq hw queue

Commit Message

Ming Lei May 31, 2019, 2:28 a.m. UTC
SCSI's reply queue is very similar to blk-mq's hw queue: both are
assigned by IRQ vector. Map the private reply queue to blk-mq's hw
queue via .host_tagset.

The private reply mapping can then be removed.

Another benefit is that the lost request/IRQ issue can be addressed in a
generic way, since managed IRQs may be shut down during CPU hotplug.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/scsi/megaraid/megaraid_sas_base.c   | 50 ++++++++-------------
 drivers/scsi/megaraid/megaraid_sas_fusion.c |  4 +-
 2 files changed, 20 insertions(+), 34 deletions(-)
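
For context: the scsi_cmnd_hctx_index() helper used in the fusion hunks
below is introduced earlier in this series. A minimal sketch of such a
helper, assuming it simply resolves a command to its request's hw queue
number (the series' actual implementation may differ):

	/* Sketch only: with .host_tagset, each blk-mq hw queue
	 * corresponds to one reply queue, so the hctx number of the
	 * command's request can select the MSI-x index directly. */
	static inline u16 scsi_cmnd_hctx_index(struct Scsi_Host *shost,
					       struct scsi_cmnd *cmd)
	{
		return cmd->request->mq_hctx->queue_num;
	}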

Comments

Hannes Reinecke May 31, 2019, 6:22 a.m. UTC | #1
On 5/31/19 4:28 AM, Ming Lei wrote:
> SCSI's reply queue is very similar to blk-mq's hw queue: both are
> assigned by IRQ vector. Map the private reply queue to blk-mq's hw
> queue via .host_tagset.
> 
> The private reply mapping can then be removed.
> 
> Another benefit is that the lost request/IRQ issue can be addressed in a
> generic way, since managed IRQs may be shut down during CPU hotplug.
> 
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  drivers/scsi/megaraid/megaraid_sas_base.c   | 50 ++++++++-------------
>  drivers/scsi/megaraid/megaraid_sas_fusion.c |  4 +-
>  2 files changed, 20 insertions(+), 34 deletions(-)
> 
> diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c
> index 3dd1df472dc6..b49999b90231 100644
> --- a/drivers/scsi/megaraid/megaraid_sas_base.c
> +++ b/drivers/scsi/megaraid/megaraid_sas_base.c
> @@ -33,6 +33,7 @@
>  #include <linux/fs.h>
>  #include <linux/compat.h>
>  #include <linux/blkdev.h>
> +#include <linux/blk-mq-pci.h>
>  #include <linux/mutex.h>
>  #include <linux/poll.h>
>  #include <linux/vmalloc.h>
> @@ -3165,6 +3166,19 @@ megasas_fw_cmds_outstanding_show(struct device *cdev,
>  	return snprintf(buf, PAGE_SIZE, "%d\n", atomic_read(&instance->fw_outstanding));
>  }
>  
> +static int megasas_map_queues(struct Scsi_Host *shost)
> +{
> +	struct megasas_instance *instance = (struct megasas_instance *)
> +		shost->hostdata;
> +	struct blk_mq_queue_map *qmap = &shost->tag_set.map[HCTX_TYPE_DEFAULT];
> +
> +	if (smp_affinity_enable && instance->msix_vectors)
> +		return blk_mq_pci_map_queues(qmap, instance->pdev, 0);
> +	else
> +		return blk_mq_map_queues(qmap);
> +}
> +
> +
>  static DEVICE_ATTR(fw_crash_buffer, S_IRUGO | S_IWUSR,
>  	megasas_fw_crash_buffer_show, megasas_fw_crash_buffer_store);
>  static DEVICE_ATTR(fw_crash_buffer_size, S_IRUGO,

As mentioned, we should be using a common function here.

> @@ -3207,7 +3221,9 @@ static struct scsi_host_template megasas_template = {
>  	.shost_attrs = megaraid_host_attrs,
>  	.bios_param = megasas_bios_param,
>  	.change_queue_depth = scsi_change_queue_depth,
> +	.map_queues =  megasas_map_queues,
>  	.no_write_same = 1,
> +	.host_tagset = 1,
>  };
>  
>  /**
> @@ -5407,26 +5423,6 @@ megasas_setup_jbod_map(struct megasas_instance *instance)
>  		instance->use_seqnum_jbod_fp = false;
>  }
>  
> -static void megasas_setup_reply_map(struct megasas_instance *instance)
> -{
> -	const struct cpumask *mask;
> -	unsigned int queue, cpu;
> -
> -	for (queue = 0; queue < instance->msix_vectors; queue++) {
> -		mask = pci_irq_get_affinity(instance->pdev, queue);
> -		if (!mask)
> -			goto fallback;
> -
> -		for_each_cpu(cpu, mask)
> -			instance->reply_map[cpu] = queue;
> -	}
> -	return;
> -
> -fallback:
> -	for_each_possible_cpu(cpu)
> -		instance->reply_map[cpu] = cpu % instance->msix_vectors;
> -}
> -
>  /**
>   * megasas_get_device_list -	Get the PD and LD device list from FW.
>   * @instance:			Adapter soft state
> @@ -5666,8 +5662,6 @@ static int megasas_init_fw(struct megasas_instance *instance)
>  			goto fail_init_adapter;
>  	}
>  
> -	megasas_setup_reply_map(instance);
> -
>  	dev_info(&instance->pdev->dev,
>  		"firmware supports msix\t: (%d)", fw_msix_count);
>  	dev_info(&instance->pdev->dev,
> @@ -6298,6 +6292,8 @@ static int megasas_io_attach(struct megasas_instance *instance)
>  	host->max_lun = MEGASAS_MAX_LUN;
>  	host->max_cmd_len = 16;
>  
> +	host->nr_hw_queues = instance->msix_vectors ?: 1;
> +
>  	/*
>  	 * Notify the mid-layer about the new controller
>  	 */
> @@ -6464,11 +6460,6 @@ static inline int megasas_alloc_mfi_ctrl_mem(struct megasas_instance *instance)
>   */
>  static int megasas_alloc_ctrl_mem(struct megasas_instance *instance)
>  {
> -	instance->reply_map = kcalloc(nr_cpu_ids, sizeof(unsigned int),
> -				      GFP_KERNEL);
> -	if (!instance->reply_map)
> -		return -ENOMEM;
> -
>  	switch (instance->adapter_type) {
>  	case MFI_SERIES:
>  		if (megasas_alloc_mfi_ctrl_mem(instance))
> @@ -6485,8 +6476,6 @@ static int megasas_alloc_ctrl_mem(struct megasas_instance *instance)
>  
>  	return 0;
>   fail:
> -	kfree(instance->reply_map);
> -	instance->reply_map = NULL;
>  	return -ENOMEM;
>  }
>  
> @@ -6499,7 +6488,6 @@ static int megasas_alloc_ctrl_mem(struct megasas_instance *instance)
>   */
>  static inline void megasas_free_ctrl_mem(struct megasas_instance *instance)
>  {
> -	kfree(instance->reply_map);
>  	if (instance->adapter_type == MFI_SERIES) {
>  		if (instance->producer)
>  			dma_free_coherent(&instance->pdev->dev, sizeof(u32),
> @@ -7142,8 +7130,6 @@ megasas_resume(struct pci_dev *pdev)
>  	if (rval < 0)
>  		goto fail_reenable_msix;
>  
> -	megasas_setup_reply_map(instance);
> -
>  	if (instance->adapter_type != MFI_SERIES) {
>  		megasas_reset_reply_desc(instance);
>  		if (megasas_ioc_init_fusion(instance)) {
> diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c
> index 4dfa0685a86c..4f909f32bf5c 100644
> --- a/drivers/scsi/megaraid/megaraid_sas_fusion.c
> +++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c
> @@ -2699,7 +2699,7 @@ megasas_build_ldio_fusion(struct megasas_instance *instance,
>  	}
>  
>  	cmd->request_desc->SCSIIO.MSIxIndex =
> -		instance->reply_map[raw_smp_processor_id()];
> +		scsi_cmnd_hctx_index(instance->host, scp);
>  
>  	if (instance->adapter_type >= VENTURA_SERIES) {
>  		/* FP for Optimal raid level 1.
> @@ -3013,7 +3013,7 @@ megasas_build_syspd_fusion(struct megasas_instance *instance,
>  	cmd->request_desc->SCSIIO.DevHandle = io_request->DevHandle;
>  
>  	cmd->request_desc->SCSIIO.MSIxIndex =
> -		instance->reply_map[raw_smp_processor_id()];
> +		scsi_cmnd_hctx_index(instance->host, scmd);
>  
>  	if (!fp_possible) {
>  		/* system pd firmware path */
> 
Otherwise:

Reviewed-by: Hannes Reinecke <hare@suse.com>

Cheers,

Hannes
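
A common helper along the lines Hannes suggests could look like this
(hypothetical name and placement, since several drivers open-code the
same map_queues pattern):

	/* Hypothetical shared helper: map the default hw queues from
	 * PCI IRQ affinity when managed interrupts are in use,
	 * otherwise fall back to the generic software mapping. */
	int scsi_map_queues_pci(struct Scsi_Host *shost,
				struct pci_dev *pdev, int offset,
				bool use_managed_irq)
	{
		struct blk_mq_queue_map *qmap =
			&shost->tag_set.map[HCTX_TYPE_DEFAULT];

		if (use_managed_irq)
			return blk_mq_pci_map_queues(qmap, pdev, offset);
		return blk_mq_map_queues(qmap);
	}

megasas_map_queues() would then reduce to a single call passing
smp_affinity_enable && instance->msix_vectors.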
Kashyap Desai June 1, 2019, 9:41 p.m. UTC | #2
> SCSI's reply queue is very similar to blk-mq's hw queue: both are
> assigned by IRQ vector. Map the private reply queue to blk-mq's hw
> queue via .host_tagset.
>
> The private reply mapping can then be removed.
>
> Another benefit is that the lost request/IRQ issue can be addressed in a
> generic way, since managed IRQs may be shut down during CPU hotplug.

Ming,

I quickly tested this patch series on a MegaRaid Aero controller. Without
this patch I get 3.0M IOPS, but once I apply it I see only 1.2M IOPS
(a 60% performance drop). The HBA supports a can_queue of 5089.

<perf top> output without  patch -

    3.39%  [megaraid_sas]  [k] complete_cmd_fusion
     3.36%  [kernel]        [k] scsi_queue_rq
     3.26%  [kernel]        [k] entry_SYSCALL_64
     2.57%  [kernel]        [k] syscall_return_via_sysret
     1.95%  [megaraid_sas]  [k] megasas_build_and_issue_cmd_fusion
     1.88%  [kernel]        [k] _raw_spin_lock_irqsave
     1.79%  [kernel]        [k] gup_pmd_range
     1.73%  [kernel]        [k] _raw_spin_lock
     1.68%  [kernel]        [k] __sched_text_start
     1.19%  [kernel]        [k] irq_entries_start
     1.13%  [kernel]        [k] scsi_dec_host_busy
     1.08%  [kernel]        [k] aio_complete
     1.07%  [kernel]        [k] read_tsc
     1.01%  [kernel]        [k] blk_mq_get_request
     0.93%  [kernel]        [k] __update_load_avg_cfs_rq
     0.92%  [kernel]        [k] aio_read_events
     0.91%  [kernel]        [k] lookup_ioctx
     0.91%  fio             [.] fio_gettime
     0.87%  [kernel]        [k] set_next_entity
     0.87%  [megaraid_sas]  [k] megasas_build_ldio_fusion

<perf top> output with  patch -

    11.30%  [kernel]       [k] native_queued_spin_lock_slowpath
     3.37%  [kernel]       [k] sbitmap_any_bit_set
     2.91%  [kernel]       [k] blk_mq_run_hw_queue
     2.32%  [kernel]       [k] _raw_spin_lock_irqsave
     2.29%  [kernel]       [k] menu_select
     2.04%  [kernel]       [k] entry_SYSCALL_64
     2.03%  [kernel]       [k] __sched_text_start
     1.70%  [kernel]       [k] scsi_queue_rq
     1.66%  [kernel]       [k] _raw_spin_lock
     1.58%  [kernel]       [k] syscall_return_via_sysret
     1.33%  [kernel]       [k] native_write_msr
     1.20%  [kernel]       [k] read_tsc
     1.13%  [kernel]       [k] blk_mq_run_hw_queues
     1.13%  [kernel]       [k] __sbq_wake_up
     1.01%  [kernel]       [k] irq_entries_start
     1.00%  [kernel]       [k] switch_mm_irqs_off
     0.99%  [kernel]       [k] gup_pmd_range
     0.98%  [kernel]       [k] __update_load_avg_cfs_rq
     0.98%  [kernel]       [k] set_next_entity
     0.92%  [kernel]       [k] do_idle

Kashyap
Ming Lei June 2, 2019, 6:42 a.m. UTC | #3
Hi Kashyap,

Thanks for your test.

On Sun, Jun 02, 2019 at 03:11:26AM +0530, Kashyap Desai wrote:
> > SCSI's reply queue is very similar to blk-mq's hw queue: both are
> > assigned by IRQ vector. Map the private reply queue to blk-mq's hw
> > queue via .host_tagset.
> >
> > The private reply mapping can then be removed.
> >
> > Another benefit is that the lost request/IRQ issue can be addressed in a
> > generic way, since managed IRQs may be shut down during CPU hotplug.
> 
> Ming,
> 
> I quickly tested this patch series on a MegaRaid Aero controller. Without
> this patch I get 3.0M IOPS, but once I apply it I see only 1.2M IOPS
> (a 60% performance drop). The HBA supports a can_queue of 5089.
> 
> <perf top> output without  patch -
> 
>     3.39%  [megaraid_sas]  [k] complete_cmd_fusion
>      3.36%  [kernel]        [k] scsi_queue_rq
>      3.26%  [kernel]        [k] entry_SYSCALL_64
>      2.57%  [kernel]        [k] syscall_return_via_sysret
>      1.95%  [megaraid_sas]  [k] megasas_build_and_issue_cmd_fusion
>      1.88%  [kernel]        [k] _raw_spin_lock_irqsave
>      1.79%  [kernel]        [k] gup_pmd_range
>      1.73%  [kernel]        [k] _raw_spin_lock
>      1.68%  [kernel]        [k] __sched_text_start
>      1.19%  [kernel]        [k] irq_entries_start
>      1.13%  [kernel]        [k] scsi_dec_host_busy
>      1.08%  [kernel]        [k] aio_complete
>      1.07%  [kernel]        [k] read_tsc
>      1.01%  [kernel]        [k] blk_mq_get_request
>      0.93%  [kernel]        [k] __update_load_avg_cfs_rq
>      0.92%  [kernel]        [k] aio_read_events
>      0.91%  [kernel]        [k] lookup_ioctx
>      0.91%  fio             [.] fio_gettime
>      0.87%  [kernel]        [k] set_next_entity
>      0.87%  [megaraid_sas]  [k] megasas_build_ldio_fusion
> 
> <perf top> output with  patch -
> 
>     11.30%  [kernel]       [k] native_queued_spin_lock_slowpath

I guess there must be a global lock taken in the megaraid submission
path. Could you run 'perf record -g -a' to see which lock it is and
what the stack trace looks like?


Thanks,
Ming
Ming Lei June 2, 2019, 7:48 a.m. UTC | #4
On Sun, Jun 02, 2019 at 02:42:02PM +0800, Ming Lei wrote:
> Hi Kashyap,
> 
> Thanks for your test.
> 
> On Sun, Jun 02, 2019 at 03:11:26AM +0530, Kashyap Desai wrote:
> > > SCSI's reply queue is very similar to blk-mq's hw queue: both are
> > > assigned by IRQ vector. Map the private reply queue to blk-mq's hw
> > > queue via .host_tagset.
> > >
> > > The private reply mapping can then be removed.
> > >
> > > Another benefit is that the lost request/IRQ issue can be addressed in a
> > > generic way, since managed IRQs may be shut down during CPU hotplug.
> > 
> > Ming,
> > 
> > I quickly tested this patch series on a MegaRaid Aero controller. Without
> > this patch I get 3.0M IOPS, but once I apply it I see only 1.2M IOPS
> > (a 60% performance drop). The HBA supports a can_queue of 5089.
> > 
> > <perf top> output without  patch -
> > 
> >     3.39%  [megaraid_sas]  [k] complete_cmd_fusion
> >      3.36%  [kernel]        [k] scsi_queue_rq
> >      3.26%  [kernel]        [k] entry_SYSCALL_64
> >      2.57%  [kernel]        [k] syscall_return_via_sysret
> >      1.95%  [megaraid_sas]  [k] megasas_build_and_issue_cmd_fusion
> >      1.88%  [kernel]        [k] _raw_spin_lock_irqsave
> >      1.79%  [kernel]        [k] gup_pmd_range
> >      1.73%  [kernel]        [k] _raw_spin_lock
> >      1.68%  [kernel]        [k] __sched_text_start
> >      1.19%  [kernel]        [k] irq_entries_start
> >      1.13%  [kernel]        [k] scsi_dec_host_busy
> >      1.08%  [kernel]        [k] aio_complete
> >      1.07%  [kernel]        [k] read_tsc
> >      1.01%  [kernel]        [k] blk_mq_get_request
> >      0.93%  [kernel]        [k] __update_load_avg_cfs_rq
> >      0.92%  [kernel]        [k] aio_read_events
> >      0.91%  [kernel]        [k] lookup_ioctx
> >      0.91%  fio             [.] fio_gettime
> >      0.87%  [kernel]        [k] set_next_entity
> >      0.87%  [megaraid_sas]  [k] megasas_build_ldio_fusion
> > 
> > <perf top> output with  patch -
> > 
> >     11.30%  [kernel]       [k] native_queued_spin_lock_slowpath
> 
> I guess there must be a global lock taken in the megaraid submission
> path. Could you run 'perf record -g -a' to see which lock it is and
> what the stack trace looks like?

Meanwhile, please try the following patch and see if it makes a difference.

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 49d73d979cb3..d2abec3b0f60 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -589,7 +589,7 @@ static void __blk_mq_complete_request(struct request *rq)
 	 * So complete IO reqeust in softirq context in case of single queue
 	 * for not degrading IO performance by irqsoff latency.
 	 */
-	if (q->nr_hw_queues == 1) {
+	if (q->nr_hw_queues == 1 || (rq->mq_hctx->flags & BLK_MQ_F_HOST_TAGS)) {
 		__blk_complete_request(rq);
 		return;
 	}
@@ -1977,7 +1977,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		/* bypass scheduler for flush rq */
 		blk_insert_flush(rq);
 		blk_mq_run_hw_queue(data.hctx, true);
-	} else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs)) {
+	} else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs ||
+				(data.hctx->flags & BLK_MQ_F_HOST_TAGS))) {
 		/*
 		 * Use plugging if we have a ->commit_rqs() hook as well, as
 		 * we know the driver uses bd->last in a smart fashion.

thanks,
Ming
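
Both hunks above push the host-tagset case back toward the existing
single-queue behaviour: the first completes requests in softirq context
as the nr_hw_queues == 1 path already does, and the second lets
submissions batch on the plug list instead of being dispatched one at a
time. (BLK_MQ_F_HOST_TAGS appears to be the hw-queue flag this series
uses to mark queues that share one host-wide tag set.)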
Kashyap Desai June 2, 2019, 4:34 p.m. UTC | #5
> Meanwhile, please try the following patch and see if it makes a
> difference.
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 49d73d979cb3..d2abec3b0f60 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -589,7 +589,7 @@ static void __blk_mq_complete_request(struct request *rq)
>  	 * So complete IO reqeust in softirq context in case of single queue
>  	 * for not degrading IO performance by irqsoff latency.
>  	 */
> -	if (q->nr_hw_queues == 1) {
> +	if (q->nr_hw_queues == 1 || (rq->mq_hctx->flags & BLK_MQ_F_HOST_TAGS)) {
>  		__blk_complete_request(rq);
>  		return;
>  	}
> @@ -1977,7 +1977,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
>  		/* bypass scheduler for flush rq */
>  		blk_insert_flush(rq);
>  		blk_mq_run_hw_queue(data.hctx, true);
> -	} else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs)) {
> +	} else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs ||
> +				(data.hctx->flags & BLK_MQ_F_HOST_TAGS))) {
>  		/*
>  		 * Use plugging if we have a ->commit_rqs() hook as well, as
>  		 * we know the driver uses bd->last in a smart fashion.

Ming,

I tried the above patch and saw no improvement in performance.

Below is the perf record data; the lock contention is while getting the
tag (blk_mq_get_tag):

6.67%     6.67%  fio              [kernel.vmlinux]  [k] native_queued_spin_lock_slowpath
   - 6.66% io_submit
      - 6.66% entry_SYSCALL_64
         - do_syscall_64
            - 6.66% __x64_sys_io_submit
               - 6.66% io_submit_one
                  - 6.66% aio_read
                     - 6.66% generic_file_read_iter
                        - 6.66% blkdev_direct_IO
                           - 6.65% submit_bio
                              - generic_make_request
                                 - 6.65% blk_mq_make_request
                                    - 6.65% blk_mq_get_request
                                       - 6.65% blk_mq_get_tag
                                          - 6.58% prepare_to_wait_exclusive
                                             - 6.57% _raw_spin_lock_irqsave
                                                  queued_spin_lock_slowpath

>
> thanks,
> Ming
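
The contended lock in this trace is the wait-head spinlock taken by
prepare_to_wait_exclusive() inside blk_mq_get_tag(). Tag waiters are
spread over a small fixed array of wait heads per sbitmap_queue,
selected round-robin through a per-hctx wait_index; roughly (simplified
from that era's include/linux/sbitmap.h, details may differ):

	/* Simplified sketch: pick one of the tag set's wait heads.
	 * With one host-wide tag space shared by many hctxs, many
	 * waiters can converge on the same few wait heads, making the
	 * ws->wait spinlock the contention point seen above. */
	static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq,
							  atomic_t *wait_index)
	{
		struct sbq_wait_state *ws = &sbq->ws[atomic_read(wait_index)];

		sbq_index_atomic_inc(wait_index);
		return ws;
	}

Ming's follow-up below redirects this wait-head selection (along with
the rest of the per-hctx shared-tag bookkeeping) to a single master
hctx.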
Ming Lei June 3, 2019, 3:56 a.m. UTC | #6
Hi Kashyap,

Thanks for collecting the log.

On Sun, Jun 02, 2019 at 10:04:01PM +0530, Kashyap Desai wrote:
> > Meanwhile, please try the following patch and see if it makes a
> > difference.
> >
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index 49d73d979cb3..d2abec3b0f60 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -589,7 +589,7 @@ static void __blk_mq_complete_request(struct request *rq)
> >  	 * So complete IO reqeust in softirq context in case of single queue
> >  	 * for not degrading IO performance by irqsoff latency.
> >  	 */
> > -	if (q->nr_hw_queues == 1) {
> > +	if (q->nr_hw_queues == 1 || (rq->mq_hctx->flags & BLK_MQ_F_HOST_TAGS)) {
> >  		__blk_complete_request(rq);
> >  		return;
> >  	}
> > @@ -1977,7 +1977,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
> >  		/* bypass scheduler for flush rq */
> >  		blk_insert_flush(rq);
> >  		blk_mq_run_hw_queue(data.hctx, true);
> > -	} else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs)) {
> > +	} else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs ||
> > +				(data.hctx->flags & BLK_MQ_F_HOST_TAGS))) {
> >  		/*
> >  		 * Use plugging if we have a ->commit_rqs() hook as well, as
> >  		 * we know the driver uses bd->last in a smart fashion.
> 
> Ming,
> 
> I tried the above patch and saw no improvement in performance.
> 
> Below is the perf record data; the lock contention is while getting the
> tag (blk_mq_get_tag):
> 
> 6.67%     6.67%  fio              [kernel.vmlinux]  [k] native_queued_spin_lock_slowpath
>    - 6.66% io_submit
>       - 6.66% entry_SYSCALL_64
>          - do_syscall_64
>             - 6.66% __x64_sys_io_submit
>                - 6.66% io_submit_one
>                   - 6.66% aio_read
>                      - 6.66% generic_file_read_iter
>                         - 6.66% blkdev_direct_IO
>                            - 6.65% submit_bio
>                               - generic_make_request
>                                  - 6.65% blk_mq_make_request
>                                     - 6.65% blk_mq_get_request
>                                        - 6.65% blk_mq_get_tag
>                                           - 6.58% prepare_to_wait_exclusive
>                                              - 6.57% _raw_spin_lock_irqsave
>                                                   queued_spin_lock_slowpath

Please drop the patch from my last email, apply the following patch
instead, and see if it makes a difference:

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 3d6780504dcb..69d6bffcc8ff 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -627,6 +627,9 @@ static int hctx_active_show(void *data, struct seq_file *m)
 {
 	struct blk_mq_hw_ctx *hctx = data;
 
+	if (hctx->flags & BLK_MQ_F_HOST_TAGS)
+		hctx = blk_mq_master_hctx(hctx);
+
 	seq_printf(m, "%d\n", atomic_read(&hctx->nr_active));
 	return 0;
 }
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 309ec5079f3f..58ef83a34fda 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -30,6 +30,9 @@ bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
  */
 bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
 {
+	if (hctx->flags & BLK_MQ_F_HOST_TAGS)
+		hctx = blk_mq_master_hctx(hctx);
+
 	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
 	    !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
 		atomic_inc(&hctx->tags->active_queues);
@@ -55,6 +58,9 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 {
 	struct blk_mq_tags *tags = hctx->tags;
 
+	if (hctx->flags & BLK_MQ_F_HOST_TAGS)
+		hctx = blk_mq_master_hctx(hctx);
+
 	if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
 		return;
 
@@ -74,6 +80,10 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
 
 	if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
 		return true;
+
+	if (hctx->flags & BLK_MQ_F_HOST_TAGS)
+		hctx = blk_mq_master_hctx(hctx);
+
 	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
 		return true;
 
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 61deab0b5a5a..84e9b46ffc78 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -36,11 +36,22 @@ extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
 void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
 		void *priv);
 
+static inline struct blk_mq_hw_ctx *blk_mq_master_hctx(
+		struct blk_mq_hw_ctx *hctx)
+{
+	return hctx->queue->queue_hw_ctx[0];
+}
+
+
 static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
 						 struct blk_mq_hw_ctx *hctx)
 {
 	if (!hctx)
 		return &bt->ws[0];
+
+	if (hctx->flags & BLK_MQ_F_HOST_TAGS)
+		hctx = blk_mq_master_hctx(hctx);
+
 	return sbq_wait_ptr(bt, &hctx->wait_index);
 }
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 49d73d979cb3..4196ed3b0085 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -303,7 +303,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	} else {
 		if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
 			rq_flags = RQF_MQ_INFLIGHT;
-			atomic_inc(&data->hctx->nr_active);
+			blk_mq_inc_nr_active(data->hctx);
 		}
 		rq->tag = tag;
 		rq->internal_tag = -1;
@@ -517,7 +517,7 @@ void blk_mq_free_request(struct request *rq)
 
 	ctx->rq_completed[rq_is_sync(rq)]++;
 	if (rq->rq_flags & RQF_MQ_INFLIGHT)
-		atomic_dec(&hctx->nr_active);
+		blk_mq_dec_nr_active(hctx);
 
 	if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
 		laptop_io_completion(q->backing_dev_info);
@@ -1064,7 +1064,7 @@ bool blk_mq_get_driver_tag(struct request *rq)
 	if (rq->tag >= 0) {
 		if (shared) {
 			rq->rq_flags |= RQF_MQ_INFLIGHT;
-			atomic_inc(&data.hctx->nr_active);
+			blk_mq_inc_nr_active(data.hctx);
 		}
 		data.hctx->tags->rqs[rq->tag] = rq;
 	}
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 633a5a77ee8b..f1279b8c2289 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -193,6 +193,20 @@ unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part);
 void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
 			 unsigned int inflight[2]);
 
+static inline void blk_mq_inc_nr_active(struct blk_mq_hw_ctx *hctx)
+{
+	if (hctx->flags & BLK_MQ_F_HOST_TAGS)
+		hctx = blk_mq_master_hctx(hctx);
+	atomic_inc(&hctx->nr_active);
+}
+
+static inline void blk_mq_dec_nr_active(struct blk_mq_hw_ctx *hctx)
+{
+	if (hctx->flags & BLK_MQ_F_HOST_TAGS)
+		hctx = blk_mq_master_hctx(hctx);
+	atomic_dec(&hctx->nr_active);
+}
+
 static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
@@ -218,7 +232,7 @@ static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
 
 	if (rq->rq_flags & RQF_MQ_INFLIGHT) {
 		rq->rq_flags &= ~RQF_MQ_INFLIGHT;
-		atomic_dec(&hctx->nr_active);
+		blk_mq_dec_nr_active(hctx);
 	}
 }
 
Thanks,
Ming
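
The idea of the patch above: with host-wide tags, per-hctx shared-tag
state no longer matches reality, so the active_queues accounting, the
nr_active counter, and the tag wait_index are all redirected to hctx 0
via the new blk_mq_master_hctx() helper, giving the host-wide tag space
a single bookkeeping point regardless of nr_hw_queues.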
Kashyap Desai June 3, 2019, 10 a.m. UTC | #7
>
> Please drop the patch from my last email, apply the following patch
> instead, and see if it makes a difference:

Ming,

I dropped the earlier patch and applied the one below. Now I am getting
the expected performance (3.0M IOPS); this patch fixes the performance
issue. See the perf report after applying it:

     8.52%  [kernel]        [k] sbitmap_any_bit_set
     4.19%  [kernel]        [k] blk_mq_run_hw_queue
     3.76%  [megaraid_sas]  [k] complete_cmd_fusion
     3.24%  [kernel]        [k] scsi_queue_rq
     2.53%  [megaraid_sas]  [k] megasas_build_ldio_fusion
     2.34%  [megaraid_sas]  [k] megasas_build_and_issue_cmd_fusion
     2.18%  [kernel]        [k] entry_SYSCALL_64
     1.85%  [kernel]        [k] syscall_return_via_sysret
     1.78%  [kernel]        [k] blk_mq_run_hw_queues
     1.59%  [kernel]        [k] gup_pmd_range
     1.49%  [kernel]        [k] _raw_spin_lock_irqsave
     1.24%  [kernel]        [k] scsi_dec_host_busy
     1.23%  [kernel]        [k] blk_mq_free_request
     1.23%  [kernel]        [k] blk_mq_get_request
     0.96%  [kernel]        [k] __slab_free
     0.91%  [kernel]        [k] aio_complete
     0.90%  [kernel]        [k] __sched_text_start
     0.89%  [megaraid_sas]  [k] megasas_queue_command
     0.85%  [kernel]        [k] __fget
     0.84%  [kernel]        [k] scsi_mq_get_budget

I will do some more testing and update the results.

Kashyap

Kashyap Desai June 7, 2019, 9:45 a.m. UTC | #8
>
> >
> > Please drop the patch from my last email, apply the following patch
> > instead, and see if it makes a difference:
>
> Ming,
>
> I dropped the earlier patch and applied the one below. Now I am getting
> the expected performance (3.0M IOPS); this patch fixes the performance
> issue. See the perf report after applying it:
>
>      8.52%  [kernel]        [k] sbitmap_any_bit_set
>      4.19%  [kernel]        [k] blk_mq_run_hw_queue
>      3.76%  [megaraid_sas]  [k] complete_cmd_fusion
>      3.24%  [kernel]        [k] scsi_queue_rq
>      2.53%  [megaraid_sas]  [k] megasas_build_ldio_fusion
>      2.34%  [megaraid_sas]  [k] megasas_build_and_issue_cmd_fusion
>      2.18%  [kernel]        [k] entry_SYSCALL_64
>      1.85%  [kernel]        [k] syscall_return_via_sysret
>      1.78%  [kernel]        [k] blk_mq_run_hw_queues
>      1.59%  [kernel]        [k] gup_pmd_range
>      1.49%  [kernel]        [k] _raw_spin_lock_irqsave
>      1.24%  [kernel]        [k] scsi_dec_host_busy
>      1.23%  [kernel]        [k] blk_mq_free_request
>      1.23%  [kernel]        [k] blk_mq_get_request
>      0.96%  [kernel]        [k] __slab_free
>      0.91%  [kernel]        [k] aio_complete
>      0.90%  [kernel]        [k] __sched_text_start
>      0.89%  [megaraid_sas]  [k] megasas_queue_command
>      0.85%  [kernel]        [k] __fget
>      0.84%  [kernel]        [k] scsi_mq_get_budget
>
> I will do some more testing and update the results.

Ming, I did testing on a dual-socket AMD server (AMD EPYC 7601 32-core
processor). The system has a total of 128 logical cores.

Without the patch, performance can go up to 2.8M IOPS. See the perf top
output below.

   7.37%  [megaraid_sas]      [k] complete_cmd_fusion
   2.51%  [kernel]            [k] copy_user_generic_string
   2.48%  [kernel]            [k] read_tsc
   2.10%  fio                 [.] thread_main
   2.06%  [kernel]            [k] gup_pgd_range
   1.98%  [kernel]            [k] __get_user_4
   1.92%  [kernel]            [k] entry_SYSCALL_64
   1.58%  [kernel]            [k] scsi_queue_rq
   1.55%  [megaraid_sas]      [k] megasas_queue_command
   1.52%  [kernel]            [k] irq_entries_start
   1.43%  fio                 [.] get_io_u
   1.39%  [kernel]            [k] blkdev_direct_IO
   1.34%  [kernel]            [k] __audit_syscall_exit
   1.31%  [megaraid_sas]      [k] megasas_build_and_issue_cmd_fusion
   1.27%  [kernel]            [k] syscall_slow_exit_work
   1.23%  [kernel]            [k] io_submit_one
   1.20%  [kernel]            [k] do_syscall_64
   1.17%  fio                 [.] td_io_queue
   1.16%  [kernel]            [k] lookup_ioctx
   1.14%  [kernel]            [k] kmem_cache_alloc
   1.10%  [megaraid_sas]      [k] megasas_build_ldio_fusion
   1.07%  [kernel]            [k] __memset
   1.06%  [kernel]            [k] __virt_addr_valid
   0.98%  [kernel]            [k] blk_mq_get_request
   0.94%  [kernel]            [k] note_interrupt
   0.91%  [kernel]            [k] __get_user_8
   0.91%  [kernel]            [k] aio_read_events
   0.85%  [kernel]            [k] __put_user_4
   0.78%  fio                 [.] fio_libaio_commit
   0.74%  [megaraid_sas]      [k] MR_BuildRaidContext
   0.70%  [kernel]            [k] __x64_sys_io_submit
   0.69%  fio                 [.] utime_since_now


With your patch, performance can go up to 1.7M IOPS. See the perf top
output below.

 23.01%  [kernel]              [k] sbitmap_any_bit_set
   6.42%  [kernel]              [k] blk_mq_run_hw_queue
   4.44%  [megaraid_sas]        [k] complete_cmd_fusion
   4.23%  [kernel]              [k] blk_mq_run_hw_queues
   1.80%  [kernel]              [k] read_tsc
   1.60%  [kernel]              [k] copy_user_generic_string
   1.33%  fio                   [.] thread_main
   1.27%  [kernel]              [k] irq_entries_start
   1.22%  [kernel]              [k] gup_pgd_range
   1.20%  [kernel]              [k] __get_user_4
   1.20%  [kernel]              [k] entry_SYSCALL_64
   1.07%  [kernel]              [k] scsi_queue_rq
   0.88%  fio                   [.] get_io_u
   0.87%  [megaraid_sas]        [k] megasas_queue_command
   0.86%  [kernel]              [k] blkdev_direct_IO
   0.85%  fio                   [.] td_io_queue
   0.80%  [kernel]              [k] note_interrupt
   0.76%  [kernel]              [k] lookup_ioctx
   0.76%  [kernel]              [k] do_syscall_64
   0.75%  [megaraid_sas]        [k] megasas_build_and_issue_cmd_fusion
   0.74%  [megaraid_sas]        [k] megasas_build_ldio_fusion
   0.72%  [kernel]              [k] kmem_cache_alloc
   0.71%  [kernel]              [k] __audit_syscall_exit
   0.67%  [kernel]              [k] __virt_addr_valid
   0.65%  [kernel]              [k] blk_mq_get_request
   0.64%  [kernel]              [k] __memset
   0.62%  [kernel]              [k] syscall_slow_exit_work
   0.60%  [kernel]              [k] io_submit_one
   0.59%  [kernel]              [k] ktime_get
   0.58%  fio                   [.] fio_libaio_commit
   0.57%  [kernel]              [k] aio_read_events
   0.54%  [kernel]              [k] __get_user_8
   0.53%  [kernel]              [k] aio_complete_rw
   0.51%  [kernel]              [k] kmem_cache_free

With your patch plus reducing the logical CPU count to 64 (via CPU
hotplug), performance can go up to 2.2M IOPS. See the perf top output
below.

   9.56%  [kernel]            [k] sbitmap_any_bit_set
   4.62%  [megaraid_sas]      [k] complete_cmd_fusion
   3.02%  [kernel]            [k] blk_mq_run_hw_queue
   2.15%  [kernel]            [k] copy_user_generic_string
   2.13%  [kernel]            [k] blk_mq_run_hw_queues
   2.09%  [kernel]            [k] read_tsc
   1.66%  [kernel]            [k] __get_user_4
   1.59%  [kernel]            [k] entry_SYSCALL_64
   1.57%  [kernel]            [k] gup_pgd_range
   1.55%  fio                 [.] thread_main
   1.51%  [kernel]            [k] scsi_queue_rq
   1.31%  [kernel]            [k] __memset
   1.21%  [megaraid_sas]      [k] megasas_build_and_issue_cmd_fusion
   1.16%  [megaraid_sas]      [k] megasas_queue_command
   1.13%  fio                 [.] get_io_u
   1.12%  [kernel]            [k] blk_mq_get_request
   1.07%  [kernel]            [k] blkdev_direct_IO
   1.06%  [kernel]            [k] __put_user_4
   1.05%  fio                 [.] td_io_queue
   1.02%  [kernel]            [k] syscall_slow_exit_work
   1.00%  [megaraid_sas]      [k] megasas_build_ldio_fusion


In summary, part of the performance drop may be correlated with the
number of hctx created in the block layer. I can provide more details
and can test a follow-up patch.

Kashyap
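
The correlation with hctx count is consistent with what tops both
profiles: sbitmap_any_bit_set() is called once per hctx whenever all hw
queues are run, so its aggregate cost grows linearly with nr_hw_queues.
Roughly (simplified from that era's block/blk-mq.c, details may
differ):

	/* Simplified sketch: running "all" hw queues walks every hctx;
	 * blk_mq_run_hw_queue() then checks for pending work via
	 * blk_mq_hctx_has_pending() -> sbitmap_any_bit_set(), which
	 * scans that hctx's ctx_map word by word. */
	void blk_mq_run_hw_queues(struct request_queue *q, bool async)
	{
		struct blk_mq_hw_ctx *hctx;
		int i;

		queue_for_each_hw_ctx(q, hctx, i) {
			if (blk_mq_hctx_stopped(hctx))
				continue;
			blk_mq_run_hw_queue(hctx, async);
		}
	}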



Patch

diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c
index 3dd1df472dc6..b49999b90231 100644
--- a/drivers/scsi/megaraid/megaraid_sas_base.c
+++ b/drivers/scsi/megaraid/megaraid_sas_base.c
@@ -33,6 +33,7 @@ 
 #include <linux/fs.h>
 #include <linux/compat.h>
 #include <linux/blkdev.h>
+#include <linux/blk-mq-pci.h>
 #include <linux/mutex.h>
 #include <linux/poll.h>
 #include <linux/vmalloc.h>
@@ -3165,6 +3166,19 @@  megasas_fw_cmds_outstanding_show(struct device *cdev,
 	return snprintf(buf, PAGE_SIZE, "%d\n", atomic_read(&instance->fw_outstanding));
 }
 
+static int megasas_map_queues(struct Scsi_Host *shost)
+{
+	struct megasas_instance *instance = (struct megasas_instance *)
+		shost->hostdata;
+	struct blk_mq_queue_map *qmap = &shost->tag_set.map[HCTX_TYPE_DEFAULT];
+
+	if (smp_affinity_enable && instance->msix_vectors)
+		return blk_mq_pci_map_queues(qmap, instance->pdev, 0);
+	else
+		return blk_mq_map_queues(qmap);
+}
+
+
 static DEVICE_ATTR(fw_crash_buffer, S_IRUGO | S_IWUSR,
 	megasas_fw_crash_buffer_show, megasas_fw_crash_buffer_store);
 static DEVICE_ATTR(fw_crash_buffer_size, S_IRUGO,
@@ -3207,7 +3221,9 @@  static struct scsi_host_template megasas_template = {
 	.shost_attrs = megaraid_host_attrs,
 	.bios_param = megasas_bios_param,
 	.change_queue_depth = scsi_change_queue_depth,
+	.map_queues =  megasas_map_queues,
 	.no_write_same = 1,
+	.host_tagset = 1,
 };
 
 /**
@@ -5407,26 +5423,6 @@  megasas_setup_jbod_map(struct megasas_instance *instance)
 		instance->use_seqnum_jbod_fp = false;
 }
 
-static void megasas_setup_reply_map(struct megasas_instance *instance)
-{
-	const struct cpumask *mask;
-	unsigned int queue, cpu;
-
-	for (queue = 0; queue < instance->msix_vectors; queue++) {
-		mask = pci_irq_get_affinity(instance->pdev, queue);
-		if (!mask)
-			goto fallback;
-
-		for_each_cpu(cpu, mask)
-			instance->reply_map[cpu] = queue;
-	}
-	return;
-
-fallback:
-	for_each_possible_cpu(cpu)
-		instance->reply_map[cpu] = cpu % instance->msix_vectors;
-}
-
 /**
  * megasas_get_device_list -	Get the PD and LD device list from FW.
  * @instance:			Adapter soft state
@@ -5666,8 +5662,6 @@  static int megasas_init_fw(struct megasas_instance *instance)
 			goto fail_init_adapter;
 	}
 
-	megasas_setup_reply_map(instance);
-
 	dev_info(&instance->pdev->dev,
 		"firmware supports msix\t: (%d)", fw_msix_count);
 	dev_info(&instance->pdev->dev,
@@ -6298,6 +6292,8 @@  static int megasas_io_attach(struct megasas_instance *instance)
 	host->max_lun = MEGASAS_MAX_LUN;
 	host->max_cmd_len = 16;
 
+	host->nr_hw_queues = instance->msix_vectors ?: 1;
+
 	/*
 	 * Notify the mid-layer about the new controller
 	 */
@@ -6464,11 +6460,6 @@  static inline int megasas_alloc_mfi_ctrl_mem(struct megasas_instance *instance)
  */
 static int megasas_alloc_ctrl_mem(struct megasas_instance *instance)
 {
-	instance->reply_map = kcalloc(nr_cpu_ids, sizeof(unsigned int),
-				      GFP_KERNEL);
-	if (!instance->reply_map)
-		return -ENOMEM;
-
 	switch (instance->adapter_type) {
 	case MFI_SERIES:
 		if (megasas_alloc_mfi_ctrl_mem(instance))
@@ -6485,8 +6476,6 @@  static int megasas_alloc_ctrl_mem(struct megasas_instance *instance)
 
 	return 0;
  fail:
-	kfree(instance->reply_map);
-	instance->reply_map = NULL;
 	return -ENOMEM;
 }
 
@@ -6499,7 +6488,6 @@  static int megasas_alloc_ctrl_mem(struct megasas_instance *instance)
  */
 static inline void megasas_free_ctrl_mem(struct megasas_instance *instance)
 {
-	kfree(instance->reply_map);
 	if (instance->adapter_type == MFI_SERIES) {
 		if (instance->producer)
 			dma_free_coherent(&instance->pdev->dev, sizeof(u32),
@@ -7142,8 +7130,6 @@  megasas_resume(struct pci_dev *pdev)
 	if (rval < 0)
 		goto fail_reenable_msix;
 
-	megasas_setup_reply_map(instance);
-
 	if (instance->adapter_type != MFI_SERIES) {
 		megasas_reset_reply_desc(instance);
 		if (megasas_ioc_init_fusion(instance)) {
diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c
index 4dfa0685a86c..4f909f32bf5c 100644
--- a/drivers/scsi/megaraid/megaraid_sas_fusion.c
+++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c
@@ -2699,7 +2699,7 @@  megasas_build_ldio_fusion(struct megasas_instance *instance,
 	}
 
 	cmd->request_desc->SCSIIO.MSIxIndex =
-		instance->reply_map[raw_smp_processor_id()];
+		scsi_cmnd_hctx_index(instance->host, scp);
 
 	if (instance->adapter_type >= VENTURA_SERIES) {
 		/* FP for Optimal raid level 1.
@@ -3013,7 +3013,7 @@  megasas_build_syspd_fusion(struct megasas_instance *instance,
 	cmd->request_desc->SCSIIO.DevHandle = io_request->DevHandle;
 
 	cmd->request_desc->SCSIIO.MSIxIndex =
-		instance->reply_map[raw_smp_processor_id()];
+		scsi_cmnd_hctx_index(instance->host, scmd);
 
 	if (!fp_possible) {
 		/* system pd firmware path */