Message ID | 20190531022801.10003-9-ming.lei@redhat.com (mailing list archive) |
---|---|
State | Changes Requested |
Headers | show |
Series | blk-mq/scsi: convert private reply queue into blk_mq hw queue | expand |
On 5/31/19 4:28 AM, Ming Lei wrote: > SCSI's reply qeueue is very similar with blk-mq's hw queue, both > assigned by IRQ vector, so map te private reply queue into blk-mq's hw > queue via .host_tagset. > > Then the private reply mapping can be removed. > > Another benefit is that the request/irq lost issue may be solved in > generic approach because managed IRQ may be shutdown during CPU > hotplug. > > Signed-off-by: Ming Lei <ming.lei@redhat.com> > --- > drivers/scsi/megaraid/megaraid_sas_base.c | 50 ++++++++------------- > drivers/scsi/megaraid/megaraid_sas_fusion.c | 4 +- > 2 files changed, 20 insertions(+), 34 deletions(-) > > diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c > index 3dd1df472dc6..b49999b90231 100644 > --- a/drivers/scsi/megaraid/megaraid_sas_base.c > +++ b/drivers/scsi/megaraid/megaraid_sas_base.c > @@ -33,6 +33,7 @@ > #include <linux/fs.h> > #include <linux/compat.h> > #include <linux/blkdev.h> > +#include <linux/blk-mq-pci.h> > #include <linux/mutex.h> > #include <linux/poll.h> > #include <linux/vmalloc.h> > @@ -3165,6 +3166,19 @@ megasas_fw_cmds_outstanding_show(struct device *cdev, > return snprintf(buf, PAGE_SIZE, "%d\n", atomic_read(&instance->fw_outstanding)); > } > > +static int megasas_map_queues(struct Scsi_Host *shost) > +{ > + struct megasas_instance *instance = (struct megasas_instance *) > + shost->hostdata; > + struct blk_mq_queue_map *qmap = &shost->tag_set.map[HCTX_TYPE_DEFAULT]; > + > + if (smp_affinity_enable && instance->msix_vectors) > + return blk_mq_pci_map_queues(qmap, instance->pdev, 0); > + else > + return blk_mq_map_queues(qmap); > +} > + > + > static DEVICE_ATTR(fw_crash_buffer, S_IRUGO | S_IWUSR, > megasas_fw_crash_buffer_show, megasas_fw_crash_buffer_store); > static DEVICE_ATTR(fw_crash_buffer_size, S_IRUGO, As mentioned, we should be using a common function here. 
> @@ -3207,7 +3221,9 @@ static struct scsi_host_template megasas_template = { > .shost_attrs = megaraid_host_attrs, > .bios_param = megasas_bios_param, > .change_queue_depth = scsi_change_queue_depth, > + .map_queues = megasas_map_queues, > .no_write_same = 1, > + .host_tagset = 1, > }; > > /** > @@ -5407,26 +5423,6 @@ megasas_setup_jbod_map(struct megasas_instance *instance) > instance->use_seqnum_jbod_fp = false; > } > > -static void megasas_setup_reply_map(struct megasas_instance *instance) > -{ > - const struct cpumask *mask; > - unsigned int queue, cpu; > - > - for (queue = 0; queue < instance->msix_vectors; queue++) { > - mask = pci_irq_get_affinity(instance->pdev, queue); > - if (!mask) > - goto fallback; > - > - for_each_cpu(cpu, mask) > - instance->reply_map[cpu] = queue; > - } > - return; > - > -fallback: > - for_each_possible_cpu(cpu) > - instance->reply_map[cpu] = cpu % instance->msix_vectors; > -} > - > /** > * megasas_get_device_list - Get the PD and LD device list from FW. 
> * @instance: Adapter soft state > @@ -5666,8 +5662,6 @@ static int megasas_init_fw(struct megasas_instance *instance) > goto fail_init_adapter; > } > > - megasas_setup_reply_map(instance); > - > dev_info(&instance->pdev->dev, > "firmware supports msix\t: (%d)", fw_msix_count); > dev_info(&instance->pdev->dev, > @@ -6298,6 +6292,8 @@ static int megasas_io_attach(struct megasas_instance *instance) > host->max_lun = MEGASAS_MAX_LUN; > host->max_cmd_len = 16; > > + host->nr_hw_queues = instance->msix_vectors ?: 1; > + > /* > * Notify the mid-layer about the new controller > */ > @@ -6464,11 +6460,6 @@ static inline int megasas_alloc_mfi_ctrl_mem(struct megasas_instance *instance) > */ > static int megasas_alloc_ctrl_mem(struct megasas_instance *instance) > { > - instance->reply_map = kcalloc(nr_cpu_ids, sizeof(unsigned int), > - GFP_KERNEL); > - if (!instance->reply_map) > - return -ENOMEM; > - > switch (instance->adapter_type) { > case MFI_SERIES: > if (megasas_alloc_mfi_ctrl_mem(instance)) > @@ -6485,8 +6476,6 @@ static int megasas_alloc_ctrl_mem(struct megasas_instance *instance) > > return 0; > fail: > - kfree(instance->reply_map); > - instance->reply_map = NULL; > return -ENOMEM; > } > > @@ -6499,7 +6488,6 @@ static int megasas_alloc_ctrl_mem(struct megasas_instance *instance) > */ > static inline void megasas_free_ctrl_mem(struct megasas_instance *instance) > { > - kfree(instance->reply_map); > if (instance->adapter_type == MFI_SERIES) { > if (instance->producer) > dma_free_coherent(&instance->pdev->dev, sizeof(u32), > @@ -7142,8 +7130,6 @@ megasas_resume(struct pci_dev *pdev) > if (rval < 0) > goto fail_reenable_msix; > > - megasas_setup_reply_map(instance); > - > if (instance->adapter_type != MFI_SERIES) { > megasas_reset_reply_desc(instance); > if (megasas_ioc_init_fusion(instance)) { > diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c > index 4dfa0685a86c..4f909f32bf5c 100644 > --- 
a/drivers/scsi/megaraid/megaraid_sas_fusion.c > +++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c > @@ -2699,7 +2699,7 @@ megasas_build_ldio_fusion(struct megasas_instance *instance, > } > > cmd->request_desc->SCSIIO.MSIxIndex = > - instance->reply_map[raw_smp_processor_id()]; > + scsi_cmnd_hctx_index(instance->host, scp); > > if (instance->adapter_type >= VENTURA_SERIES) { > /* FP for Optimal raid level 1. > @@ -3013,7 +3013,7 @@ megasas_build_syspd_fusion(struct megasas_instance *instance, > cmd->request_desc->SCSIIO.DevHandle = io_request->DevHandle; > > cmd->request_desc->SCSIIO.MSIxIndex = > - instance->reply_map[raw_smp_processor_id()]; > + scsi_cmnd_hctx_index(instance->host, scmd); > > if (!fp_possible) { > /* system pd firmware path */ > Otherwise: Reviewed-by: Hannes Reinecke <hare@suse.com> Cheers, Hannes
> SCSI's reply qeueue is very similar with blk-mq's hw queue, both assigned by > IRQ vector, so map te private reply queue into blk-mq's hw queue via > .host_tagset. > > Then the private reply mapping can be removed. > > Another benefit is that the request/irq lost issue may be solved in generic > approach because managed IRQ may be shutdown during CPU hotplug. Ming, I quickly tested this patch series on MegaRaid Aero controller. Without this patch I can get 3.0M IOPS, but once I apply this patch I see only 1.2M IOPS (40% Performance drop) HBA supports 5089 can_queue. <perf top> output without patch - 3.39% [megaraid_sas] [k] complete_cmd_fusion 3.36% [kernel] [k] scsi_queue_rq 3.26% [kernel] [k] entry_SYSCALL_64 2.57% [kernel] [k] syscall_return_via_sysret 1.95% [megaraid_sas] [k] megasas_build_and_issue_cmd_fusion 1.88% [kernel] [k] _raw_spin_lock_irqsave 1.79% [kernel] [k] gup_pmd_range 1.73% [kernel] [k] _raw_spin_lock 1.68% [kernel] [k] __sched_text_start 1.19% [kernel] [k] irq_entries_start 1.13% [kernel] [k] scsi_dec_host_busy 1.08% [kernel] [k] aio_complete 1.07% [kernel] [k] read_tsc 1.01% [kernel] [k] blk_mq_get_request 0.93% [kernel] [k] __update_load_avg_cfs_rq 0.92% [kernel] [k] aio_read_events 0.91% [kernel] [k] lookup_ioctx 0.91% fio [.] 
fio_gettime 0.87% [kernel] [k] set_next_entity 0.87% [megaraid_sas] [k] megasas_build_ldio_fusion <perf top> output with patch - 11.30% [kernel] [k] native_queued_spin_lock_slowpath 3.37% [kernel] [k] sbitmap_any_bit_set 2.91% [kernel] [k] blk_mq_run_hw_queue 2.32% [kernel] [k] _raw_spin_lock_irqsave 2.29% [kernel] [k] menu_select 2.04% [kernel] [k] entry_SYSCALL_64 2.03% [kernel] [k] __sched_text_start 1.70% [kernel] [k] scsi_queue_rq 1.66% [kernel] [k] _raw_spin_lock 1.58% [kernel] [k] syscall_return_via_sysret 1.33% [kernel] [k] native_write_msr 1.20% [kernel] [k] read_tsc 1.13% [kernel] [k] blk_mq_run_hw_queues 1.13% [kernel] [k] __sbq_wake_up 1.01% [kernel] [k] irq_entries_start 1.00% [kernel] [k] switch_mm_irqs_off 0.99% [kernel] [k] gup_pmd_range 0.98% [kernel] [k] __update_load_avg_cfs_rq 0.98% [kernel] [k] set_next_entity 0.92% [kernel] [k] do_idle Kashyap
Hi Kashyap, Thanks for your test. On Sun, Jun 02, 2019 at 03:11:26AM +0530, Kashyap Desai wrote: > > SCSI's reply qeueue is very similar with blk-mq's hw queue, both > assigned by > > IRQ vector, so map te private reply queue into blk-mq's hw queue via > > .host_tagset. > > > > Then the private reply mapping can be removed. > > > > Another benefit is that the request/irq lost issue may be solved in > generic > > approach because managed IRQ may be shutdown during CPU hotplug. > > Ming, > > I quickly tested this patch series on MegaRaid Aero controller. Without > this patch I can get 3.0M IOPS, but once I apply this patch I see only > 1.2M IOPS (40% Performance drop) > HBA supports 5089 can_queue. > > <perf top> output without patch - > > 3.39% [megaraid_sas] [k] complete_cmd_fusion > 3.36% [kernel] [k] scsi_queue_rq > 3.26% [kernel] [k] entry_SYSCALL_64 > 2.57% [kernel] [k] syscall_return_via_sysret > 1.95% [megaraid_sas] [k] megasas_build_and_issue_cmd_fusion > 1.88% [kernel] [k] _raw_spin_lock_irqsave > 1.79% [kernel] [k] gup_pmd_range > 1.73% [kernel] [k] _raw_spin_lock > 1.68% [kernel] [k] __sched_text_start > 1.19% [kernel] [k] irq_entries_start > 1.13% [kernel] [k] scsi_dec_host_busy > 1.08% [kernel] [k] aio_complete > 1.07% [kernel] [k] read_tsc > 1.01% [kernel] [k] blk_mq_get_request > 0.93% [kernel] [k] __update_load_avg_cfs_rq > 0.92% [kernel] [k] aio_read_events > 0.91% [kernel] [k] lookup_ioctx > 0.91% fio [.] fio_gettime > 0.87% [kernel] [k] set_next_entity > 0.87% [megaraid_sas] [k] megasas_build_ldio_fusion > > <perf top> output with patch - > > 11.30% [kernel] [k] native_queued_spin_lock_slowpath I guess there must be one global lock required in megaraid submission path, could you run 'perf record -g -a' to see which lock is and what the stack trace is? Thanks, Ming
On Sun, Jun 02, 2019 at 02:42:02PM +0800, Ming Lei wrote: > Hi Kashyap, > > Thanks for your test. > > On Sun, Jun 02, 2019 at 03:11:26AM +0530, Kashyap Desai wrote: > > > SCSI's reply qeueue is very similar with blk-mq's hw queue, both > > assigned by > > > IRQ vector, so map te private reply queue into blk-mq's hw queue via > > > .host_tagset. > > > > > > Then the private reply mapping can be removed. > > > > > > Another benefit is that the request/irq lost issue may be solved in > > generic > > > approach because managed IRQ may be shutdown during CPU hotplug. > > > > Ming, > > > > I quickly tested this patch series on MegaRaid Aero controller. Without > > this patch I can get 3.0M IOPS, but once I apply this patch I see only > > 1.2M IOPS (40% Performance drop) > > HBA supports 5089 can_queue. > > > > <perf top> output without patch - > > > > 3.39% [megaraid_sas] [k] complete_cmd_fusion > > 3.36% [kernel] [k] scsi_queue_rq > > 3.26% [kernel] [k] entry_SYSCALL_64 > > 2.57% [kernel] [k] syscall_return_via_sysret > > 1.95% [megaraid_sas] [k] megasas_build_and_issue_cmd_fusion > > 1.88% [kernel] [k] _raw_spin_lock_irqsave > > 1.79% [kernel] [k] gup_pmd_range > > 1.73% [kernel] [k] _raw_spin_lock > > 1.68% [kernel] [k] __sched_text_start > > 1.19% [kernel] [k] irq_entries_start > > 1.13% [kernel] [k] scsi_dec_host_busy > > 1.08% [kernel] [k] aio_complete > > 1.07% [kernel] [k] read_tsc > > 1.01% [kernel] [k] blk_mq_get_request > > 0.93% [kernel] [k] __update_load_avg_cfs_rq > > 0.92% [kernel] [k] aio_read_events > > 0.91% [kernel] [k] lookup_ioctx > > 0.91% fio [.] fio_gettime > > 0.87% [kernel] [k] set_next_entity > > 0.87% [megaraid_sas] [k] megasas_build_ldio_fusion > > > > <perf top> output with patch - > > > > 11.30% [kernel] [k] native_queued_spin_lock_slowpath > > I guess there must be one global lock required in megaraid submission path, > could you run 'perf record -g -a' to see which lock is and what the stack > trace is? 
Meantime please try the following patch and see if difference can be made. diff --git a/block/blk-mq.c b/block/blk-mq.c index 49d73d979cb3..d2abec3b0f60 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -589,7 +589,7 @@ static void __blk_mq_complete_request(struct request *rq) * So complete IO reqeust in softirq context in case of single queue * for not degrading IO performance by irqsoff latency. */ - if (q->nr_hw_queues == 1) { + if (q->nr_hw_queues == 1 || (rq->mq_hctx->flags & BLK_MQ_F_HOST_TAGS)) { __blk_complete_request(rq); return; } @@ -1977,7 +1977,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) /* bypass scheduler for flush rq */ blk_insert_flush(rq); blk_mq_run_hw_queue(data.hctx, true); - } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs)) { + } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs || + (data.hctx->flags & BLK_MQ_F_HOST_TAGS))) { /* * Use plugging if we have a ->commit_rqs() hook as well, as * we know the driver uses bd->last in a smart fashion. thanks, Ming
> Meantime please try the following patch and see if difference can be made. > > diff --git a/block/blk-mq.c b/block/blk-mq.c index > 49d73d979cb3..d2abec3b0f60 100644 > --- a/block/blk-mq.c > +++ b/block/blk-mq.c > @@ -589,7 +589,7 @@ static void __blk_mq_complete_request(struct > request *rq) > * So complete IO reqeust in softirq context in case of single queue > * for not degrading IO performance by irqsoff latency. > */ > - if (q->nr_hw_queues == 1) { > + if (q->nr_hw_queues == 1 || (rq->mq_hctx->flags & > BLK_MQ_F_HOST_TAGS)) > +{ > __blk_complete_request(rq); > return; > } > @@ -1977,7 +1977,8 @@ static blk_qc_t blk_mq_make_request(struct > request_queue *q, struct bio *bio) > /* bypass scheduler for flush rq */ > blk_insert_flush(rq); > blk_mq_run_hw_queue(data.hctx, true); > - } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops- > >commit_rqs)) { > + } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs > || > + (data.hctx->flags & BLK_MQ_F_HOST_TAGS))) > { > /* > * Use plugging if we have a ->commit_rqs() hook as well, as > * we know the driver uses bd->last in a smart fashion. Ming - I tried above patch and no improvement in performance. Below is perf record data - lock contention is while getting the tag (blk_mq_get_tag ) 6.67% 6.67% fio [kernel.vmlinux] [k] native_queued_spin_lock_slowpath - 6.66% io_submit - 6.66% entry_SYSCALL_64 - do_syscall_64 - 6.66% __x64_sys_io_submit - 6.66% io_submit_one - 6.66% aio_read - 6.66% generic_file_read_iter - 6.66% blkdev_direct_IO - 6.65% submit_bio - generic_make_request - 6.65% blk_mq_make_request - 6.65% blk_mq_get_request - 6.65% blk_mq_get_tag - 6.58% prepare_to_wait_exclusive - 6.57% _raw_spin_lock_irqsave queued_spin_lock_slowpath > > thanks, > Ming
Hi Kashyap, Thanks for collecting the log. On Sun, Jun 02, 2019 at 10:04:01PM +0530, Kashyap Desai wrote: > > Meantime please try the following patch and see if difference can be > made. > > > > diff --git a/block/blk-mq.c b/block/blk-mq.c index > > 49d73d979cb3..d2abec3b0f60 100644 > > --- a/block/blk-mq.c > > +++ b/block/blk-mq.c > > @@ -589,7 +589,7 @@ static void __blk_mq_complete_request(struct > > request *rq) > > * So complete IO reqeust in softirq context in case of single > queue > > * for not degrading IO performance by irqsoff latency. > > */ > > - if (q->nr_hw_queues == 1) { > > + if (q->nr_hw_queues == 1 || (rq->mq_hctx->flags & > > BLK_MQ_F_HOST_TAGS)) > > +{ > > __blk_complete_request(rq); > > return; > > } > > @@ -1977,7 +1977,8 @@ static blk_qc_t blk_mq_make_request(struct > > request_queue *q, struct bio *bio) > > /* bypass scheduler for flush rq */ > > blk_insert_flush(rq); > > blk_mq_run_hw_queue(data.hctx, true); > > - } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops- > > >commit_rqs)) { > > + } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs > > || > > + (data.hctx->flags & BLK_MQ_F_HOST_TAGS))) > > { > > /* > > * Use plugging if we have a ->commit_rqs() hook as well, > as > > * we know the driver uses bd->last in a smart fashion. > > Ming - > > I tried above patch and no improvement in performance. 
> > Below is perf record data - lock contention is while getting the tag > (blk_mq_get_tag ) > > 6.67% 6.67% fio [kernel.vmlinux] [k] > native_queued_spin_lock_slowpath > - 6.66% io_submit > - 6.66% entry_SYSCALL_64 > - do_syscall_64 > - 6.66% __x64_sys_io_submit > - 6.66% io_submit_one > - 6.66% aio_read > - 6.66% generic_file_read_iter > - 6.66% blkdev_direct_IO > - 6.65% submit_bio > - generic_make_request > - 6.65% blk_mq_make_request > - 6.65% blk_mq_get_request > - 6.65% blk_mq_get_tag > - 6.58% > prepare_to_wait_exclusive > - 6.57% > _raw_spin_lock_irqsave > > queued_spin_lock_slowpath Please drop the patch in my last email, and apply the following patch and see if we can make a difference: diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 3d6780504dcb..69d6bffcc8ff 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -627,6 +627,9 @@ static int hctx_active_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; + if (hctx->flags & BLK_MQ_F_HOST_TAGS) + hctx = blk_mq_master_hctx(hctx); + seq_printf(m, "%d\n", atomic_read(&hctx->nr_active)); return 0; } diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 309ec5079f3f..58ef83a34fda 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -30,6 +30,9 @@ bool blk_mq_has_free_tags(struct blk_mq_tags *tags) */ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { + if (hctx->flags & BLK_MQ_F_HOST_TAGS) + hctx = blk_mq_master_hctx(hctx); + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) atomic_inc(&hctx->tags->active_queues); @@ -55,6 +58,9 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { struct blk_mq_tags *tags = hctx->tags; + if (hctx->flags & BLK_MQ_F_HOST_TAGS) + hctx = blk_mq_master_hctx(hctx); + if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return; @@ -74,6 +80,10 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, if (!hctx || !(hctx->flags & 
BLK_MQ_F_TAG_SHARED)) return true; + + if (hctx->flags & BLK_MQ_F_HOST_TAGS) + hctx = blk_mq_master_hctx(hctx); + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return true; diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 61deab0b5a5a..84e9b46ffc78 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -36,11 +36,22 @@ extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv); +static inline struct blk_mq_hw_ctx *blk_mq_master_hctx( + struct blk_mq_hw_ctx *hctx) +{ + return hctx->queue->queue_hw_ctx[0]; +} + + static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, struct blk_mq_hw_ctx *hctx) { if (!hctx) return &bt->ws[0]; + + if (hctx->flags & BLK_MQ_F_HOST_TAGS) + hctx = blk_mq_master_hctx(hctx); + return sbq_wait_ptr(bt, &hctx->wait_index); } diff --git a/block/blk-mq.c b/block/blk-mq.c index 49d73d979cb3..4196ed3b0085 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -303,7 +303,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, } else { if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) { rq_flags = RQF_MQ_INFLIGHT; - atomic_inc(&data->hctx->nr_active); + blk_mq_inc_nr_active(data->hctx); } rq->tag = tag; rq->internal_tag = -1; @@ -517,7 +517,7 @@ void blk_mq_free_request(struct request *rq) ctx->rq_completed[rq_is_sync(rq)]++; if (rq->rq_flags & RQF_MQ_INFLIGHT) - atomic_dec(&hctx->nr_active); + blk_mq_dec_nr_active(hctx); if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) laptop_io_completion(q->backing_dev_info); @@ -1064,7 +1064,7 @@ bool blk_mq_get_driver_tag(struct request *rq) if (rq->tag >= 0) { if (shared) { rq->rq_flags |= RQF_MQ_INFLIGHT; - atomic_inc(&data.hctx->nr_active); + blk_mq_inc_nr_active(data.hctx); } data.hctx->tags->rqs[rq->tag] = rq; } diff --git a/block/blk-mq.h b/block/blk-mq.h index 633a5a77ee8b..f1279b8c2289 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -193,6 
+193,20 @@ unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part); void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]); +static inline void blk_mq_inc_nr_active(struct blk_mq_hw_ctx *hctx) +{ + if (hctx->flags & BLK_MQ_F_HOST_TAGS) + hctx = blk_mq_master_hctx(hctx); + atomic_inc(&hctx->nr_active); +} + +static inline void blk_mq_dec_nr_active(struct blk_mq_hw_ctx *hctx) +{ + if (hctx->flags & BLK_MQ_F_HOST_TAGS) + hctx = blk_mq_master_hctx(hctx); + atomic_dec(&hctx->nr_active); +} + static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; @@ -218,7 +232,7 @@ static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, if (rq->rq_flags & RQF_MQ_INFLIGHT) { rq->rq_flags &= ~RQF_MQ_INFLIGHT; - atomic_dec(&hctx->nr_active); + blk_mq_dec_nr_active(hctx); } } Thanks, Ming
> > Please drop the patch in my last email, and apply the following patch and see > if we can make a difference: Ming, I dropped early patch and applied the below patched. Now, I am getting expected performance (3.0M IOPS). Below patch fix the performance issue. See perf report after applying the same - 8.52% [kernel] [k] sbitmap_any_bit_set 4.19% [kernel] [k] blk_mq_run_hw_queue 3.76% [megaraid_sas] [k] complete_cmd_fusion 3.24% [kernel] [k] scsi_queue_rq 2.53% [megaraid_sas] [k] megasas_build_ldio_fusion 2.34% [megaraid_sas] [k] megasas_build_and_issue_cmd_fusion 2.18% [kernel] [k] entry_SYSCALL_64 1.85% [kernel] [k] syscall_return_via_sysret 1.78% [kernel] [k] blk_mq_run_hw_queues 1.59% [kernel] [k] gup_pmd_range 1.49% [kernel] [k] _raw_spin_lock_irqsave 1.24% [kernel] [k] scsi_dec_host_busy 1.23% [kernel] [k] blk_mq_free_request 1.23% [kernel] [k] blk_mq_get_request 0.96% [kernel] [k] __slab_free 0.91% [kernel] [k] aio_complete 0.90% [kernel] [k] __sched_text_start 0.89% [megaraid_sas] [k] megasas_queue_command 0.85% [kernel] [k] __fget 0.84% [kernel] [k] scsi_mq_get_budget I will do some more testing and update the results. 
Kashyap > > diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index > 3d6780504dcb..69d6bffcc8ff 100644 > --- a/block/blk-mq-debugfs.c > +++ b/block/blk-mq-debugfs.c > @@ -627,6 +627,9 @@ static int hctx_active_show(void *data, struct seq_file > *m) { > struct blk_mq_hw_ctx *hctx = data; > > + if (hctx->flags & BLK_MQ_F_HOST_TAGS) > + hctx = blk_mq_master_hctx(hctx); > + > seq_printf(m, "%d\n", atomic_read(&hctx->nr_active)); > return 0; > } > diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index > 309ec5079f3f..58ef83a34fda 100644 > --- a/block/blk-mq-tag.c > +++ b/block/blk-mq-tag.c > @@ -30,6 +30,9 @@ bool blk_mq_has_free_tags(struct blk_mq_tags *tags) > */ > bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { > + if (hctx->flags & BLK_MQ_F_HOST_TAGS) > + hctx = blk_mq_master_hctx(hctx); > + > if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && > !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) > atomic_inc(&hctx->tags->active_queues); > @@ -55,6 +58,9 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { > struct blk_mq_tags *tags = hctx->tags; > > + if (hctx->flags & BLK_MQ_F_HOST_TAGS) > + hctx = blk_mq_master_hctx(hctx); > + > if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) > return; > > @@ -74,6 +80,10 @@ static inline bool hctx_may_queue(struct > blk_mq_hw_ctx *hctx, > > if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED)) > return true; > + > + if (hctx->flags & BLK_MQ_F_HOST_TAGS) > + hctx = blk_mq_master_hctx(hctx); > + > if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) > return true; > > diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index > 61deab0b5a5a..84e9b46ffc78 100644 > --- a/block/blk-mq-tag.h > +++ b/block/blk-mq-tag.h > @@ -36,11 +36,22 @@ extern void blk_mq_tag_wakeup_all(struct > blk_mq_tags *tags, bool); void blk_mq_queue_tag_busy_iter(struct > request_queue *q, busy_iter_fn *fn, > void *priv); > > +static inline struct blk_mq_hw_ctx *blk_mq_master_hctx( > + struct blk_mq_hw_ctx *hctx) > +{ > + 
return hctx->queue->queue_hw_ctx[0]; > +} > + > + > static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, > struct blk_mq_hw_ctx *hctx) > { > if (!hctx) > return &bt->ws[0]; > + > + if (hctx->flags & BLK_MQ_F_HOST_TAGS) > + hctx = blk_mq_master_hctx(hctx); > + > return sbq_wait_ptr(bt, &hctx->wait_index); } > > diff --git a/block/blk-mq.c b/block/blk-mq.c index > 49d73d979cb3..4196ed3b0085 100644 > --- a/block/blk-mq.c > +++ b/block/blk-mq.c > @@ -303,7 +303,7 @@ static struct request *blk_mq_rq_ctx_init(struct > blk_mq_alloc_data *data, > } else { > if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) { > rq_flags = RQF_MQ_INFLIGHT; > - atomic_inc(&data->hctx->nr_active); > + blk_mq_inc_nr_active(data->hctx); > } > rq->tag = tag; > rq->internal_tag = -1; > @@ -517,7 +517,7 @@ void blk_mq_free_request(struct request *rq) > > ctx->rq_completed[rq_is_sync(rq)]++; > if (rq->rq_flags & RQF_MQ_INFLIGHT) > - atomic_dec(&hctx->nr_active); > + blk_mq_dec_nr_active(hctx); > > if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) > laptop_io_completion(q->backing_dev_info); > @@ -1064,7 +1064,7 @@ bool blk_mq_get_driver_tag(struct request *rq) > if (rq->tag >= 0) { > if (shared) { > rq->rq_flags |= RQF_MQ_INFLIGHT; > - atomic_inc(&data.hctx->nr_active); > + blk_mq_inc_nr_active(data.hctx); > } > data.hctx->tags->rqs[rq->tag] = rq; > } > diff --git a/block/blk-mq.h b/block/blk-mq.h index > 633a5a77ee8b..f1279b8c2289 100644 > --- a/block/blk-mq.h > +++ b/block/blk-mq.h > @@ -193,6 +193,20 @@ unsigned int blk_mq_in_flight(struct request_queue > *q, struct hd_struct *part); void blk_mq_in_flight_rw(struct request_queue > *q, struct hd_struct *part, > unsigned int inflight[2]); > > +static inline void blk_mq_inc_nr_active(struct blk_mq_hw_ctx *hctx) { > + if (hctx->flags & BLK_MQ_F_HOST_TAGS) > + hctx = blk_mq_master_hctx(hctx); > + atomic_inc(&hctx->nr_active); > +} > + > +static inline void blk_mq_dec_nr_active(struct blk_mq_hw_ctx *hctx) { > + if (hctx->flags & 
BLK_MQ_F_HOST_TAGS) > + hctx = blk_mq_master_hctx(hctx); > + atomic_dec(&hctx->nr_active); > +} > + > static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx) > { > struct request_queue *q = hctx->queue; @@ -218,7 +232,7 @@ static > inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, > > if (rq->rq_flags & RQF_MQ_INFLIGHT) { > rq->rq_flags &= ~RQF_MQ_INFLIGHT; > - atomic_dec(&hctx->nr_active); > + blk_mq_dec_nr_active(hctx); > } > } > > Thanks, > Ming
> > > > > Please drop the patch in my last email, and apply the following patch > > and see if we can make a difference: > > Ming, > > I dropped early patch and applied the below patched. Now, I am getting > expected performance (3.0M IOPS). > Below patch fix the performance issue. See perf report after applying the > same - > > 8.52% [kernel] [k] sbitmap_any_bit_set > 4.19% [kernel] [k] blk_mq_run_hw_queue > 3.76% [megaraid_sas] [k] complete_cmd_fusion > 3.24% [kernel] [k] scsi_queue_rq > 2.53% [megaraid_sas] [k] megasas_build_ldio_fusion > 2.34% [megaraid_sas] [k] megasas_build_and_issue_cmd_fusion > 2.18% [kernel] [k] entry_SYSCALL_64 > 1.85% [kernel] [k] syscall_return_via_sysret > 1.78% [kernel] [k] blk_mq_run_hw_queues > 1.59% [kernel] [k] gup_pmd_range > 1.49% [kernel] [k] _raw_spin_lock_irqsave > 1.24% [kernel] [k] scsi_dec_host_busy > 1.23% [kernel] [k] blk_mq_free_request > 1.23% [kernel] [k] blk_mq_get_request > 0.96% [kernel] [k] __slab_free > 0.91% [kernel] [k] aio_complete > 0.90% [kernel] [k] __sched_text_start > 0.89% [megaraid_sas] [k] megasas_queue_command > 0.85% [kernel] [k] __fget > 0.84% [kernel] [k] scsi_mq_get_budget > > I will do some more testing and update the results. Ming, I did testing on AMD Dual Socket server (AMD EPYC 7601 32-Core Processor). System has total 128 logical cores. Without patch, performance can go upto 2.8M IOPS. See below perf top output. 7.37% [megaraid_sas] [k] complete_cmd_fusion 2.51% [kernel] [k] copy_user_generic_string 2.48% [kernel] [k] read_tsc 2.10% fio [.] thread_main 2.06% [kernel] [k] gup_pgd_range 1.98% [kernel] [k] __get_user_4 1.92% [kernel] [k] entry_SYSCALL_64 1.58% [kernel] [k] scsi_queue_rq 1.55% [megaraid_sas] [k] megasas_queue_command 1.52% [kernel] [k] irq_entries_start 1.43% fio [.] 
get_io_u 1.39% [kernel] [k] blkdev_direct_IO 1.34% [kernel] [k] __audit_syscall_exit 1.31% [megaraid_sas] [k] megasas_build_and_issue_cmd_fusion 1.27% [kernel] [k] syscall_slow_exit_work 1.23% [kernel] [k] io_submit_one 1.20% [kernel] [k] do_syscall_64 1.17% fio [.] td_io_queue 1.16% [kernel] [k] lookup_ioctx 1.14% [kernel] [k] kmem_cache_alloc 1.10% [megaraid_sas] [k] megasas_build_ldio_fusion 1.07% [kernel] [k] __memset 1.06% [kernel] [k] __virt_addr_valid 0.98% [kernel] [k] blk_mq_get_request 0.94% [kernel] [k] note_interrupt 0.91% [kernel] [k] __get_user_8 0.91% [kernel] [k] aio_read_events 0.85% [kernel] [k] __put_user_4 0.78% fio [.] fio_libaio_commit 0.74% [megaraid_sas] [k] MR_BuildRaidContext 0.70% [kernel] [k] __x64_sys_io_submit 0.69% fio [.] utime_since_now With your patch - Performance can go upto 1.7M IOPS. See below perf top output. 23.01% [kernel] [k] sbitmap_any_bit_set 6.42% [kernel] [k] blk_mq_run_hw_queue 4.44% [megaraid_sas] [k] complete_cmd_fusion 4.23% [kernel] [k] blk_mq_run_hw_queues 1.80% [kernel] [k] read_tsc 1.60% [kernel] [k] copy_user_generic_string 1.33% fio [.] thread_main 1.27% [kernel] [k] irq_entries_start 1.22% [kernel] [k] gup_pgd_range 1.20% [kernel] [k] __get_user_4 1.20% [kernel] [k] entry_SYSCALL_64 1.07% [kernel] [k] scsi_queue_rq 0.88% fio [.] get_io_u 0.87% [megaraid_sas] [k] megasas_queue_command 0.86% [kernel] [k] blkdev_direct_IO 0.85% fio [.] td_io_queue 0.80% [kernel] [k] note_interrupt 0.76% [kernel] [k] lookup_ioctx 0.76% [kernel] [k] do_syscall_64 0.75% [megaraid_sas] [k] megasas_build_and_issue_cmd_fusion 0.74% [megaraid_sas] [k] megasas_build_ldio_fusion 0.72% [kernel] [k] kmem_cache_alloc 0.71% [kernel] [k] __audit_syscall_exit 0.67% [kernel] [k] __virt_addr_valid 0.65% [kernel] [k] blk_mq_get_request 0.64% [kernel] [k] __memset 0.62% [kernel] [k] syscall_slow_exit_work 0.60% [kernel] [k] io_submit_one 0.59% [kernel] [k] ktime_get 0.58% fio [.] 
fio_libaio_commit 0.57% [kernel] [k] aio_read_events 0.54% [kernel] [k] __get_user_8 0.53% [kernel] [k] aio_complete_rw 0.51% [kernel] [k] kmem_cache_free With your patch + reducing logical cpu core to 64 (CPU hotplugged), performance can go up to 2.2M IOPS. See below perf top output. 9.56% [kernel] [k] sbitmap_any_bit_set 4.62% [megaraid_sas] [k] complete_cmd_fusion 3.02% [kernel] [k] blk_mq_run_hw_queue 2.15% [kernel] [k] copy_user_generic_string 2.13% [kernel] [k] blk_mq_run_hw_queues 2.09% [kernel] [k] read_tsc 1.66% [kernel] [k] __get_user_4 1.59% [kernel] [k] entry_SYSCALL_64 1.57% [kernel] [k] gup_pgd_range 1.55% fio [.] thread_main 1.51% [kernel] [k] scsi_queue_rq 1.31% [kernel] [k] __memset 1.21% [megaraid_sas] [k] megasas_build_and_issue_cmd_fusion 1.16% [megaraid_sas] [k] megasas_queue_command 1.13% fio [.] get_io_u 1.12% [kernel] [k] blk_mq_get_request 1.07% [kernel] [k] blkdev_direct_IO 1.06% [kernel] [k] __put_user_4 1.05% fio [.] td_io_queue 1.02% [kernel] [k] syscall_slow_exit_work 1.00% [megaraid_sas] [k] megasas_build_ldio_fusion In summary, part of the performance drop may be correlated with the number of hctx created in the block layer. I can provide more details and can test a follow-up patch. 
Kashyap > > Kashyap > > > > > diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index > > 3d6780504dcb..69d6bffcc8ff 100644 > > --- a/block/blk-mq-debugfs.c > > +++ b/block/blk-mq-debugfs.c > > @@ -627,6 +627,9 @@ static int hctx_active_show(void *data, struct > > seq_file > > *m) { > > struct blk_mq_hw_ctx *hctx = data; > > > > + if (hctx->flags & BLK_MQ_F_HOST_TAGS) > > + hctx = blk_mq_master_hctx(hctx); > > + > > seq_printf(m, "%d\n", atomic_read(&hctx->nr_active)); > > return 0; > > } > > diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index > > 309ec5079f3f..58ef83a34fda 100644 > > --- a/block/blk-mq-tag.c > > +++ b/block/blk-mq-tag.c > > @@ -30,6 +30,9 @@ bool blk_mq_has_free_tags(struct blk_mq_tags *tags) > > */ > > bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { > > + if (hctx->flags & BLK_MQ_F_HOST_TAGS) > > + hctx = blk_mq_master_hctx(hctx); > > + > > if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && > > !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) > > atomic_inc(&hctx->tags->active_queues); > > @@ -55,6 +58,9 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) > { > > struct blk_mq_tags *tags = hctx->tags; > > > > + if (hctx->flags & BLK_MQ_F_HOST_TAGS) > > + hctx = blk_mq_master_hctx(hctx); > > + > > if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) > > return; > > > > @@ -74,6 +80,10 @@ static inline bool hctx_may_queue(struct > > blk_mq_hw_ctx *hctx, > > > > if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED)) > > return true; > > + > > + if (hctx->flags & BLK_MQ_F_HOST_TAGS) > > + hctx = blk_mq_master_hctx(hctx); > > + > > if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) > > return true; > > > > diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index > > 61deab0b5a5a..84e9b46ffc78 100644 > > --- a/block/blk-mq-tag.h > > +++ b/block/blk-mq-tag.h > > @@ -36,11 +36,22 @@ extern void blk_mq_tag_wakeup_all(struct > > blk_mq_tags *tags, bool); void blk_mq_queue_tag_busy_iter(struct > > request_queue *q, 
busy_iter_fn *fn, > > void *priv); > > > > +static inline struct blk_mq_hw_ctx *blk_mq_master_hctx( > > + struct blk_mq_hw_ctx *hctx) > > +{ > > + return hctx->queue->queue_hw_ctx[0]; } > > + > > + > > static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, > > struct blk_mq_hw_ctx *hctx) > > { > > if (!hctx) > > return &bt->ws[0]; > > + > > + if (hctx->flags & BLK_MQ_F_HOST_TAGS) > > + hctx = blk_mq_master_hctx(hctx); > > + > > return sbq_wait_ptr(bt, &hctx->wait_index); } > > > > diff --git a/block/blk-mq.c b/block/blk-mq.c index > > 49d73d979cb3..4196ed3b0085 100644 > > --- a/block/blk-mq.c > > +++ b/block/blk-mq.c > > @@ -303,7 +303,7 @@ static struct request *blk_mq_rq_ctx_init(struct > > blk_mq_alloc_data *data, > > } else { > > if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) { > > rq_flags = RQF_MQ_INFLIGHT; > > - atomic_inc(&data->hctx->nr_active); > > + blk_mq_inc_nr_active(data->hctx); > > } > > rq->tag = tag; > > rq->internal_tag = -1; > > @@ -517,7 +517,7 @@ void blk_mq_free_request(struct request *rq) > > > > ctx->rq_completed[rq_is_sync(rq)]++; > > if (rq->rq_flags & RQF_MQ_INFLIGHT) > > - atomic_dec(&hctx->nr_active); > > + blk_mq_dec_nr_active(hctx); > > > > if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) > > laptop_io_completion(q->backing_dev_info); > > @@ -1064,7 +1064,7 @@ bool blk_mq_get_driver_tag(struct request *rq) > > if (rq->tag >= 0) { > > if (shared) { > > rq->rq_flags |= RQF_MQ_INFLIGHT; > > - atomic_inc(&data.hctx->nr_active); > > + blk_mq_inc_nr_active(data.hctx); > > } > > data.hctx->tags->rqs[rq->tag] = rq; > > } > > diff --git a/block/blk-mq.h b/block/blk-mq.h index > > 633a5a77ee8b..f1279b8c2289 100644 > > --- a/block/blk-mq.h > > +++ b/block/blk-mq.h > > @@ -193,6 +193,20 @@ unsigned int blk_mq_in_flight(struct > > request_queue *q, struct hd_struct *part); void > > blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, > > unsigned int inflight[2]); > > > > +static inline void 
blk_mq_inc_nr_active(struct blk_mq_hw_ctx *hctx) { > > + if (hctx->flags & BLK_MQ_F_HOST_TAGS) > > + hctx = blk_mq_master_hctx(hctx); > > + atomic_inc(&hctx->nr_active); > > +} > > + > > +static inline void blk_mq_dec_nr_active(struct blk_mq_hw_ctx *hctx) { > > + if (hctx->flags & BLK_MQ_F_HOST_TAGS) > > + hctx = blk_mq_master_hctx(hctx); > > + atomic_dec(&hctx->nr_active); > > +} > > + > > static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx > > *hctx) { > > struct request_queue *q = hctx->queue; @@ -218,7 +232,7 @@ static > > inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, > > > > if (rq->rq_flags & RQF_MQ_INFLIGHT) { > > rq->rq_flags &= ~RQF_MQ_INFLIGHT; > > - atomic_dec(&hctx->nr_active); > > + blk_mq_dec_nr_active(hctx); > > } > > } > > > > Thanks, > > Ming
diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 3dd1df472dc6..b49999b90231 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -33,6 +33,7 @@ #include <linux/fs.h> #include <linux/compat.h> #include <linux/blkdev.h> +#include <linux/blk-mq-pci.h> #include <linux/mutex.h> #include <linux/poll.h> #include <linux/vmalloc.h> @@ -3165,6 +3166,19 @@ megasas_fw_cmds_outstanding_show(struct device *cdev, return snprintf(buf, PAGE_SIZE, "%d\n", atomic_read(&instance->fw_outstanding)); } +static int megasas_map_queues(struct Scsi_Host *shost) +{ + struct megasas_instance *instance = (struct megasas_instance *) + shost->hostdata; + struct blk_mq_queue_map *qmap = &shost->tag_set.map[HCTX_TYPE_DEFAULT]; + + if (smp_affinity_enable && instance->msix_vectors) + return blk_mq_pci_map_queues(qmap, instance->pdev, 0); + else + return blk_mq_map_queues(qmap); +} + + static DEVICE_ATTR(fw_crash_buffer, S_IRUGO | S_IWUSR, megasas_fw_crash_buffer_show, megasas_fw_crash_buffer_store); static DEVICE_ATTR(fw_crash_buffer_size, S_IRUGO, @@ -3207,7 +3221,9 @@ static struct scsi_host_template megasas_template = { .shost_attrs = megaraid_host_attrs, .bios_param = megasas_bios_param, .change_queue_depth = scsi_change_queue_depth, + .map_queues = megasas_map_queues, .no_write_same = 1, + .host_tagset = 1, }; /** @@ -5407,26 +5423,6 @@ megasas_setup_jbod_map(struct megasas_instance *instance) instance->use_seqnum_jbod_fp = false; } -static void megasas_setup_reply_map(struct megasas_instance *instance) -{ - const struct cpumask *mask; - unsigned int queue, cpu; - - for (queue = 0; queue < instance->msix_vectors; queue++) { - mask = pci_irq_get_affinity(instance->pdev, queue); - if (!mask) - goto fallback; - - for_each_cpu(cpu, mask) - instance->reply_map[cpu] = queue; - } - return; - -fallback: - for_each_possible_cpu(cpu) - instance->reply_map[cpu] = cpu % instance->msix_vectors; -} - 
/** * megasas_get_device_list - Get the PD and LD device list from FW. * @instance: Adapter soft state @@ -5666,8 +5662,6 @@ static int megasas_init_fw(struct megasas_instance *instance) goto fail_init_adapter; } - megasas_setup_reply_map(instance); - dev_info(&instance->pdev->dev, "firmware supports msix\t: (%d)", fw_msix_count); dev_info(&instance->pdev->dev, @@ -6298,6 +6292,8 @@ static int megasas_io_attach(struct megasas_instance *instance) host->max_lun = MEGASAS_MAX_LUN; host->max_cmd_len = 16; + host->nr_hw_queues = instance->msix_vectors ?: 1; + /* * Notify the mid-layer about the new controller */ @@ -6464,11 +6460,6 @@ static inline int megasas_alloc_mfi_ctrl_mem(struct megasas_instance *instance) */ static int megasas_alloc_ctrl_mem(struct megasas_instance *instance) { - instance->reply_map = kcalloc(nr_cpu_ids, sizeof(unsigned int), - GFP_KERNEL); - if (!instance->reply_map) - return -ENOMEM; - switch (instance->adapter_type) { case MFI_SERIES: if (megasas_alloc_mfi_ctrl_mem(instance)) @@ -6485,8 +6476,6 @@ static int megasas_alloc_ctrl_mem(struct megasas_instance *instance) return 0; fail: - kfree(instance->reply_map); - instance->reply_map = NULL; return -ENOMEM; } @@ -6499,7 +6488,6 @@ static int megasas_alloc_ctrl_mem(struct megasas_instance *instance) */ static inline void megasas_free_ctrl_mem(struct megasas_instance *instance) { - kfree(instance->reply_map); if (instance->adapter_type == MFI_SERIES) { if (instance->producer) dma_free_coherent(&instance->pdev->dev, sizeof(u32), @@ -7142,8 +7130,6 @@ megasas_resume(struct pci_dev *pdev) if (rval < 0) goto fail_reenable_msix; - megasas_setup_reply_map(instance); - if (instance->adapter_type != MFI_SERIES) { megasas_reset_reply_desc(instance); if (megasas_ioc_init_fusion(instance)) { diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c index 4dfa0685a86c..4f909f32bf5c 100644 --- a/drivers/scsi/megaraid/megaraid_sas_fusion.c +++ 
b/drivers/scsi/megaraid/megaraid_sas_fusion.c @@ -2699,7 +2699,7 @@ megasas_build_ldio_fusion(struct megasas_instance *instance, } cmd->request_desc->SCSIIO.MSIxIndex = - instance->reply_map[raw_smp_processor_id()]; + scsi_cmnd_hctx_index(instance->host, scp); if (instance->adapter_type >= VENTURA_SERIES) { /* FP for Optimal raid level 1. @@ -3013,7 +3013,7 @@ megasas_build_syspd_fusion(struct megasas_instance *instance, cmd->request_desc->SCSIIO.DevHandle = io_request->DevHandle; cmd->request_desc->SCSIIO.MSIxIndex = - instance->reply_map[raw_smp_processor_id()]; + scsi_cmnd_hctx_index(instance->host, scmd); if (!fp_possible) { /* system pd firmware path */
SCSI's reply queue is very similar to blk-mq's hw queue, both assigned by IRQ vector, so map the private reply queue into blk-mq's hw queue via .host_tagset. Then the private reply mapping can be removed. Another benefit is that the request/irq lost issue may be solved in a generic approach because managed IRQ may be shut down during CPU hotplug. Signed-off-by: Ming Lei <ming.lei@redhat.com> --- drivers/scsi/megaraid/megaraid_sas_base.c | 50 ++++++++------------- drivers/scsi/megaraid/megaraid_sas_fusion.c | 4 +- 2 files changed, 20 insertions(+), 34 deletions(-)