Message ID | 541C28E0.7010705@acm.org (mailing list archive) |
---|---|
State | Superseded, archived |
Headers | show |
On Fri, Sep 19, 2014 at 9:00 PM, Bart Van Assche <bvanassche@acm.org> wrote: > Improve performance by using multiple RDMA/RC channels per SCSI host > for communicating with an SRP target. > > Signed-off-by: Bart Van Assche <bvanassche@acm.org> > --- > Documentation/ABI/stable/sysfs-driver-ib_srp | 25 +- > drivers/infiniband/ulp/srp/ib_srp.c | 337 ++++++++++++++++++++------- > drivers/infiniband/ulp/srp/ib_srp.h | 20 +- > 3 files changed, 287 insertions(+), 95 deletions(-) > > diff --git a/Documentation/ABI/stable/sysfs-driver-ib_srp b/Documentation/ABI/stable/sysfs-driver-ib_srp > index b9688de..d5a459e 100644 > --- a/Documentation/ABI/stable/sysfs-driver-ib_srp > +++ b/Documentation/ABI/stable/sysfs-driver-ib_srp > @@ -55,12 +55,12 @@ Description: Interface for making ib_srp connect to a new target. > only safe with partial memory descriptor list support enabled > (allow_ext_sg=1). > * comp_vector, a number in the range 0..n-1 specifying the > - MSI-X completion vector. Some HCA's allocate multiple (n) > - MSI-X vectors per HCA port. If the IRQ affinity masks of > - these interrupts have been configured such that each MSI-X > - interrupt is handled by a different CPU then the comp_vector > - parameter can be used to spread the SRP completion workload > - over multiple CPU's. > + MSI-X completion vector of the first RDMA channel. Some > + HCA's allocate multiple (n) MSI-X vectors per HCA port. If > + the IRQ affinity masks of these interrupts have been > + configured such that each MSI-X interrupt is handled by a > + different CPU then the comp_vector parameter can be used to > + spread the SRP completion workload over multiple CPU's. > * tl_retry_count, a number in the range 2..7 specifying the > IB RC retry count. > * queue_size, the maximum number of commands that the > @@ -88,6 +88,13 @@ Description: Whether ib_srp is allowed to include a partial memory > descriptor list in an SRP_CMD when communicating with an SRP > target. 
> > +What: /sys/class/scsi_host/host<n>/ch_count > +Date: November 1, 2014 > +KernelVersion: 3.18 > +Contact: linux-rdma@vger.kernel.org > +Description: Number of RDMA channels used for communication with the SRP > + target. > + > What: /sys/class/scsi_host/host<n>/cmd_sg_entries > Date: May 19, 2011 > KernelVersion: 2.6.39 > @@ -95,6 +102,12 @@ Contact: linux-rdma@vger.kernel.org > Description: Maximum number of data buffer descriptors that may be sent to > the target in a single SRP_CMD request. > > +What: /sys/class/scsi_host/host<n>/comp_vector > +Date: September 2, 2013 > +KernelVersion: 3.11 > +Contact: linux-rdma@vger.kernel.org > +Description: Completion vector used for the first RDMA channel. > + > What: /sys/class/scsi_host/host<n>/dgid > Date: June 17, 2006 > KernelVersion: 2.6.17 > diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c > index 9feeea1..58ca618 100644 > --- a/drivers/infiniband/ulp/srp/ib_srp.c > +++ b/drivers/infiniband/ulp/srp/ib_srp.c > @@ -123,6 +123,16 @@ MODULE_PARM_DESC(dev_loss_tmo, > " if fast_io_fail_tmo has not been set. \"off\" means that" > " this functionality is disabled."); > > +static unsigned ch_count; > +module_param(ch_count, uint, 0444); > +MODULE_PARM_DESC(ch_count, > + "Number of RDMA channels to use for communication with an SRP" > + " target. Using more than one channel improves performance" > + " if the HCA supports multiple completion vectors. The" > + " default value is the minimum of four times the number of" > + " online CPU sockets and the number of completion vectors" > + " supported by the HCA."); > + > static void srp_add_one(struct ib_device *device); > static void srp_remove_one(struct ib_device *device); > static void srp_recv_completion(struct ib_cq *cq, void *ch_ptr); > @@ -556,17 +566,32 @@ err: > * Note: this function may be called without srp_alloc_iu_bufs() having been > * invoked. Hence the ch->[rt]x_ring checks. 
> */ > -static void srp_free_ch_ib(struct srp_rdma_ch *ch) > +static void srp_free_ch_ib(struct srp_target_port *target, > + struct srp_rdma_ch *ch) > { > - struct srp_target_port *target = ch->target; > struct srp_device *dev = target->srp_host->srp_dev; > int i; > > + if (!ch->target) > + return; > + > + /* > + * Avoid that the SCSI error handler tries to use this channel after > + * it has been freed. The SCSI error handler can namely continue > + * trying to perform recovery actions after scsi_remove_host() > + * returned. > + */ > + ch->target = NULL; > + > if (ch->cm_id) { > ib_destroy_cm_id(ch->cm_id); > ch->cm_id = NULL; > } > > + /* If srp_new_cm_id() succeeded but srp_create_ch_ib() not, return. */ > + if (!ch->qp) > + return; > + > if (dev->use_fast_reg) { > if (ch->fr_pool) > srp_destroy_fr_pool(ch->fr_pool); > @@ -647,7 +672,7 @@ static int srp_lookup_path(struct srp_rdma_ch *ch) > return ch->status; > } > > -static int srp_send_req(struct srp_rdma_ch *ch) > +static int srp_send_req(struct srp_rdma_ch *ch, bool multich) > { > struct srp_target_port *target = ch->target; > struct { > @@ -688,6 +713,8 @@ static int srp_send_req(struct srp_rdma_ch *ch) > req->priv.req_it_iu_len = cpu_to_be32(target->max_iu_len); > req->priv.req_buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT | > SRP_BUF_FORMAT_INDIRECT); > + req->priv.req_flags = (multich ? SRP_MULTICHAN_MULTI : > + SRP_MULTICHAN_SINGLE); > /* > * In the published SRP specification (draft rev. 
16a), the > * port identifier format is 8 bytes of ID extension followed > @@ -769,27 +796,31 @@ static bool srp_change_conn_state(struct srp_target_port *target, > > static void srp_disconnect_target(struct srp_target_port *target) > { > - struct srp_rdma_ch *ch = &target->ch; > + struct srp_rdma_ch *ch; > + int i; > > if (srp_change_conn_state(target, false)) { > /* XXX should send SRP_I_LOGOUT request */ > > - if (ib_send_cm_dreq(ch->cm_id, NULL, 0)) { > - shost_printk(KERN_DEBUG, target->scsi_host, > - PFX "Sending CM DREQ failed\n"); > + for (i = 0; i < target->ch_count; i++) { > + ch = &target->ch[i]; > + if (ch->cm_id && ib_send_cm_dreq(ch->cm_id, NULL, 0)) { > + shost_printk(KERN_DEBUG, target->scsi_host, > + PFX "Sending CM DREQ failed\n"); > + } > } > } > } > > -static void srp_free_req_data(struct srp_rdma_ch *ch) > +static void srp_free_req_data(struct srp_target_port *target, > + struct srp_rdma_ch *ch) > { > - struct srp_target_port *target = ch->target; > struct srp_device *dev = target->srp_host->srp_dev; > struct ib_device *ibdev = dev->dev; > struct srp_request *req; > int i; > > - if (!ch->req_ring) > + if (!ch->target || !ch->req_ring) > return; > > for (i = 0; i < target->req_ring_size; ++i) { > @@ -853,7 +884,7 @@ static int srp_alloc_req_data(struct srp_rdma_ch *ch) > goto out; > > req->indirect_dma_addr = dma_addr; > - req->index = i; > + req->tag = build_srp_tag(ch - target->ch, i); > list_add_tail(&req->list, &ch->free_reqs); > } > ret = 0; > @@ -879,7 +910,8 @@ static void srp_del_scsi_host_attr(struct Scsi_Host *shost) > > static void srp_remove_target(struct srp_target_port *target) > { > - struct srp_rdma_ch *ch = &target->ch; > + struct srp_rdma_ch *ch; > + int i; > > WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED); > > @@ -889,10 +921,18 @@ static void srp_remove_target(struct srp_target_port *target) > scsi_remove_host(target->scsi_host); > srp_stop_rport_timers(target->rport); > srp_disconnect_target(target); > - 
srp_free_ch_ib(ch); > + for (i = 0; i < target->ch_count; i++) { > + ch = &target->ch[i]; > + srp_free_ch_ib(target, ch); > + } > cancel_work_sync(&target->tl_err_work); > srp_rport_put(target->rport); > - srp_free_req_data(ch); > + for (i = 0; i < target->ch_count; i++) { > + ch = &target->ch[i]; > + srp_free_req_data(target, ch); > + } > + kfree(target->ch); > + target->ch = NULL; > > spin_lock(&target->srp_host->target_lock); > list_del(&target->list); > @@ -918,12 +958,12 @@ static void srp_rport_delete(struct srp_rport *rport) > srp_queue_remove_work(target); > } > > -static int srp_connect_ch(struct srp_rdma_ch *ch) > +static int srp_connect_ch(struct srp_rdma_ch *ch, bool multich) > { > struct srp_target_port *target = ch->target; > int ret; > > - WARN_ON_ONCE(target->connected); > + WARN_ON_ONCE(!multich && target->connected); > > target->qp_in_error = false; > > @@ -933,7 +973,7 @@ static int srp_connect_ch(struct srp_rdma_ch *ch) > > while (1) { > init_completion(&ch->done); > - ret = srp_send_req(ch); > + ret = srp_send_req(ch, multich); > if (ret) > return ret; > ret = wait_for_completion_interruptible(&ch->done); > @@ -1095,10 +1135,10 @@ static void srp_finish_req(struct srp_rdma_ch *ch, struct srp_request *req, > static void srp_terminate_io(struct srp_rport *rport) > { > struct srp_target_port *target = rport->lld_data; > - struct srp_rdma_ch *ch = &target->ch; > + struct srp_rdma_ch *ch; > struct Scsi_Host *shost = target->scsi_host; > struct scsi_device *sdev; > - int i; > + int i, j; > > /* > * Invoking srp_terminate_io() while srp_queuecommand() is running > @@ -1107,10 +1147,15 @@ static void srp_terminate_io(struct srp_rport *rport) > shost_for_each_device(sdev, shost) > WARN_ON_ONCE(sdev->request_queue->request_fn_active); > > - for (i = 0; i < target->req_ring_size; ++i) { > - struct srp_request *req = &ch->req_ring[i]; > + for (i = 0; i < target->ch_count; i++) { > + ch = &target->ch[i]; > + > + for (j = 0; j < target->req_ring_size; ++j) { 
> + struct srp_request *req = &ch->req_ring[j]; > > - srp_finish_req(ch, req, NULL, DID_TRANSPORT_FAILFAST << 16); > + srp_finish_req(ch, req, NULL, > + DID_TRANSPORT_FAILFAST << 16); > + } > } > } > > @@ -1126,8 +1171,9 @@ static void srp_terminate_io(struct srp_rport *rport) > static int srp_rport_reconnect(struct srp_rport *rport) > { > struct srp_target_port *target = rport->lld_data; > - struct srp_rdma_ch *ch = &target->ch; > - int i, ret; > + struct srp_rdma_ch *ch; > + int i, j, ret = 0; > + bool multich = false; > > srp_disconnect_target(target); > > @@ -1139,27 +1185,43 @@ static int srp_rport_reconnect(struct srp_rport *rport) > * case things are really fouled up. Doing so also ensures that all CM > * callbacks will have finished before a new QP is allocated. > */ > - ret = srp_new_cm_id(ch); > - > - for (i = 0; i < target->req_ring_size; ++i) { > - struct srp_request *req = &ch->req_ring[i]; > - > - srp_finish_req(ch, req, NULL, DID_RESET << 16); > + for (i = 0; i < target->ch_count; i++) { > + ch = &target->ch[i]; > + if (!ch->target) > + return -ENODEV; > + ret += srp_new_cm_id(ch); > + } > + for (i = 0; i < target->ch_count; i++) { > + ch = &target->ch[i]; > + for (j = 0; j < target->req_ring_size; ++j) { > + struct srp_request *req = &ch->req_ring[j]; > + > + srp_finish_req(ch, req, NULL, DID_RESET << 16); > + } > } > + for (i = 0; i < target->ch_count; i++) { > + ch = &target->ch[i]; > + /* > + * Whether or not creating a new CM ID succeeded, create a new > + * QP. This guarantees that all completion callback function > + * invocations have finished before request resetting starts. > + */ > + ret += srp_create_ch_ib(ch); > > - /* > - * Whether or not creating a new CM ID succeeded, create a new > - * QP. This guarantees that all callback functions for the old QP have > - * finished before any send requests are posted on the new QP. 
> - */ > - ret += srp_create_ch_ib(ch); > - > - INIT_LIST_HEAD(&ch->free_tx); > - for (i = 0; i < target->queue_size; ++i) > - list_add(&ch->tx_ring[i]->list, &ch->free_tx); > - > - if (ret == 0) > - ret = srp_connect_ch(ch); > + INIT_LIST_HEAD(&ch->free_tx); > + for (j = 0; j < target->queue_size; ++j) > + list_add(&ch->tx_ring[j]->list, &ch->free_tx); > + } > + for (i = 0; i < target->ch_count; i++) { > + ch = &target->ch[i]; > + if (ret) { > + if (i > 1) > + ret = 0; > + break; > + } > + ret = srp_connect_ch(ch, multich); > + multich = true; > + } > > if (ret == 0) > shost_printk(KERN_INFO, target->scsi_host, > @@ -1573,7 +1635,7 @@ static struct srp_iu *__srp_get_tx_iu(struct srp_rdma_ch *ch, > s32 rsv = (iu_type == SRP_IU_TSK_MGMT) ? 0 : SRP_TSK_MGMT_SQ_SIZE; > struct srp_iu *iu; > > - srp_send_completion(ch->send_cq, target); > + srp_send_completion(ch->send_cq, ch); > > if (list_empty(&ch->free_tx)) > return NULL; > @@ -1637,6 +1699,7 @@ static void srp_process_rsp(struct srp_rdma_ch *ch, struct srp_rsp *rsp) > struct srp_request *req; > struct scsi_cmnd *scmnd; > unsigned long flags; > + unsigned i; > > if (unlikely(rsp->tag & SRP_TAG_TSK_MGMT)) { > spin_lock_irqsave(&ch->lock, flags); > @@ -1648,12 +1711,20 @@ static void srp_process_rsp(struct srp_rdma_ch *ch, struct srp_rsp *rsp) > ch->tsk_mgmt_status = rsp->data[3]; > complete(&ch->tsk_mgmt_done); > } else { > - req = &ch->req_ring[rsp->tag]; > - scmnd = srp_claim_req(ch, req, NULL, NULL); > + if (srp_tag_ch(rsp->tag) != ch - target->ch) > + pr_err("Channel idx mismatch: tag %#llx <> ch %#lx\n", > + rsp->tag, ch - target->ch); > + i = srp_tag_idx(rsp->tag); > + if (i < target->req_ring_size) { > + req = &ch->req_ring[i]; > + scmnd = srp_claim_req(ch, req, NULL, NULL); > + } else { > + scmnd = NULL; > + } > if (!scmnd) { > shost_printk(KERN_ERR, target->scsi_host, > - "Null scmnd for RSP w/tag %016llx\n", > - (unsigned long long) rsp->tag); > + "Null scmnd for RSP w/tag %#016llx received on ch %ld / QP 
%#x\n", > + rsp->tag, ch - target->ch, ch->qp->qp_num); > > spin_lock_irqsave(&ch->lock, flags); > ch->req_lim += be32_to_cpu(rsp->req_lim_delta); > @@ -1879,7 +1950,8 @@ static void srp_send_completion(struct ib_cq *cq, void *ch_ptr) > } > } > > -static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) > +static int srp_queuecommand(unsigned hwq, struct Scsi_Host *shost, > + struct scsi_cmnd *scmnd) > { > struct srp_target_port *target = host_to_target(shost); > struct srp_rport *rport = target->rport; > @@ -1905,7 +1977,7 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) > if (unlikely(scmnd->result)) > goto err; > > - ch = &target->ch; > + ch = &target->ch[hwq]; > > spin_lock_irqsave(&ch->lock, flags); > iu = __srp_get_tx_iu(ch, SRP_IU_CMD); > @@ -1927,7 +1999,7 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) > > cmd->opcode = SRP_CMD; > cmd->lun = cpu_to_be64((u64) scmnd->device->lun << 48); > - cmd->tag = req->index; > + cmd->tag = req->tag; > memcpy(cmd->cdb, scmnd->cmnd, scmnd->cmd_len); > > req->scmnd = scmnd; > @@ -1993,6 +2065,17 @@ err: > goto unlock_rport; > } > > +static int srp_sq_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) > +{ > + return srp_queuecommand(0, shost, scmnd); > +} > + > +static int srp_mq_queuecommand(struct blk_mq_hw_ctx *hctx, > + struct scsi_cmnd *scmnd) > +{ > + return srp_queuecommand(hctx->queue_num, scmnd->device->host, scmnd); > +} > + > /* > * Note: the resources allocated in this function are freed in > * srp_free_ch_ib(). 
> @@ -2409,15 +2492,23 @@ static int srp_abort(struct scsi_cmnd *scmnd) > { > struct srp_target_port *target = host_to_target(scmnd->device->host); > struct srp_request *req = (struct srp_request *) scmnd->host_scribble; > + u16 ch_idx; > struct srp_rdma_ch *ch; > int ret; > > shost_printk(KERN_ERR, target->scsi_host, "SRP abort called\n"); > > - ch = &target->ch; > - if (!req || !srp_claim_req(ch, req, NULL, scmnd)) > + if (!req) > + return SUCCESS; > + ch_idx = srp_tag_ch(req->tag); > + if (WARN_ON_ONCE(ch_idx >= target->ch_count)) > return SUCCESS; > - if (srp_send_tsk_mgmt(ch, req->index, scmnd->device->lun, > + ch = &target->ch[ch_idx]; > + if (!srp_claim_req(ch, req, NULL, scmnd)) > + return SUCCESS; > + shost_printk(KERN_ERR, target->scsi_host, > + "Sending SRP abort for tag %#x\n", req->tag); > + if (srp_send_tsk_mgmt(ch, req->tag, scmnd->device->lun, > SRP_TSK_ABORT_TASK) == 0) > ret = SUCCESS; > else if (target->rport->state == SRP_RPORT_LOST) > @@ -2434,21 +2525,25 @@ static int srp_abort(struct scsi_cmnd *scmnd) > static int srp_reset_device(struct scsi_cmnd *scmnd) > { > struct srp_target_port *target = host_to_target(scmnd->device->host); > - struct srp_rdma_ch *ch = &target->ch; > + struct srp_rdma_ch *ch; > int i; > > shost_printk(KERN_ERR, target->scsi_host, "SRP reset_device called\n"); > > + ch = &target->ch[0]; > if (srp_send_tsk_mgmt(ch, SRP_TAG_NO_REQ, scmnd->device->lun, > SRP_TSK_LUN_RESET)) > return FAILED; > if (ch->tsk_mgmt_status) > return FAILED; > > - for (i = 0; i < target->req_ring_size; ++i) { > - struct srp_request *req = &ch->req_ring[i]; > + for (i = 0; i < target->ch_count; i++) { > + ch = &target->ch[i]; > + for (i = 0; i < target->req_ring_size; ++i) { > + struct srp_request *req = &ch->req_ring[i]; > > - srp_finish_req(ch, req, scmnd->device, DID_RESET << 16); > + srp_finish_req(ch, req, scmnd->device, DID_RESET << 16); > + } > } > > return SUCCESS; > @@ -2525,7 +2620,7 @@ static ssize_t show_dgid(struct device *dev, struct 
device_attribute *attr, > char *buf) > { > struct srp_target_port *target = host_to_target(class_to_shost(dev)); > - struct srp_rdma_ch *ch = &target->ch; > + struct srp_rdma_ch *ch = &target->ch[0]; > > return sprintf(buf, "%pI6\n", ch->path.dgid.raw); > } > @@ -2542,8 +2637,14 @@ static ssize_t show_req_lim(struct device *dev, > struct device_attribute *attr, char *buf) > { > struct srp_target_port *target = host_to_target(class_to_shost(dev)); > + struct srp_rdma_ch *ch; > + int i, req_lim = INT_MAX; > > - return sprintf(buf, "%d\n", target->ch.req_lim); > + for (i = 0; i < target->ch_count; i++) { > + ch = &target->ch[i]; > + req_lim = min(req_lim, ch->req_lim); > + } > + return sprintf(buf, "%d\n", req_lim); > } > > static ssize_t show_zero_req_lim(struct device *dev, > @@ -2570,6 +2671,14 @@ static ssize_t show_local_ib_device(struct device *dev, > return sprintf(buf, "%s\n", target->srp_host->srp_dev->dev->name); > } > > +static ssize_t show_ch_count(struct device *dev, struct device_attribute *attr, > + char *buf) > +{ > + struct srp_target_port *target = host_to_target(class_to_shost(dev)); > + > + return sprintf(buf, "%d\n", target->ch_count); > +} > + > static ssize_t show_comp_vector(struct device *dev, > struct device_attribute *attr, char *buf) > { > @@ -2613,6 +2722,7 @@ static DEVICE_ATTR(req_lim, S_IRUGO, show_req_lim, NULL); > static DEVICE_ATTR(zero_req_lim, S_IRUGO, show_zero_req_lim, NULL); > static DEVICE_ATTR(local_ib_port, S_IRUGO, show_local_ib_port, NULL); > static DEVICE_ATTR(local_ib_device, S_IRUGO, show_local_ib_device, NULL); > +static DEVICE_ATTR(ch_count, S_IRUGO, show_ch_count, NULL); > static DEVICE_ATTR(comp_vector, S_IRUGO, show_comp_vector, NULL); > static DEVICE_ATTR(tl_retry_count, S_IRUGO, show_tl_retry_count, NULL); > static DEVICE_ATTR(cmd_sg_entries, S_IRUGO, show_cmd_sg_entries, NULL); > @@ -2630,6 +2740,7 @@ static struct device_attribute *srp_host_attrs[] = { > &dev_attr_zero_req_lim, > &dev_attr_local_ib_port, > 
&dev_attr_local_ib_device, > + &dev_attr_ch_count, > &dev_attr_comp_vector, > &dev_attr_tl_retry_count, > &dev_attr_cmd_sg_entries, > @@ -2643,7 +2754,8 @@ static struct scsi_host_template srp_template = { > .proc_name = DRV_NAME, > .slave_configure = srp_slave_configure, > .info = srp_target_info, > - .queuecommand = srp_queuecommand, > + .queuecommand = srp_sq_queuecommand, > + .mq_queuecommand = srp_mq_queuecommand, Another choice is to obtain hctx from request directly, then mq can reuse the .queuecommand interface too. > .change_queue_depth = srp_change_queue_depth, > .change_queue_type = srp_change_queue_type, > .eh_abort_handler = srp_abort, > @@ -3038,7 +3150,8 @@ static ssize_t srp_create_target(struct device *dev, > struct srp_rdma_ch *ch; > struct srp_device *srp_dev = host->srp_dev; > struct ib_device *ibdev = srp_dev->dev; > - int ret; > + int ret, node_idx, node, cpu, i; > + bool multich = false; > > target_host = scsi_host_alloc(&srp_template, > sizeof (struct srp_target_port)); > @@ -3098,34 +3211,82 @@ static ssize_t srp_create_target(struct device *dev, > INIT_WORK(&target->tl_err_work, srp_tl_err_work); > INIT_WORK(&target->remove_work, srp_remove_work); > spin_lock_init(&target->lock); > - ch = &target->ch; > - ch->target = target; > - ch->comp_vector = target->comp_vector; > - spin_lock_init(&ch->lock); > - INIT_LIST_HEAD(&ch->free_tx); > - ret = srp_alloc_req_data(ch); > - if (ret) > - goto err_free_mem; > - > ret = ib_query_gid(ibdev, host->port, 0, &target->sgid); > if (ret) > - goto err_free_mem; > + goto err; > > - ret = srp_create_ch_ib(ch); > - if (ret) > - goto err_free_mem; > + ret = -ENOMEM; > + target->ch_count = max_t(unsigned, num_online_nodes(), > + min(ch_count ? 
: > + min(4 * num_online_nodes(), > + ibdev->num_comp_vectors), > + num_online_cpus())); > + target->ch = kcalloc(target->ch_count, sizeof(*target->ch), > + GFP_KERNEL); > + if (!target->ch) > + goto err; > > - ret = srp_new_cm_id(ch); > - if (ret) > - goto err_free_ib; > + node_idx = 0; > + for_each_online_node(node) { > + const int ch_start = (node_idx * target->ch_count / > + num_online_nodes()); > + const int ch_end = ((node_idx + 1) * target->ch_count / > + num_online_nodes()); > + const int cv_start = (node_idx * ibdev->num_comp_vectors / > + num_online_nodes() + target->comp_vector) > + % ibdev->num_comp_vectors; > + const int cv_end = ((node_idx + 1) * ibdev->num_comp_vectors / > + num_online_nodes() + target->comp_vector) > + % ibdev->num_comp_vectors; > + int cpu_idx = 0; > + > + for_each_online_cpu(cpu) { > + if (cpu_to_node(cpu) != node) > + continue; > + if (ch_start + cpu_idx >= ch_end) > + continue; > + ch = &target->ch[ch_start + cpu_idx]; > + ch->target = target; > + ch->comp_vector = cv_start == cv_end ? 
cv_start : > + cv_start + cpu_idx % (cv_end - cv_start); > + spin_lock_init(&ch->lock); > + INIT_LIST_HEAD(&ch->free_tx); > + ret = srp_new_cm_id(ch); > + if (ret) > + goto err_disconnect; > > - ret = srp_connect_ch(ch); > - if (ret) { > - shost_printk(KERN_ERR, target->scsi_host, > - PFX "Connection failed\n"); > - goto err_free_ib; > + ret = srp_create_ch_ib(ch); > + if (ret) > + goto err_disconnect; > + > + ret = srp_alloc_req_data(ch); > + if (ret) > + goto err_disconnect; > + > + ret = srp_connect_ch(ch, multich); > + if (ret) { > + shost_printk(KERN_ERR, target->scsi_host, > + PFX "Connection %d/%d failed\n", > + ch_start + cpu_idx, > + target->ch_count); > + if (node_idx == 0 && cpu_idx == 0) { > + goto err_disconnect; > + } else { > + srp_free_ch_ib(target, ch); > + srp_free_req_data(target, ch); > + target->ch_count = ch - target->ch; > + break; > + } > + } > + > + multich = true; > + cpu_idx++; > + } > + node_idx++; > } > > + target->scsi_host->nr_hw_queues = target->ch_count; > + > ret = srp_add_target(host, target); > if (ret) > goto err_disconnect; > @@ -3154,11 +3315,13 @@ out: > err_disconnect: > srp_disconnect_target(target); > > -err_free_ib: > - srp_free_ch_ib(ch); > + for (i = 0; i < target->ch_count; i++) { > + ch = &target->ch[i]; > + srp_free_ch_ib(target, ch); > + srp_free_req_data(target, ch); > + } > > -err_free_mem: > - srp_free_req_data(ch); > + kfree(target->ch); > > err: > scsi_host_put(target_host); > diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h > index 0609124..d9660e1 100644 > --- a/drivers/infiniband/ulp/srp/ib_srp.h > +++ b/drivers/infiniband/ulp/srp/ib_srp.h > @@ -84,6 +84,21 @@ enum srp_iu_type { > SRP_IU_RSP, > }; > > +static inline u32 build_srp_tag(u16 ch, u16 req_idx) > +{ > + return ch << 16 | req_idx; > +} > + > +static inline u16 srp_tag_ch(u32 tag) > +{ > + return tag >> 16; > +} > + > +static inline u16 srp_tag_idx(u32 tag) > +{ > + return tag & ((1 << 16) - 1); > +} > + > /* > * 
@mr_page_mask: HCA memory registration page mask. > * @mr_page_size: HCA memory registration page size. > @@ -127,7 +142,7 @@ struct srp_request { > struct srp_direct_buf *indirect_desc; > dma_addr_t indirect_dma_addr; > short nmdesc; > - short index; > + uint32_t tag; > }; > > struct srp_rdma_ch { > @@ -173,8 +188,9 @@ struct srp_target_port { > /* read and written in the hot path */ > spinlock_t lock; > > - struct srp_rdma_ch ch; > /* read only in the hot path */ > + struct srp_rdma_ch *ch; > + u32 ch_count; > u32 lkey; > u32 rkey; > enum srp_target_state state; > -- > 1.8.4.5 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-scsi" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 09/19/14 16:28, Ming Lei wrote: > On Fri, Sep 19, 2014 at 9:00 PM, Bart Van Assche <bvanassche@acm.org> wrote: >> @@ -2643,7 +2754,8 @@ static struct scsi_host_template srp_template = { >> .proc_name = DRV_NAME, >> .slave_configure = srp_slave_configure, >> .info = srp_target_info, >> - .queuecommand = srp_queuecommand, >> + .queuecommand = srp_sq_queuecommand, >> + .mq_queuecommand = srp_mq_queuecommand, > > Another choice is to obtain hctx from request directly, then mq can > reuse the .queuecommand interface too. Hello Ming, Is the hctx information already available in the request data structure? I have found a mq_ctx member but no hctx member. Did I perhaps overlook something? Thanks, Bart. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Sep 19, 2014 at 11:21 PM, Bart Van Assche <bvanassche@acm.org> wrote: > On 09/19/14 16:28, Ming Lei wrote: >> >> On Fri, Sep 19, 2014 at 9:00 PM, Bart Van Assche <bvanassche@acm.org> >> wrote: >>> >>> @@ -2643,7 +2754,8 @@ static struct scsi_host_template srp_template = { >>> .proc_name = DRV_NAME, >>> .slave_configure = srp_slave_configure, >>> .info = srp_target_info, >>> - .queuecommand = srp_queuecommand, >>> + .queuecommand = srp_sq_queuecommand, >>> + .mq_queuecommand = srp_mq_queuecommand, >> >> >> Another choice is to obtain hctx from request directly, then mq can >> reuse the .queuecommand interface too. > > > Hello Ming, > > Is the hctx information already available in the request data structure ? I > have found a mq_ctx member but no hctx member. Did I perhaps overlook > something ? You are right, but the mq_ctx can be mapped to hctx as shown below: ctx = rq->mq_ctx; hctx = q->mq_ops->map_queue(q, ctx->cpu); Thanks, -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 9/19/2014 4:00 PM, Bart Van Assche wrote: > Improve performance by using multiple RDMA/RC channels per SCSI host > for communicating with an SRP target. > Hey Bart, Since you don't seem to negotiate/declare multichannel with the target, did you test this code with some target implementations other than SCST that happen to be out there? Overall, I think this patch would be easier to review if you also provide a list of logical changes (which obviously are introduced in this patch). Patch 7/8 can use some more information of target-channel relations as well. > Signed-off-by: Bart Van Assche <bvanassche@acm.org> > --- > Documentation/ABI/stable/sysfs-driver-ib_srp | 25 +- > drivers/infiniband/ulp/srp/ib_srp.c | 337 ++++++++++++++++++++------- > drivers/infiniband/ulp/srp/ib_srp.h | 20 +- > 3 files changed, 287 insertions(+), 95 deletions(-) > > diff --git a/Documentation/ABI/stable/sysfs-driver-ib_srp b/Documentation/ABI/stable/sysfs-driver-ib_srp > index b9688de..d5a459e 100644 > --- a/Documentation/ABI/stable/sysfs-driver-ib_srp > +++ b/Documentation/ABI/stable/sysfs-driver-ib_srp > @@ -55,12 +55,12 @@ Description: Interface for making ib_srp connect to a new target. > only safe with partial memory descriptor list support enabled > (allow_ext_sg=1). > * comp_vector, a number in the range 0..n-1 specifying the > - MSI-X completion vector. Some HCA's allocate multiple (n) > - MSI-X vectors per HCA port. If the IRQ affinity masks of > - these interrupts have been configured such that each MSI-X > - interrupt is handled by a different CPU then the comp_vector > - parameter can be used to spread the SRP completion workload > - over multiple CPU's. > + MSI-X completion vector of the first RDMA channel. Some > + HCA's allocate multiple (n) MSI-X vectors per HCA port. 
If > + the IRQ affinity masks of these interrupts have been > + configured such that each MSI-X interrupt is handled by a > + different CPU then the comp_vector parameter can be used to > + spread the SRP completion workload over multiple CPU's. Why do you want the first channel vector placement? Why can't you start with obvious 0? > * tl_retry_count, a number in the range 2..7 specifying the > IB RC retry count. > * queue_size, the maximum number of commands that the > @@ -88,6 +88,13 @@ Description: Whether ib_srp is allowed to include a partial memory > descriptor list in an SRP_CMD when communicating with an SRP > target. > > +What: /sys/class/scsi_host/host<n>/ch_count > +Date: November 1, 2014 > +KernelVersion: 3.18 > +Contact: linux-rdma@vger.kernel.org > +Description: Number of RDMA channels used for communication with the SRP > + target. > + > What: /sys/class/scsi_host/host<n>/cmd_sg_entries > Date: May 19, 2011 > KernelVersion: 2.6.39 > @@ -95,6 +102,12 @@ Contact: linux-rdma@vger.kernel.org > Description: Maximum number of data buffer descriptors that may be sent to > the target in a single SRP_CMD request. > > +What: /sys/class/scsi_host/host<n>/comp_vector > +Date: September 2, 2013 > +KernelVersion: 3.11 > +Contact: linux-rdma@vger.kernel.org > +Description: Completion vector used for the first RDMA channel. > + > What: /sys/class/scsi_host/host<n>/dgid > Date: June 17, 2006 > KernelVersion: 2.6.17 > diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c > index 9feeea1..58ca618 100644 > --- a/drivers/infiniband/ulp/srp/ib_srp.c > +++ b/drivers/infiniband/ulp/srp/ib_srp.c > @@ -123,6 +123,16 @@ MODULE_PARM_DESC(dev_loss_tmo, > " if fast_io_fail_tmo has not been set. \"off\" means that" > " this functionality is disabled."); > > +static unsigned ch_count; > +module_param(ch_count, uint, 0444); > +MODULE_PARM_DESC(ch_count, > + "Number of RDMA channels to use for communication with an SRP" > + " target. 
Using more than one channel improves performance" > + " if the HCA supports multiple completion vectors. The" > + " default value is the minimum of four times the number of" > + " online CPU sockets and the number of completion vectors" > + " supported by the HCA."); > + Can you explain the default math? how did you end-up with 4*numa_nodes? wouldn't per-cpu be a better fit? Moreover, while using multiple channels you don't suffice for less requests of less FMRs/FRs. I'm a bit concerned here about scalability of multi-channel. Should we take care of cases where the user will want lots of channels to lots of targets and might run out of resources? > static void srp_add_one(struct ib_device *device); > static void srp_remove_one(struct ib_device *device); > static void srp_recv_completion(struct ib_cq *cq, void *ch_ptr); > @@ -556,17 +566,32 @@ err: > * Note: this function may be called without srp_alloc_iu_bufs() having been > * invoked. Hence the ch->[rt]x_ring checks. > */ > -static void srp_free_ch_ib(struct srp_rdma_ch *ch) > +static void srp_free_ch_ib(struct srp_target_port *target, > + struct srp_rdma_ch *ch) > { > - struct srp_target_port *target = ch->target; > struct srp_device *dev = target->srp_host->srp_dev; > int i; > > + if (!ch->target) > + return; How did this condition pop up here? As I don't feel this is a trivial condition, I would like to see a comment of how can this routine be called twice and why is this peek safe? > + > + /* > + * Avoid that the SCSI error handler tries to use this channel after > + * it has been freed. The SCSI error handler can namely continue > + * trying to perform recovery actions after scsi_remove_host() > + * returned. > + */ > + ch->target = NULL; > + > if (ch->cm_id) { > ib_destroy_cm_id(ch->cm_id); > ch->cm_id = NULL; > } > > + /* If srp_new_cm_id() succeeded but srp_create_ch_ib() not, return. 
*/ > + if (!ch->qp) > + return; > + > if (dev->use_fast_reg) { > if (ch->fr_pool) > srp_destroy_fr_pool(ch->fr_pool); > @@ -647,7 +672,7 @@ static int srp_lookup_path(struct srp_rdma_ch *ch) > return ch->status; > } > > -static int srp_send_req(struct srp_rdma_ch *ch) > +static int srp_send_req(struct srp_rdma_ch *ch, bool multich) > { > struct srp_target_port *target = ch->target; > struct { > @@ -688,6 +713,8 @@ static int srp_send_req(struct srp_rdma_ch *ch) > req->priv.req_it_iu_len = cpu_to_be32(target->max_iu_len); > req->priv.req_buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT | > SRP_BUF_FORMAT_INDIRECT); > + req->priv.req_flags = (multich ? SRP_MULTICHAN_MULTI : > + SRP_MULTICHAN_SINGLE); > /* > * In the published SRP specification (draft rev. 16a), the > * port identifier format is 8 bytes of ID extension followed > @@ -769,27 +796,31 @@ static bool srp_change_conn_state(struct srp_target_port *target, > > static void srp_disconnect_target(struct srp_target_port *target) > { > - struct srp_rdma_ch *ch = &target->ch; > + struct srp_rdma_ch *ch; > + int i; > > if (srp_change_conn_state(target, false)) { > /* XXX should send SRP_I_LOGOUT request */ > > - if (ib_send_cm_dreq(ch->cm_id, NULL, 0)) { > - shost_printk(KERN_DEBUG, target->scsi_host, > - PFX "Sending CM DREQ failed\n"); > + for (i = 0; i < target->ch_count; i++) { > + ch = &target->ch[i]; > + if (ch->cm_id && ib_send_cm_dreq(ch->cm_id, NULL, 0)) { > + shost_printk(KERN_DEBUG, target->scsi_host, > + PFX "Sending CM DREQ failed\n"); > + } > } > } > } > > -static void srp_free_req_data(struct srp_rdma_ch *ch) > +static void srp_free_req_data(struct srp_target_port *target, > + struct srp_rdma_ch *ch) > { > - struct srp_target_port *target = ch->target; > struct srp_device *dev = target->srp_host->srp_dev; > struct ib_device *ibdev = dev->dev; > struct srp_request *req; > int i; > > - if (!ch->req_ring) > + if (!ch->target || !ch->req_ring) > return; > > for (i = 0; i < target->req_ring_size; ++i) { > @@ 
-853,7 +884,7 @@ static int srp_alloc_req_data(struct srp_rdma_ch *ch) > goto out; > > req->indirect_dma_addr = dma_addr; > - req->index = i; > + req->tag = build_srp_tag(ch - target->ch, i); > list_add_tail(&req->list, &ch->free_reqs); > } > ret = 0; > @@ -879,7 +910,8 @@ static void srp_del_scsi_host_attr(struct Scsi_Host *shost) > > static void srp_remove_target(struct srp_target_port *target) > { > - struct srp_rdma_ch *ch = &target->ch; > + struct srp_rdma_ch *ch; > + int i; > > WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED); > > @@ -889,10 +921,18 @@ static void srp_remove_target(struct srp_target_port *target) > scsi_remove_host(target->scsi_host); > srp_stop_rport_timers(target->rport); > srp_disconnect_target(target); > - srp_free_ch_ib(ch); > + for (i = 0; i < target->ch_count; i++) { > + ch = &target->ch[i]; > + srp_free_ch_ib(target, ch); > + } > cancel_work_sync(&target->tl_err_work); > srp_rport_put(target->rport); > - srp_free_req_data(ch); > + for (i = 0; i < target->ch_count; i++) { > + ch = &target->ch[i]; > + srp_free_req_data(target, ch); > + } > + kfree(target->ch); > + target->ch = NULL; > > spin_lock(&target->srp_host->target_lock); > list_del(&target->list); > @@ -918,12 +958,12 @@ static void srp_rport_delete(struct srp_rport *rport) > srp_queue_remove_work(target); > } > > -static int srp_connect_ch(struct srp_rdma_ch *ch) > +static int srp_connect_ch(struct srp_rdma_ch *ch, bool multich) > { > struct srp_target_port *target = ch->target; > int ret; > > - WARN_ON_ONCE(target->connected); > + WARN_ON_ONCE(!multich && target->connected); > > target->qp_in_error = false; > > @@ -933,7 +973,7 @@ static int srp_connect_ch(struct srp_rdma_ch *ch) > > while (1) { > init_completion(&ch->done); > - ret = srp_send_req(ch); > + ret = srp_send_req(ch, multich); > if (ret) > return ret; > ret = wait_for_completion_interruptible(&ch->done); > @@ -1095,10 +1135,10 @@ static void srp_finish_req(struct srp_rdma_ch *ch, struct srp_request *req, > static 
void srp_terminate_io(struct srp_rport *rport) > { > struct srp_target_port *target = rport->lld_data; > - struct srp_rdma_ch *ch = &target->ch; > + struct srp_rdma_ch *ch; > struct Scsi_Host *shost = target->scsi_host; > struct scsi_device *sdev; > - int i; > + int i, j; > > /* > * Invoking srp_terminate_io() while srp_queuecommand() is running > @@ -1107,10 +1147,15 @@ static void srp_terminate_io(struct srp_rport *rport) > shost_for_each_device(sdev, shost) > WARN_ON_ONCE(sdev->request_queue->request_fn_active); > > - for (i = 0; i < target->req_ring_size; ++i) { > - struct srp_request *req = &ch->req_ring[i]; > + for (i = 0; i < target->ch_count; i++) { > + ch = &target->ch[i]; > + > + for (j = 0; j < target->req_ring_size; ++j) { > + struct srp_request *req = &ch->req_ring[j]; > > - srp_finish_req(ch, req, NULL, DID_TRANSPORT_FAILFAST << 16); > + srp_finish_req(ch, req, NULL, > + DID_TRANSPORT_FAILFAST << 16); > + } > } > } > > @@ -1126,8 +1171,9 @@ static void srp_terminate_io(struct srp_rport *rport) > static int srp_rport_reconnect(struct srp_rport *rport) > { > struct srp_target_port *target = rport->lld_data; > - struct srp_rdma_ch *ch = &target->ch; > - int i, ret; > + struct srp_rdma_ch *ch; > + int i, j, ret = 0; > + bool multich = false; > > srp_disconnect_target(target); > > @@ -1139,27 +1185,43 @@ static int srp_rport_reconnect(struct srp_rport *rport) > * case things are really fouled up. Doing so also ensures that all CM > * callbacks will have finished before a new QP is allocated. 
> */ > - ret = srp_new_cm_id(ch); > - > - for (i = 0; i < target->req_ring_size; ++i) { > - struct srp_request *req = &ch->req_ring[i]; > - > - srp_finish_req(ch, req, NULL, DID_RESET << 16); > + for (i = 0; i < target->ch_count; i++) { > + ch = &target->ch[i]; > + if (!ch->target) > + return -ENODEV; > + ret += srp_new_cm_id(ch); > + } > + for (i = 0; i < target->ch_count; i++) { > + ch = &target->ch[i]; > + for (j = 0; j < target->req_ring_size; ++j) { > + struct srp_request *req = &ch->req_ring[j]; > + > + srp_finish_req(ch, req, NULL, DID_RESET << 16); > + } > } > + for (i = 0; i < target->ch_count; i++) { > + ch = &target->ch[i]; > + /* > + * Whether or not creating a new CM ID succeeded, create a new > + * QP. This guarantees that all completion callback function > + * invocations have finished before request resetting starts. > + */ > + ret += srp_create_ch_ib(ch); > > - /* > - * Whether or not creating a new CM ID succeeded, create a new > - * QP. This guarantees that all callback functions for the old QP have > - * finished before any send requests are posted on the new QP. > - */ > - ret += srp_create_ch_ib(ch); > - > - INIT_LIST_HEAD(&ch->free_tx); > - for (i = 0; i < target->queue_size; ++i) > - list_add(&ch->tx_ring[i]->list, &ch->free_tx); > - > - if (ret == 0) > - ret = srp_connect_ch(ch); > + INIT_LIST_HEAD(&ch->free_tx); > + for (j = 0; j < target->queue_size; ++j) > + list_add(&ch->tx_ring[j]->list, &ch->free_tx); > + } > + for (i = 0; i < target->ch_count; i++) { > + ch = &target->ch[i]; > + if (ret) { > + if (i > 1) > + ret = 0; > + break; > + } > + ret = srp_connect_ch(ch, multich); > + multich = true; > + } > > if (ret == 0) > shost_printk(KERN_INFO, target->scsi_host, > @@ -1573,7 +1635,7 @@ static struct srp_iu *__srp_get_tx_iu(struct srp_rdma_ch *ch, > s32 rsv = (iu_type == SRP_IU_TSK_MGMT) ? 
0 : SRP_TSK_MGMT_SQ_SIZE; > struct srp_iu *iu; > > - srp_send_completion(ch->send_cq, target); > + srp_send_completion(ch->send_cq, ch); > > if (list_empty(&ch->free_tx)) > return NULL; > @@ -1637,6 +1699,7 @@ static void srp_process_rsp(struct srp_rdma_ch *ch, struct srp_rsp *rsp) > struct srp_request *req; > struct scsi_cmnd *scmnd; > unsigned long flags; > + unsigned i; > > if (unlikely(rsp->tag & SRP_TAG_TSK_MGMT)) { > spin_lock_irqsave(&ch->lock, flags); > @@ -1648,12 +1711,20 @@ static void srp_process_rsp(struct srp_rdma_ch *ch, struct srp_rsp *rsp) > ch->tsk_mgmt_status = rsp->data[3]; > complete(&ch->tsk_mgmt_done); > } else { > - req = &ch->req_ring[rsp->tag]; > - scmnd = srp_claim_req(ch, req, NULL, NULL); > + if (srp_tag_ch(rsp->tag) != ch - target->ch) > + pr_err("Channel idx mismatch: tag %#llx <> ch %#lx\n", > + rsp->tag, ch - target->ch); > + i = srp_tag_idx(rsp->tag); > + if (i < target->req_ring_size) { > + req = &ch->req_ring[i]; > + scmnd = srp_claim_req(ch, req, NULL, NULL); > + } else { > + scmnd = NULL; > + } > if (!scmnd) { > shost_printk(KERN_ERR, target->scsi_host, > - "Null scmnd for RSP w/tag %016llx\n", > - (unsigned long long) rsp->tag); > + "Null scmnd for RSP w/tag %#016llx received on ch %ld / QP %#x\n", > + rsp->tag, ch - target->ch, ch->qp->qp_num); > > spin_lock_irqsave(&ch->lock, flags); > ch->req_lim += be32_to_cpu(rsp->req_lim_delta); > @@ -1879,7 +1950,8 @@ static void srp_send_completion(struct ib_cq *cq, void *ch_ptr) > } > } > > -static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) > +static int srp_queuecommand(unsigned hwq, struct Scsi_Host *shost, > + struct scsi_cmnd *scmnd) > { > struct srp_target_port *target = host_to_target(shost); > struct srp_rport *rport = target->rport; > @@ -1905,7 +1977,7 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) > if (unlikely(scmnd->result)) > goto err; > > - ch = &target->ch; > + ch = &target->ch[hwq]; > > 
spin_lock_irqsave(&ch->lock, flags); > iu = __srp_get_tx_iu(ch, SRP_IU_CMD); > @@ -1927,7 +1999,7 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) > > cmd->opcode = SRP_CMD; > cmd->lun = cpu_to_be64((u64) scmnd->device->lun << 48); > - cmd->tag = req->index; > + cmd->tag = req->tag; > memcpy(cmd->cdb, scmnd->cmnd, scmnd->cmd_len); > > req->scmnd = scmnd; > @@ -1993,6 +2065,17 @@ err: > goto unlock_rport; > } > > +static int srp_sq_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) > +{ > + return srp_queuecommand(0, shost, scmnd); > +} > + > +static int srp_mq_queuecommand(struct blk_mq_hw_ctx *hctx, > + struct scsi_cmnd *scmnd) > +{ > + return srp_queuecommand(hctx->queue_num, scmnd->device->host, scmnd); > +} > + > /* > * Note: the resources allocated in this function are freed in > * srp_free_ch_ib(). > @@ -2409,15 +2492,23 @@ static int srp_abort(struct scsi_cmnd *scmnd) > { > struct srp_target_port *target = host_to_target(scmnd->device->host); > struct srp_request *req = (struct srp_request *) scmnd->host_scribble; > + u16 ch_idx; > struct srp_rdma_ch *ch; > int ret; > > shost_printk(KERN_ERR, target->scsi_host, "SRP abort called\n"); > > - ch = &target->ch; > - if (!req || !srp_claim_req(ch, req, NULL, scmnd)) > + if (!req) > + return SUCCESS; > + ch_idx = srp_tag_ch(req->tag); > + if (WARN_ON_ONCE(ch_idx >= target->ch_count)) Can you explain how this can happen? And why do you continue as if nothing happened? 
> return SUCCESS; > - if (srp_send_tsk_mgmt(ch, req->index, scmnd->device->lun, > + ch = &target->ch[ch_idx]; > + if (!srp_claim_req(ch, req, NULL, scmnd)) > + return SUCCESS; > + shost_printk(KERN_ERR, target->scsi_host, > + "Sending SRP abort for tag %#x\n", req->tag); > + if (srp_send_tsk_mgmt(ch, req->tag, scmnd->device->lun, > SRP_TSK_ABORT_TASK) == 0) > ret = SUCCESS; > else if (target->rport->state == SRP_RPORT_LOST) > @@ -2434,21 +2525,25 @@ static int srp_reset_device(struct scsi_cmnd *scmnd) > { > struct srp_target_port *target = host_to_target(scmnd->device->host); > - struct srp_rdma_ch *ch = &target->ch; > + struct srp_rdma_ch *ch; > int i; > > shost_printk(KERN_ERR, target->scsi_host, "SRP reset_device called\n"); > > + ch = &target->ch[0]; > if (srp_send_tsk_mgmt(ch, SRP_TAG_NO_REQ, scmnd->device->lun, > SRP_TSK_LUN_RESET)) > return FAILED; > if (ch->tsk_mgmt_status) > return FAILED; > > - for (i = 0; i < target->req_ring_size; ++i) { > - struct srp_request *req = &ch->req_ring[i]; > + for (i = 0; i < target->ch_count; i++) { Just a nit: this channel loop appears several times in the code. Might be nicer to wrap it in a for_each_rdma_ch() macro - not a must though... 
> + ch = &target->ch[i]; > + for (i = 0; i < target->req_ring_size; ++i) { > + struct srp_request *req = &ch->req_ring[i]; > > - srp_finish_req(ch, req, scmnd->device, DID_RESET << 16); > + srp_finish_req(ch, req, scmnd->device, DID_RESET << 16); > + } > } > > return SUCCESS; > @@ -2525,7 +2620,7 @@ static ssize_t show_dgid(struct device *dev, struct device_attribute *attr, > char *buf) > { > struct srp_target_port *target = host_to_target(class_to_shost(dev)); > - struct srp_rdma_ch *ch = &target->ch; > + struct srp_rdma_ch *ch = &target->ch[0]; > > return sprintf(buf, "%pI6\n", ch->path.dgid.raw); > } > @@ -2542,8 +2637,14 @@ static ssize_t show_req_lim(struct device *dev, > struct device_attribute *attr, char *buf) > { > struct srp_target_port *target = host_to_target(class_to_shost(dev)); > + struct srp_rdma_ch *ch; > + int i, req_lim = INT_MAX; > > - return sprintf(buf, "%d\n", target->ch.req_lim); > + for (i = 0; i < target->ch_count; i++) { > + ch = &target->ch[i]; No lock here? > + req_lim = min(req_lim, ch->req_lim); > + } > + return sprintf(buf, "%d\n", req_lim); > } > > static ssize_t show_zero_req_lim(struct device *dev, > @@ -2570,6 +2671,14 @@ static ssize_t show_local_ib_device(struct device *dev, > return sprintf(buf, "%s\n", target->srp_host->srp_dev->dev->name); > } > > +static ssize_t show_ch_count(struct device *dev, struct device_attribute *attr, > + char *buf) > +{ > + struct srp_target_port *target = host_to_target(class_to_shost(dev)); > + > + return sprintf(buf, "%d\n", target->ch_count); > +} > + > static ssize_t show_comp_vector(struct device *dev, > struct device_attribute *attr, char *buf) > { > @@ -2613,6 +2722,7 @@ static DEVICE_ATTR(req_lim, S_IRUGO, show_req_lim, NULL); > static DEVICE_ATTR(zero_req_lim, S_IRUGO, show_zero_req_lim, NULL); > static DEVICE_ATTR(local_ib_port, S_IRUGO, show_local_ib_port, NULL); > static DEVICE_ATTR(local_ib_device, S_IRUGO, show_local_ib_device, NULL); > +static DEVICE_ATTR(ch_count, S_IRUGO, 
show_ch_count, NULL); > static DEVICE_ATTR(comp_vector, S_IRUGO, show_comp_vector, NULL); > static DEVICE_ATTR(tl_retry_count, S_IRUGO, show_tl_retry_count, NULL); > static DEVICE_ATTR(cmd_sg_entries, S_IRUGO, show_cmd_sg_entries, NULL); > @@ -2630,6 +2740,7 @@ static struct device_attribute *srp_host_attrs[] = { > &dev_attr_zero_req_lim, > &dev_attr_local_ib_port, > &dev_attr_local_ib_device, > + &dev_attr_ch_count, > &dev_attr_comp_vector, > &dev_attr_tl_retry_count, > &dev_attr_cmd_sg_entries, > @@ -2643,7 +2754,8 @@ static struct scsi_host_template srp_template = { > .proc_name = DRV_NAME, > .slave_configure = srp_slave_configure, > .info = srp_target_info, > - .queuecommand = srp_queuecommand, > + .queuecommand = srp_sq_queuecommand, > + .mq_queuecommand = srp_mq_queuecommand, > .change_queue_depth = srp_change_queue_depth, > .change_queue_type = srp_change_queue_type, > .eh_abort_handler = srp_abort, > @@ -3038,7 +3150,8 @@ static ssize_t srp_create_target(struct device *dev, > struct srp_rdma_ch *ch; > struct srp_device *srp_dev = host->srp_dev; > struct ib_device *ibdev = srp_dev->dev; > - int ret; > + int ret, node_idx, node, cpu, i; > + bool multich = false; > > target_host = scsi_host_alloc(&srp_template, > sizeof (struct srp_target_port)); > @@ -3098,34 +3211,82 @@ static ssize_t srp_create_target(struct device *dev, > INIT_WORK(&target->tl_err_work, srp_tl_err_work); > INIT_WORK(&target->remove_work, srp_remove_work); > spin_lock_init(&target->lock); > - ch = &target->ch; > - ch->target = target; > - ch->comp_vector = target->comp_vector; > - spin_lock_init(&ch->lock); > - INIT_LIST_HEAD(&ch->free_tx); > - ret = srp_alloc_req_data(ch); > - if (ret) > - goto err_free_mem; > - > ret = ib_query_gid(ibdev, host->port, 0, &target->sgid); > if (ret) > - goto err_free_mem; > + goto err; > > - ret = srp_create_ch_ib(ch); > - if (ret) > - goto err_free_mem; > + ret = -ENOMEM; Any chance you take this non-trivial setup below to a routine and document what you 
are trying to attempt? > + target->ch_count = max_t(unsigned, num_online_nodes(), > + min(ch_count ? : > + min(4 * num_online_nodes(), > + ibdev->num_comp_vectors), > + num_online_cpus())); > + target->ch = kcalloc(target->ch_count, sizeof(*target->ch), > + GFP_KERNEL); > + if (!target->ch) > + goto err; > > - ret = srp_new_cm_id(ch); > - if (ret) > - goto err_free_ib; > + node_idx = 0; > + for_each_online_node(node) { > + const int ch_start = (node_idx * target->ch_count / > + num_online_nodes()); > + const int ch_end = ((node_idx + 1) * target->ch_count / > + num_online_nodes()); > + const int cv_start = (node_idx * ibdev->num_comp_vectors / > + num_online_nodes() + target->comp_vector) > + % ibdev->num_comp_vectors; > + const int cv_end = ((node_idx + 1) * ibdev->num_comp_vectors / > + num_online_nodes() + target->comp_vector) > + % ibdev->num_comp_vectors; > + int cpu_idx = 0; > + > + for_each_online_cpu(cpu) { > + if (cpu_to_node(cpu) != node) > + continue; > + if (ch_start + cpu_idx >= ch_end) > + continue; > + ch = &target->ch[ch_start + cpu_idx]; > + ch->target = target; > + ch->comp_vector = cv_start == cv_end ? 
cv_start : > + cv_start + cpu_idx % (cv_end - cv_start); > + spin_lock_init(&ch->lock); > + INIT_LIST_HEAD(&ch->free_tx); > + ret = srp_new_cm_id(ch); > + if (ret) > + goto err_disconnect; > > - ret = srp_connect_ch(ch); > - if (ret) { > - shost_printk(KERN_ERR, target->scsi_host, > - PFX "Connection failed\n"); > - goto err_free_ib; > + ret = srp_create_ch_ib(ch); > + if (ret) > + goto err_disconnect; > + > + ret = srp_alloc_req_data(ch); > + if (ret) > + goto err_disconnect; > + > + ret = srp_connect_ch(ch, multich); > + if (ret) { > + shost_printk(KERN_ERR, target->scsi_host, > + PFX "Connection %d/%d failed\n", > + ch_start + cpu_idx, > + target->ch_count); > + if (node_idx == 0 && cpu_idx == 0) { > + goto err_disconnect; > + } else { > + srp_free_ch_ib(target, ch); > + srp_free_req_data(target, ch); > + target->ch_count = ch - target->ch; > + break; > + } > + } > + > + multich = true; > + cpu_idx++; > + } > + node_idx++; > } > > + target->scsi_host->nr_hw_queues = target->ch_count; > + > ret = srp_add_target(host, target); > if (ret) > goto err_disconnect; > @@ -3154,11 +3315,13 @@ out: > err_disconnect: > srp_disconnect_target(target); > > -err_free_ib: > - srp_free_ch_ib(ch); > + for (i = 0; i < target->ch_count; i++) { > + ch = &target->ch[i]; > + srp_free_ch_ib(target, ch); > + srp_free_req_data(target, ch); > + } > > -err_free_mem: > - srp_free_req_data(ch); > + kfree(target->ch); > > err: > scsi_host_put(target_host); > diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h > index 0609124..d9660e1 100644 > --- a/drivers/infiniband/ulp/srp/ib_srp.h > +++ b/drivers/infiniband/ulp/srp/ib_srp.h > @@ -84,6 +84,21 @@ enum srp_iu_type { > SRP_IU_RSP, > }; > > +static inline u32 build_srp_tag(u16 ch, u16 req_idx) > +{ > + return ch << 16 | req_idx; > +} > + > +static inline u16 srp_tag_ch(u32 tag) > +{ > + return tag >> 16; > +} > + > +static inline u16 srp_tag_idx(u32 tag) > +{ > + return tag & ((1 << 16) - 1); > +} > + > /* > * 
@mr_page_mask: HCA memory registration page mask. > * @mr_page_size: HCA memory registration page size. > @@ -127,7 +142,7 @@ struct srp_request { > struct srp_direct_buf *indirect_desc; > dma_addr_t indirect_dma_addr; > short nmdesc; > - short index; > + uint32_t tag; > }; > > struct srp_rdma_ch { > @@ -173,8 +188,9 @@ struct srp_target_port { > /* read and written in the hot path */ > spinlock_t lock; > > - struct srp_rdma_ch ch; > /* read only in the hot path */ > + struct srp_rdma_ch *ch; > + u32 ch_count; > u32 lkey; > u32 rkey; > enum srp_target_state state; > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 23/09/2014 10:32, Sagi Grimberg wrote: > On 9/19/2014 4:00 PM, Bart Van Assche wrote: >> Improve performance by using multiple RDMA/RC channels per SCSI host >> for communicating with an SRP target. >> > > Hey Bart, > > Since you don't seem to negotiate/declare multichannel with the target, > did you test this code with some target implementations other than SCST > that happen to be out there? > > Overall, I think this patch would be easier to review if you also > provide a list of logical changes (which obviously are introduced in > this patch). Patch 7/8 can use some more information of target-channel > relations as well. Hello Sagi, That's a good question. So far this patch series has only been tested against the SCST SRP target driver. However, as you probably noticed, if setting up a second or later RDMA channel fails, SRP login is not failed but communication proceeds with the number of channels that have been established. This mechanism should retain backwards compatibility with SRP target systems that do not support multichannel communication. However, if the new code for SRP login turns out to be triggering bugs in existing SRP target implementations we can still add a blacklist for these implementations. I will provide a more detailed list of logical changes in the second version of this patch series. Bart. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 9/23/2014 10:02 PM, Bart Van Assche wrote: > On 23/09/2014 10:32, Sagi Grimberg wrote: >> On 9/19/2014 4:00 PM, Bart Van Assche wrote: >>> Improve performance by using multiple RDMA/RC channels per SCSI host >>> for communicating with an SRP target. >>> >> >> Hey Bart, >> >> Since you don't seem to negotiate/declare multichannel with the target, >> did you test this code with some target implementations other than SCST >> that happen to be out there? >> >> Overall, I think this patch would be easier to review if you also >> provide a list of logical changes (which obviously are introduced in >> this patch). Patch 7/8 can use some more information of target-channel >> relations as well. > > Hello Sagi, > > That's a good question. So far this patch series has only been tested > against the SCST SRP target driver. However, as you probably noticed, if > setting up a second or later RDMA channel fails SRP login is not failed > but communication proceeds with the number of channels that have been > established. This mechanism should retain backwards compatibility with > SRP target systems that do not support multichannel communication. > However, if the new code for SRP login turns out to be triggering bugs > in existing SRP target implementations we can still add a blacklist for > these implementations. I'm more concerned that a target will accept multichannel and then start flipping, since that wasn't tested since I don't know when, probably never... Since SRP_LOGIN_REQ/RESP has some free bits why not declare it and activate it when both sides *say* they support it? I'd be much calmer knowing we're on the safe side on this... > > I will provide a more detailed list of logical changes in the second > version of this patch series. Thanks, Plus, I would like to run it on my performance setups. Can you point me to the SCST repo? Is multichannel supported in SCST trunk? Sagi. 
-- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 24/09/2014 6:22, Sagi Grimberg wrote: > Since SRP_LOGIN_REQ/RESP has some free bits why not declare it and > activate it when both sides *says* they support it? I'd be much calmer > knowing we're on the safe side on this... Hello Sagi, Since more than ten years the SRP protocol is an official ANSI standard. Since multichannel support has been defined in that standard my preference is to follow what has been documented in that standard with regard to multichannel operation. Using one of the free bits in the SRP login request and response would involve a protocol modification. Hence the proposal to add a blacklist for non-conforming target implementations. > Plus, I would like to run it on my performance setups. can you point me > to the SCST repo? is multichannel supported in scst trunk? I think multichannel support was already present in the SCST SRP target driver before I started maintaining that driver. However, last April a few patches were checked in to improve multichannel support in the SCST SRP target driver. These patches have been included in the SCST 3.0 release. Download instructions for SCST (3.0 and trunk) can be found e.g. here: http://scst.sourceforge.net/downloads.html. Bart. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 9/24/2014 4:13 PM, Bart Van Assche wrote: > On 24/09/2014 6:22, Sagi Grimberg wrote: >> Since SRP_LOGIN_REQ/RESP has some free bits why not declare it and >> activate it when both sides *says* they support it? I'd be much calmer >> knowing we're on the safe side on this... > > Hello Sagi, > > Since more than ten years the SRP protocol is an official ANSI standard. > Since multichannel support has been defined in that standard my > preference is to follow what has been documented in that standard with > regard to multichannel operation. Just re-visited the r16a, srp_login request req_flags include MULTI CHANNEL ACTION (Table 10) and srp login response rsp_flags include MULTI-CHANNEL RESULT (Table 12). Did you notice those? Didn't see any reference in the patch... Using one of the free bits in the SRP > login request and response would involve a protocol modification. Hence > the proposal to add a blacklist for non-conforming target implementations. > So I'm not so sure we need to update SRP login sequence... >> Plus, I would like to run it on my performance setups. can you point me >> to the SCST repo? is multichannel supported in scst trunk? > > I think multichannel support was already present in the SCST SRP target > driver before I started maintaining that driver. However, last April a > few patches were checked in to improve multichannel support in the SCST > SRP target driver. These patches have been included in the SCST 3.0 > release. Download instructions for SCST (3.0 and trunk) can be found > e.g. here: http://scst.sourceforge.net/downloads.html. > Thanks, P.S. Would it be possible to break 8/8 into more patches in the next round? it would help make it more review-able? Thanks, Sagi. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 9/24/2014 4:38 PM, Sagi Grimberg wrote: > On 9/24/2014 4:13 PM, Bart Van Assche wrote: >> On 24/09/2014 6:22, Sagi Grimberg wrote: >>> Since SRP_LOGIN_REQ/RESP has some free bits why not declare it and >>> activate it when both sides *says* they support it? I'd be much calmer >>> knowing we're on the safe side on this... >> >> Hello Sagi, >> >> Since more than ten years the SRP protocol is an official ANSI standard. >> Since multichannel support has been defined in that standard my >> preference is to follow what has been documented in that standard with >> regard to multichannel operation. > > Just re-visited the r16a, srp_login request req_flags include MULTI > CHANNEL ACTION (Table 10) and srp login response rsp_flags include > MULTI-CHANNEL RESULT (Table 12). > > Did you notice those? Didn't see any reference in the patch... > > Using one of the free bits in the SRP >> login request and response would involve a protocol modification. Hence >> the proposal to add a blacklist for non-conforming target >> implementations. >> > > So I'm not so sure we need to update SRP login sequence... > Wait, yes you did reference those... OK, I'm on board now... Sagi. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 09/23/14 18:32, Sagi Grimberg wrote: > Since you don't seem to negotiate/declare multichannel with the target, > did you test this code with some target implementations other than SCST > that happen to be out there? (replying to an e-mail of two weeks ago) Hello Sagi, I have just verified that the multichannel code in this patch series works fine in combination with the upstream SRP target driver. Bart. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 10/7/2014 3:51 PM, Bart Van Assche wrote: > On 09/23/14 18:32, Sagi Grimberg wrote: >> Since you don't seem to negotiate/declare multichannel with the target, >> did you test this code with some target implementations other than SCST >> that happen to be out there? > > (replying to an e-mail of two weeks ago) > > Hello Sagi, > > I have just verified that the multichannel code in this patch series > works fine in combination with the upstream SRP target driver. > Working as in single channel mode? or multichannel mode? Sagi. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 10/13/14 10:17, Sagi Grimberg wrote: > On 10/7/2014 3:51 PM, Bart Van Assche wrote: >> On 09/23/14 18:32, Sagi Grimberg wrote: >>> Since you don't seem to negotiate/declare multichannel with the target, >>> did you test this code with some target implementations other than SCST >>> that happen to be out there? >> >> (replying to an e-mail of two weeks ago) >> >> I have just verified that the multichannel code in this patch series >> works fine in combination with the upstream SRP target driver. > > Working as in single channel mode? or multichannel mode? Hello Sagi, In my e-mail I was referring to multichannel mode. Bart. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/Documentation/ABI/stable/sysfs-driver-ib_srp b/Documentation/ABI/stable/sysfs-driver-ib_srp index b9688de..d5a459e 100644 --- a/Documentation/ABI/stable/sysfs-driver-ib_srp +++ b/Documentation/ABI/stable/sysfs-driver-ib_srp @@ -55,12 +55,12 @@ Description: Interface for making ib_srp connect to a new target. only safe with partial memory descriptor list support enabled (allow_ext_sg=1). * comp_vector, a number in the range 0..n-1 specifying the - MSI-X completion vector. Some HCA's allocate multiple (n) - MSI-X vectors per HCA port. If the IRQ affinity masks of - these interrupts have been configured such that each MSI-X - interrupt is handled by a different CPU then the comp_vector - parameter can be used to spread the SRP completion workload - over multiple CPU's. + MSI-X completion vector of the first RDMA channel. Some + HCA's allocate multiple (n) MSI-X vectors per HCA port. If + the IRQ affinity masks of these interrupts have been + configured such that each MSI-X interrupt is handled by a + different CPU then the comp_vector parameter can be used to + spread the SRP completion workload over multiple CPU's. * tl_retry_count, a number in the range 2..7 specifying the IB RC retry count. * queue_size, the maximum number of commands that the @@ -88,6 +88,13 @@ Description: Whether ib_srp is allowed to include a partial memory descriptor list in an SRP_CMD when communicating with an SRP target. +What: /sys/class/scsi_host/host<n>/ch_count +Date: November 1, 2014 +KernelVersion: 3.18 +Contact: linux-rdma@vger.kernel.org +Description: Number of RDMA channels used for communication with the SRP + target. + What: /sys/class/scsi_host/host<n>/cmd_sg_entries Date: May 19, 2011 KernelVersion: 2.6.39 @@ -95,6 +102,12 @@ Contact: linux-rdma@vger.kernel.org Description: Maximum number of data buffer descriptors that may be sent to the target in a single SRP_CMD request. 
+What: /sys/class/scsi_host/host<n>/comp_vector +Date: September 2, 2013 +KernelVersion: 3.11 +Contact: linux-rdma@vger.kernel.org +Description: Completion vector used for the first RDMA channel. + What: /sys/class/scsi_host/host<n>/dgid Date: June 17, 2006 KernelVersion: 2.6.17 diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 9feeea1..58ca618 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -123,6 +123,16 @@ MODULE_PARM_DESC(dev_loss_tmo, " if fast_io_fail_tmo has not been set. \"off\" means that" " this functionality is disabled."); +static unsigned ch_count; +module_param(ch_count, uint, 0444); +MODULE_PARM_DESC(ch_count, + "Number of RDMA channels to use for communication with an SRP" + " target. Using more than one channel improves performance" + " if the HCA supports multiple completion vectors. The" + " default value is the minimum of four times the number of" + " online CPU sockets and the number of completion vectors" + " supported by the HCA."); + static void srp_add_one(struct ib_device *device); static void srp_remove_one(struct ib_device *device); static void srp_recv_completion(struct ib_cq *cq, void *ch_ptr); @@ -556,17 +566,32 @@ err: * Note: this function may be called without srp_alloc_iu_bufs() having been * invoked. Hence the ch->[rt]x_ring checks. */ -static void srp_free_ch_ib(struct srp_rdma_ch *ch) +static void srp_free_ch_ib(struct srp_target_port *target, + struct srp_rdma_ch *ch) { - struct srp_target_port *target = ch->target; struct srp_device *dev = target->srp_host->srp_dev; int i; + if (!ch->target) + return; + + /* + * Avoid that the SCSI error handler tries to use this channel after + * it has been freed. The SCSI error handler can namely continue + * trying to perform recovery actions after scsi_remove_host() + * returned. 
+ */ + ch->target = NULL; + if (ch->cm_id) { ib_destroy_cm_id(ch->cm_id); ch->cm_id = NULL; } + /* If srp_new_cm_id() succeeded but srp_create_ch_ib() not, return. */ + if (!ch->qp) + return; + if (dev->use_fast_reg) { if (ch->fr_pool) srp_destroy_fr_pool(ch->fr_pool); @@ -647,7 +672,7 @@ static int srp_lookup_path(struct srp_rdma_ch *ch) return ch->status; } -static int srp_send_req(struct srp_rdma_ch *ch) +static int srp_send_req(struct srp_rdma_ch *ch, bool multich) { struct srp_target_port *target = ch->target; struct { @@ -688,6 +713,8 @@ static int srp_send_req(struct srp_rdma_ch *ch) req->priv.req_it_iu_len = cpu_to_be32(target->max_iu_len); req->priv.req_buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT | SRP_BUF_FORMAT_INDIRECT); + req->priv.req_flags = (multich ? SRP_MULTICHAN_MULTI : + SRP_MULTICHAN_SINGLE); /* * In the published SRP specification (draft rev. 16a), the * port identifier format is 8 bytes of ID extension followed @@ -769,27 +796,31 @@ static bool srp_change_conn_state(struct srp_target_port *target, static void srp_disconnect_target(struct srp_target_port *target) { - struct srp_rdma_ch *ch = &target->ch; + struct srp_rdma_ch *ch; + int i; if (srp_change_conn_state(target, false)) { /* XXX should send SRP_I_LOGOUT request */ - if (ib_send_cm_dreq(ch->cm_id, NULL, 0)) { - shost_printk(KERN_DEBUG, target->scsi_host, - PFX "Sending CM DREQ failed\n"); + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + if (ch->cm_id && ib_send_cm_dreq(ch->cm_id, NULL, 0)) { + shost_printk(KERN_DEBUG, target->scsi_host, + PFX "Sending CM DREQ failed\n"); + } } } } -static void srp_free_req_data(struct srp_rdma_ch *ch) +static void srp_free_req_data(struct srp_target_port *target, + struct srp_rdma_ch *ch) { - struct srp_target_port *target = ch->target; struct srp_device *dev = target->srp_host->srp_dev; struct ib_device *ibdev = dev->dev; struct srp_request *req; int i; - if (!ch->req_ring) + if (!ch->target || !ch->req_ring) return; for (i = 0; i 
< target->req_ring_size; ++i) { @@ -853,7 +884,7 @@ static int srp_alloc_req_data(struct srp_rdma_ch *ch) goto out; req->indirect_dma_addr = dma_addr; - req->index = i; + req->tag = build_srp_tag(ch - target->ch, i); list_add_tail(&req->list, &ch->free_reqs); } ret = 0; @@ -879,7 +910,8 @@ static void srp_del_scsi_host_attr(struct Scsi_Host *shost) static void srp_remove_target(struct srp_target_port *target) { - struct srp_rdma_ch *ch = &target->ch; + struct srp_rdma_ch *ch; + int i; WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED); @@ -889,10 +921,18 @@ static void srp_remove_target(struct srp_target_port *target) scsi_remove_host(target->scsi_host); srp_stop_rport_timers(target->rport); srp_disconnect_target(target); - srp_free_ch_ib(ch); + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + srp_free_ch_ib(target, ch); + } cancel_work_sync(&target->tl_err_work); srp_rport_put(target->rport); - srp_free_req_data(ch); + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + srp_free_req_data(target, ch); + } + kfree(target->ch); + target->ch = NULL; spin_lock(&target->srp_host->target_lock); list_del(&target->list); @@ -918,12 +958,12 @@ static void srp_rport_delete(struct srp_rport *rport) srp_queue_remove_work(target); } -static int srp_connect_ch(struct srp_rdma_ch *ch) +static int srp_connect_ch(struct srp_rdma_ch *ch, bool multich) { struct srp_target_port *target = ch->target; int ret; - WARN_ON_ONCE(target->connected); + WARN_ON_ONCE(!multich && target->connected); target->qp_in_error = false; @@ -933,7 +973,7 @@ static int srp_connect_ch(struct srp_rdma_ch *ch) while (1) { init_completion(&ch->done); - ret = srp_send_req(ch); + ret = srp_send_req(ch, multich); if (ret) return ret; ret = wait_for_completion_interruptible(&ch->done); @@ -1095,10 +1135,10 @@ static void srp_finish_req(struct srp_rdma_ch *ch, struct srp_request *req, static void srp_terminate_io(struct srp_rport *rport) { struct srp_target_port *target = 
rport->lld_data; - struct srp_rdma_ch *ch = &target->ch; + struct srp_rdma_ch *ch; struct Scsi_Host *shost = target->scsi_host; struct scsi_device *sdev; - int i; + int i, j; /* * Invoking srp_terminate_io() while srp_queuecommand() is running @@ -1107,10 +1147,15 @@ static void srp_terminate_io(struct srp_rport *rport) shost_for_each_device(sdev, shost) WARN_ON_ONCE(sdev->request_queue->request_fn_active); - for (i = 0; i < target->req_ring_size; ++i) { - struct srp_request *req = &ch->req_ring[i]; + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + + for (j = 0; j < target->req_ring_size; ++j) { + struct srp_request *req = &ch->req_ring[j]; - srp_finish_req(ch, req, NULL, DID_TRANSPORT_FAILFAST << 16); + srp_finish_req(ch, req, NULL, + DID_TRANSPORT_FAILFAST << 16); + } } } @@ -1126,8 +1171,9 @@ static void srp_terminate_io(struct srp_rport *rport) static int srp_rport_reconnect(struct srp_rport *rport) { struct srp_target_port *target = rport->lld_data; - struct srp_rdma_ch *ch = &target->ch; - int i, ret; + struct srp_rdma_ch *ch; + int i, j, ret = 0; + bool multich = false; srp_disconnect_target(target); @@ -1139,27 +1185,43 @@ static int srp_rport_reconnect(struct srp_rport *rport) * case things are really fouled up. Doing so also ensures that all CM * callbacks will have finished before a new QP is allocated. 
*/ - ret = srp_new_cm_id(ch); - - for (i = 0; i < target->req_ring_size; ++i) { - struct srp_request *req = &ch->req_ring[i]; - - srp_finish_req(ch, req, NULL, DID_RESET << 16); + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + if (!ch->target) + return -ENODEV; + ret += srp_new_cm_id(ch); + } + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + for (j = 0; j < target->req_ring_size; ++j) { + struct srp_request *req = &ch->req_ring[j]; + + srp_finish_req(ch, req, NULL, DID_RESET << 16); + } } + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + /* + * Whether or not creating a new CM ID succeeded, create a new + * QP. This guarantees that all completion callback function + * invocations have finished before request resetting starts. + */ + ret += srp_create_ch_ib(ch); - /* - * Whether or not creating a new CM ID succeeded, create a new - * QP. This guarantees that all callback functions for the old QP have - * finished before any send requests are posted on the new QP. - */ - ret += srp_create_ch_ib(ch); - - INIT_LIST_HEAD(&ch->free_tx); - for (i = 0; i < target->queue_size; ++i) - list_add(&ch->tx_ring[i]->list, &ch->free_tx); - - if (ret == 0) - ret = srp_connect_ch(ch); + INIT_LIST_HEAD(&ch->free_tx); + for (j = 0; j < target->queue_size; ++j) + list_add(&ch->tx_ring[j]->list, &ch->free_tx); + } + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + if (ret) { + if (i > 1) + ret = 0; + break; + } + ret = srp_connect_ch(ch, multich); + multich = true; + } if (ret == 0) shost_printk(KERN_INFO, target->scsi_host, @@ -1573,7 +1635,7 @@ static struct srp_iu *__srp_get_tx_iu(struct srp_rdma_ch *ch, s32 rsv = (iu_type == SRP_IU_TSK_MGMT) ? 
0 : SRP_TSK_MGMT_SQ_SIZE; struct srp_iu *iu; - srp_send_completion(ch->send_cq, target); + srp_send_completion(ch->send_cq, ch); if (list_empty(&ch->free_tx)) return NULL; @@ -1637,6 +1699,7 @@ static void srp_process_rsp(struct srp_rdma_ch *ch, struct srp_rsp *rsp) struct srp_request *req; struct scsi_cmnd *scmnd; unsigned long flags; + unsigned i; if (unlikely(rsp->tag & SRP_TAG_TSK_MGMT)) { spin_lock_irqsave(&ch->lock, flags); @@ -1648,12 +1711,20 @@ static void srp_process_rsp(struct srp_rdma_ch *ch, struct srp_rsp *rsp) ch->tsk_mgmt_status = rsp->data[3]; complete(&ch->tsk_mgmt_done); } else { - req = &ch->req_ring[rsp->tag]; - scmnd = srp_claim_req(ch, req, NULL, NULL); + if (srp_tag_ch(rsp->tag) != ch - target->ch) + pr_err("Channel idx mismatch: tag %#llx <> ch %#lx\n", + rsp->tag, ch - target->ch); + i = srp_tag_idx(rsp->tag); + if (i < target->req_ring_size) { + req = &ch->req_ring[i]; + scmnd = srp_claim_req(ch, req, NULL, NULL); + } else { + scmnd = NULL; + } if (!scmnd) { shost_printk(KERN_ERR, target->scsi_host, - "Null scmnd for RSP w/tag %016llx\n", - (unsigned long long) rsp->tag); + "Null scmnd for RSP w/tag %#016llx received on ch %ld / QP %#x\n", + rsp->tag, ch - target->ch, ch->qp->qp_num); spin_lock_irqsave(&ch->lock, flags); ch->req_lim += be32_to_cpu(rsp->req_lim_delta); @@ -1879,7 +1950,8 @@ static void srp_send_completion(struct ib_cq *cq, void *ch_ptr) } } -static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) +static int srp_queuecommand(unsigned hwq, struct Scsi_Host *shost, + struct scsi_cmnd *scmnd) { struct srp_target_port *target = host_to_target(shost); struct srp_rport *rport = target->rport; @@ -1905,7 +1977,7 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) if (unlikely(scmnd->result)) goto err; - ch = &target->ch; + ch = &target->ch[hwq]; spin_lock_irqsave(&ch->lock, flags); iu = __srp_get_tx_iu(ch, SRP_IU_CMD); @@ -1927,7 +1999,7 @@ static int srp_queuecommand(struct 
Scsi_Host *shost, struct scsi_cmnd *scmnd) cmd->opcode = SRP_CMD; cmd->lun = cpu_to_be64((u64) scmnd->device->lun << 48); - cmd->tag = req->index; + cmd->tag = req->tag; memcpy(cmd->cdb, scmnd->cmnd, scmnd->cmd_len); req->scmnd = scmnd; @@ -1993,6 +2065,17 @@ err: goto unlock_rport; } +static int srp_sq_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) +{ + return srp_queuecommand(0, shost, scmnd); +} + +static int srp_mq_queuecommand(struct blk_mq_hw_ctx *hctx, + struct scsi_cmnd *scmnd) +{ + return srp_queuecommand(hctx->queue_num, scmnd->device->host, scmnd); +} + /* * Note: the resources allocated in this function are freed in * srp_free_ch_ib(). @@ -2409,15 +2492,23 @@ static int srp_abort(struct scsi_cmnd *scmnd) { struct srp_target_port *target = host_to_target(scmnd->device->host); struct srp_request *req = (struct srp_request *) scmnd->host_scribble; + u16 ch_idx; struct srp_rdma_ch *ch; int ret; shost_printk(KERN_ERR, target->scsi_host, "SRP abort called\n"); - ch = &target->ch; - if (!req || !srp_claim_req(ch, req, NULL, scmnd)) + if (!req) + return SUCCESS; + ch_idx = srp_tag_ch(req->tag); + if (WARN_ON_ONCE(ch_idx >= target->ch_count)) return SUCCESS; - if (srp_send_tsk_mgmt(ch, req->index, scmnd->device->lun, + ch = &target->ch[ch_idx]; + if (!srp_claim_req(ch, req, NULL, scmnd)) + return SUCCESS; + shost_printk(KERN_ERR, target->scsi_host, + "Sending SRP abort for tag %#x\n", req->tag); + if (srp_send_tsk_mgmt(ch, req->tag, scmnd->device->lun, SRP_TSK_ABORT_TASK) == 0) ret = SUCCESS; else if (target->rport->state == SRP_RPORT_LOST) @@ -2434,21 +2525,25 @@ static int srp_abort(struct scsi_cmnd *scmnd) static int srp_reset_device(struct scsi_cmnd *scmnd) { struct srp_target_port *target = host_to_target(scmnd->device->host); - struct srp_rdma_ch *ch = &target->ch; + struct srp_rdma_ch *ch; int i; shost_printk(KERN_ERR, target->scsi_host, "SRP reset_device called\n"); + ch = &target->ch[0]; if (srp_send_tsk_mgmt(ch, SRP_TAG_NO_REQ, 
scmnd->device->lun, SRP_TSK_LUN_RESET)) return FAILED; if (ch->tsk_mgmt_status) return FAILED; - for (i = 0; i < target->req_ring_size; ++i) { - struct srp_request *req = &ch->req_ring[i]; + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + for (i = 0; i < target->req_ring_size; ++i) { + struct srp_request *req = &ch->req_ring[i]; - srp_finish_req(ch, req, scmnd->device, DID_RESET << 16); + srp_finish_req(ch, req, scmnd->device, DID_RESET << 16); + } } return SUCCESS; @@ -2525,7 +2620,7 @@ static ssize_t show_dgid(struct device *dev, struct device_attribute *attr, char *buf) { struct srp_target_port *target = host_to_target(class_to_shost(dev)); - struct srp_rdma_ch *ch = &target->ch; + struct srp_rdma_ch *ch = &target->ch[0]; return sprintf(buf, "%pI6\n", ch->path.dgid.raw); } @@ -2542,8 +2637,14 @@ static ssize_t show_req_lim(struct device *dev, struct device_attribute *attr, char *buf) { struct srp_target_port *target = host_to_target(class_to_shost(dev)); + struct srp_rdma_ch *ch; + int i, req_lim = INT_MAX; - return sprintf(buf, "%d\n", target->ch.req_lim); + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + req_lim = min(req_lim, ch->req_lim); + } + return sprintf(buf, "%d\n", req_lim); } static ssize_t show_zero_req_lim(struct device *dev, @@ -2570,6 +2671,14 @@ static ssize_t show_local_ib_device(struct device *dev, return sprintf(buf, "%s\n", target->srp_host->srp_dev->dev->name); } +static ssize_t show_ch_count(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%d\n", target->ch_count); +} + static ssize_t show_comp_vector(struct device *dev, struct device_attribute *attr, char *buf) { @@ -2613,6 +2722,7 @@ static DEVICE_ATTR(req_lim, S_IRUGO, show_req_lim, NULL); static DEVICE_ATTR(zero_req_lim, S_IRUGO, show_zero_req_lim, NULL); static DEVICE_ATTR(local_ib_port, S_IRUGO, show_local_ib_port, NULL); static 
DEVICE_ATTR(local_ib_device, S_IRUGO, show_local_ib_device, NULL); +static DEVICE_ATTR(ch_count, S_IRUGO, show_ch_count, NULL); static DEVICE_ATTR(comp_vector, S_IRUGO, show_comp_vector, NULL); static DEVICE_ATTR(tl_retry_count, S_IRUGO, show_tl_retry_count, NULL); static DEVICE_ATTR(cmd_sg_entries, S_IRUGO, show_cmd_sg_entries, NULL); @@ -2630,6 +2740,7 @@ static struct device_attribute *srp_host_attrs[] = { &dev_attr_zero_req_lim, &dev_attr_local_ib_port, &dev_attr_local_ib_device, + &dev_attr_ch_count, &dev_attr_comp_vector, &dev_attr_tl_retry_count, &dev_attr_cmd_sg_entries, @@ -2643,7 +2754,8 @@ static struct scsi_host_template srp_template = { .proc_name = DRV_NAME, .slave_configure = srp_slave_configure, .info = srp_target_info, - .queuecommand = srp_queuecommand, + .queuecommand = srp_sq_queuecommand, + .mq_queuecommand = srp_mq_queuecommand, .change_queue_depth = srp_change_queue_depth, .change_queue_type = srp_change_queue_type, .eh_abort_handler = srp_abort, @@ -3038,7 +3150,8 @@ static ssize_t srp_create_target(struct device *dev, struct srp_rdma_ch *ch; struct srp_device *srp_dev = host->srp_dev; struct ib_device *ibdev = srp_dev->dev; - int ret; + int ret, node_idx, node, cpu, i; + bool multich = false; target_host = scsi_host_alloc(&srp_template, sizeof (struct srp_target_port)); @@ -3098,34 +3211,82 @@ static ssize_t srp_create_target(struct device *dev, INIT_WORK(&target->tl_err_work, srp_tl_err_work); INIT_WORK(&target->remove_work, srp_remove_work); spin_lock_init(&target->lock); - ch = &target->ch; - ch->target = target; - ch->comp_vector = target->comp_vector; - spin_lock_init(&ch->lock); - INIT_LIST_HEAD(&ch->free_tx); - ret = srp_alloc_req_data(ch); - if (ret) - goto err_free_mem; - ret = ib_query_gid(ibdev, host->port, 0, &target->sgid); if (ret) - goto err_free_mem; + goto err; - ret = srp_create_ch_ib(ch); - if (ret) - goto err_free_mem; + ret = -ENOMEM; + target->ch_count = max_t(unsigned, num_online_nodes(), + min(ch_count ? 
: + min(4 * num_online_nodes(), + ibdev->num_comp_vectors), + num_online_cpus())); + target->ch = kcalloc(target->ch_count, sizeof(*target->ch), + GFP_KERNEL); + if (!target->ch) + goto err; - ret = srp_new_cm_id(ch); - if (ret) - goto err_free_ib; + node_idx = 0; + for_each_online_node(node) { + const int ch_start = (node_idx * target->ch_count / + num_online_nodes()); + const int ch_end = ((node_idx + 1) * target->ch_count / + num_online_nodes()); + const int cv_start = (node_idx * ibdev->num_comp_vectors / + num_online_nodes() + target->comp_vector) + % ibdev->num_comp_vectors; + const int cv_end = ((node_idx + 1) * ibdev->num_comp_vectors / + num_online_nodes() + target->comp_vector) + % ibdev->num_comp_vectors; + int cpu_idx = 0; + + for_each_online_cpu(cpu) { + if (cpu_to_node(cpu) != node) + continue; + if (ch_start + cpu_idx >= ch_end) + continue; + ch = &target->ch[ch_start + cpu_idx]; + ch->target = target; + ch->comp_vector = cv_start == cv_end ? cv_start : + cv_start + cpu_idx % (cv_end - cv_start); + spin_lock_init(&ch->lock); + INIT_LIST_HEAD(&ch->free_tx); + ret = srp_new_cm_id(ch); + if (ret) + goto err_disconnect; - ret = srp_connect_ch(ch); - if (ret) { - shost_printk(KERN_ERR, target->scsi_host, - PFX "Connection failed\n"); - goto err_free_ib; + ret = srp_create_ch_ib(ch); + if (ret) + goto err_disconnect; + + ret = srp_alloc_req_data(ch); + if (ret) + goto err_disconnect; + + ret = srp_connect_ch(ch, multich); + if (ret) { + shost_printk(KERN_ERR, target->scsi_host, + PFX "Connection %d/%d failed\n", + ch_start + cpu_idx, + target->ch_count); + if (node_idx == 0 && cpu_idx == 0) { + goto err_disconnect; + } else { + srp_free_ch_ib(target, ch); + srp_free_req_data(target, ch); + target->ch_count = ch - target->ch; + break; + } + } + + multich = true; + cpu_idx++; + } + node_idx++; } + target->scsi_host->nr_hw_queues = target->ch_count; + ret = srp_add_target(host, target); if (ret) goto err_disconnect; @@ -3154,11 +3315,13 @@ out: 
err_disconnect: srp_disconnect_target(target); -err_free_ib: - srp_free_ch_ib(ch); + for (i = 0; i < target->ch_count; i++) { + ch = &target->ch[i]; + srp_free_ch_ib(target, ch); + srp_free_req_data(target, ch); + } -err_free_mem: - srp_free_req_data(ch); + kfree(target->ch); err: scsi_host_put(target_host); diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 0609124..d9660e1 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -84,6 +84,21 @@ enum srp_iu_type { SRP_IU_RSP, }; +static inline u32 build_srp_tag(u16 ch, u16 req_idx) +{ + return ch << 16 | req_idx; +} + +static inline u16 srp_tag_ch(u32 tag) +{ + return tag >> 16; +} + +static inline u16 srp_tag_idx(u32 tag) +{ + return tag & ((1 << 16) - 1); +} + /* * @mr_page_mask: HCA memory registration page mask. * @mr_page_size: HCA memory registration page size. @@ -127,7 +142,7 @@ struct srp_request { struct srp_direct_buf *indirect_desc; dma_addr_t indirect_dma_addr; short nmdesc; - short index; + uint32_t tag; }; struct srp_rdma_ch { @@ -173,8 +188,9 @@ struct srp_target_port { /* read and written in the hot path */ spinlock_t lock; - struct srp_rdma_ch ch; /* read only in the hot path */ + struct srp_rdma_ch *ch; + u32 ch_count; u32 lkey; u32 rkey; enum srp_target_state state;
Improve performance by using multiple RDMA/RC channels per SCSI host for communicating with an SRP target. Signed-off-by: Bart Van Assche <bvanassche@acm.org> --- Documentation/ABI/stable/sysfs-driver-ib_srp | 25 +- drivers/infiniband/ulp/srp/ib_srp.c | 337 ++++++++++++++++++++------- drivers/infiniband/ulp/srp/ib_srp.h | 20 +- 3 files changed, 287 insertions(+), 95 deletions(-)