Message ID | 1389714323-20130-10-git-send-email-eli@mellanox.com (mailing list archive) |
---|---|
State | Accepted, archived |
Hi, Le mardi 14 janvier 2014 à 17:45 +0200, Eli Cohen a écrit : > Implement resize CQ which is a mandatory verb in mlx5. > > Signed-off-by: Eli Cohen <eli@mellanox.com> > > Conflicts: > include/linux/mlx5/device.h > --- > drivers/infiniband/hw/mlx5/cq.c | 282 +++++++++++++++++++++++++-- > drivers/infiniband/hw/mlx5/mlx5_ib.h | 3 +- > drivers/infiniband/hw/mlx5/user.h | 3 + > drivers/net/ethernet/mellanox/mlx5/core/cq.c | 4 +- > include/linux/mlx5/cq.h | 12 +- > include/linux/mlx5/device.h | 2 + > 6 files changed, 284 insertions(+), 22 deletions(-) > > diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c > index b4c122e..50b03a8 100644 > --- a/drivers/infiniband/hw/mlx5/cq.c > +++ b/drivers/infiniband/hw/mlx5/cq.c > @@ -73,14 +73,24 @@ static void *get_cqe(struct mlx5_ib_cq *cq, int n) > return get_cqe_from_buf(&cq->buf, n, cq->mcq.cqe_sz); > } > > +static u8 sw_ownership_bit(int n, int nent) > +{ > + return (n & nent) ? 1 : 0; > +} > + > static void *get_sw_cqe(struct mlx5_ib_cq *cq, int n) > { > void *cqe = get_cqe(cq, n & cq->ibcq.cqe); > struct mlx5_cqe64 *cqe64; > > cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; > - return ((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ > - !!(n & (cq->ibcq.cqe + 1))) ? NULL : cqe; > + > + if (likely((cqe64->op_own) >> 4 != MLX5_CQE_INVALID) && > + !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ibcq.cqe + 1)))) { > + return cqe; > + } else { > + return NULL; > + } > } > > static void *next_cqe_sw(struct mlx5_ib_cq *cq) > @@ -351,6 +361,11 @@ static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64, > qp->sq.last_poll = tail; > } > > +static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf) > +{ > + mlx5_buf_free(&dev->mdev, &buf->buf); > +} > + > static int mlx5_poll_one(struct mlx5_ib_cq *cq, > struct mlx5_ib_qp **cur_qp, > struct ib_wc *wc) > @@ -366,6 +381,7 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq, > void *cqe; > int idx; > > +repoll: > cqe = next_cqe_sw(cq); > if (!cqe) > return -EAGAIN; > @@ -379,7 +395,18 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq, > */ > rmb(); > > - /* TBD: resize CQ */ > + opcode = cqe64->op_own >> 4; > + if (unlikely(opcode == MLX5_CQE_RESIZE_CQ)) { > + if (likely(cq->resize_buf)) { > + free_cq_buf(dev, &cq->buf); > + cq->buf = *cq->resize_buf; > + kfree(cq->resize_buf); > + cq->resize_buf = NULL; > + goto repoll; > + } else { > + mlx5_ib_warn(dev, "unexpected resize cqe\n"); > + } > + } > > qpn = ntohl(cqe64->sop_drop_qpn) & 0xffffff; > if (!*cur_qp || (qpn != (*cur_qp)->ibqp.qp_num)) { > @@ -398,7 +425,6 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq, > } > > wc->qp = &(*cur_qp)->ibqp; > - opcode = cqe64->op_own >> 4; > switch (opcode) { > case MLX5_CQE_REQ: > wq = &(*cur_qp)->sq; > @@ -503,15 +529,11 @@ static int alloc_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf, > return err; > > buf->cqe_size = cqe_size; > + buf->nent = nent; > > return 0; > } > > -static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf) > -{ > - mlx5_buf_free(&dev->mdev, &buf->buf); > -} > - > static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, > struct ib_ucontext *context, struct mlx5_ib_cq *cq, > int entries, struct mlx5_create_cq_mbox_in **cqb, > @@ -576,16 +598,16 @@ static void destroy_cq_user(struct mlx5_ib_cq *cq, struct ib_ucontext *context) > ib_umem_release(cq->buf.umem); > } > > -static void init_cq_buf(struct mlx5_ib_cq *cq, int nent) > +static void init_cq_buf(struct mlx5_ib_cq *cq, struct 
mlx5_ib_cq_buf *buf) > { > int i; > void *cqe; > struct mlx5_cqe64 *cqe64; > > - for (i = 0; i < nent; i++) { > - cqe = get_cqe(cq, i); > - cqe64 = (cq->buf.cqe_size == 64) ? cqe : cqe + 64; > - cqe64->op_own = 0xf1; > + for (i = 0; i < buf->nent; i++) { > + cqe = get_cqe_from_buf(buf, i, buf->cqe_size); > + cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64; > + cqe64->op_own = MLX5_CQE_INVALID << 4; > } > } > > @@ -610,7 +632,7 @@ static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, > if (err) > goto err_db; > > - init_cq_buf(cq, entries); > + init_cq_buf(cq, &cq->buf); > > *inlen = sizeof(**cqb) + sizeof(*(*cqb)->pas) * cq->buf.buf.npages; > *cqb = mlx5_vzalloc(*inlen); > @@ -836,7 +858,7 @@ int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) > in->ctx.cq_period = cpu_to_be16(cq_period); > in->ctx.cq_max_count = cpu_to_be16(cq_count); > in->field_select = cpu_to_be32(fsel); > - err = mlx5_core_modify_cq(&dev->mdev, &mcq->mcq, in); > + err = mlx5_core_modify_cq(&dev->mdev, &mcq->mcq, in, sizeof(*in)); > kfree(in); > > if (err) > @@ -845,9 +867,235 @@ int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) > return err; > } > > +static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, > + int entries, struct ib_udata *udata, int *npas, > + int *page_shift, int *cqe_size) > +{ > + struct mlx5_ib_resize_cq ucmd; > + struct ib_umem *umem; > + int err; > + int npages; > + struct ib_ucontext *context = cq->buf.umem->context; > + > + if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) > + return -EFAULT; > + You might also write err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); if (err) return err; Then you should check reserved fields being set to the default value: As noted by Daniel Vetter in his article "Botching up ioctls"[1] "Check *all* unused fields and flags and all the padding for whether it's 0, and reject the ioctl if that's not the case. Otherwise your nice plan for future extensions is going right down the gutters since someone *will* submit an ioctl struct with random stack garbage in the yet unused parts. Which then bakes in the ABI that those fields can never be used for anything else but garbage." It's important to ensure that reserved fields are set to a known value, so that it will be possible to use them later to extend the ABI.
[1] http://blog.ffwll.ch/2013/11/botching-up-ioctls.html if (ucmd.reserved0 || ucmd.reserved1) return -EINVAL; > + umem = ib_umem_get(context, ucmd.buf_addr, entries * ucmd.cqe_size, > + IB_ACCESS_LOCAL_WRITE, 1); > + if (IS_ERR(umem)) { > + err = PTR_ERR(umem); > + return err; > + } > + > + mlx5_ib_cont_pages(umem, ucmd.buf_addr, &npages, page_shift, > + npas, NULL); > + > + cq->resize_umem = umem; > + *cqe_size = ucmd.cqe_size; > + > + return 0; > +} > + > int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) > { > - return -ENOSYS; > + struct mlx5_ib_dev *dev = to_mdev(ibcq->device); > + struct mlx5_ib_cq *cq = to_mcq(ibcq); > + struct mlx5_modify_cq_mbox_in *in; > + int err; > + int npas; > + int page_shift; > + int inlen; > + int uninitialized_var(cqe_size); > + unsigned long flags; > + > + if (!(dev->mdev.caps.flags & MLX5_DEV_CAP_FLAG_RESIZE_CQ)) { > + pr_info("Firmware does not support resize CQ\n"); > + return -ENOSYS; > + } > + > + if (entries < 1) > + return -EINVAL; > + > + entries = roundup_pow_of_two(entries + 1); > + if (entries > dev->mdev.caps.max_cqes + 1) > + return -EINVAL; > + > + if (entries == ibcq->cqe + 1) > + return 0; > + > + mutex_lock(&cq->resize_mutex); ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ > + if (udata) { > + err = resize_user(dev, cq, entries, udata, &npas, &page_shift, > + &cqe_size); > + } else { > + cqe_size = 64; > + err = resize_kernel(dev, cq, entries, cqe_size); > + if (!err) { > + npas = cq->resize_buf->buf.npages; > + page_shift = cq->resize_buf->buf.page_shift; > + } > + } > + > + if (err) > + goto ex; > + > + inlen = sizeof(*in) + npas * sizeof(in->pas[0]); > + in = mlx5_vzalloc(inlen); > + if (!in) { > + err = -ENOMEM; > + goto ex_resize; > + } > + > + if (udata) > + mlx5_ib_populate_pas(dev, cq->resize_umem, page_shift, > + in->pas, 0); > + else > + mlx5_fill_page_array(&cq->resize_buf->buf, in->pas); > + > + in->field_select = cpu_to_be32(MLX5_MODIFY_CQ_MASK_LOG_SIZE | > + MLX5_MODIFY_CQ_MASK_PG_OFFSET | > + MLX5_MODIFY_CQ_MASK_PG_SIZE); > + in->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; > + in->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5; > + in->ctx.page_offset = 0; > + in->ctx.log_sz_usr_page = cpu_to_be32(ilog2(entries) << 24); > + in->hdr.opmod = cpu_to_be16(MLX5_CQ_OPMOD_RESIZE); > + in->cqn = cpu_to_be32(cq->mcq.cqn); > + > + err = mlx5_core_modify_cq(&dev->mdev, &cq->mcq, in, inlen); > + if (err) > + goto ex_alloc; > + > + if (udata) { > + cq->ibcq.cqe = entries - 1; > + ib_umem_release(cq->buf.umem); > + cq->buf.umem = cq->resize_umem; > + cq->resize_umem = NULL; > + } else { > + struct mlx5_ib_cq_buf tbuf; > + int resized = 0; > + > + spin_lock_irqsave(&cq->lock, flags); > + if (cq->resize_buf) { > + err = copy_resize_cqes(cq); > + if (!err) { > + tbuf = cq->buf; > + cq->buf = *cq->resize_buf; > + kfree(cq->resize_buf); > + cq->resize_buf = NULL; > + resized = 1; > + } > + } > + cq->ibcq.cqe = entries - 1; > + spin_unlock_irqrestore(&cq->lock, flags); > + if (resized) > + free_cq_buf(dev, &tbuf); > + } > + mutex_unlock(&cq->resize_mutex); ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Is everything in this section really critical? For example, allocating and setting the 'in' structure or releasing the resources could probably move outside the mutex protected section?
> + > + mlx5_vfree(in); > + return 0; > + > +ex_alloc: > + mlx5_vfree(in); > + > +ex_resize: > + if (udata) > + un_resize_user(cq); > + else > + un_resize_kernel(dev, cq); > +ex: > + mutex_unlock(&cq->resize_mutex); > + return err; > } > > > > int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, > - struct mlx5_modify_cq_mbox_in *in) > + struct mlx5_modify_cq_mbox_in *in, int in_sz) ^^^^^^^^^^ Should probably be 'unsigned'? size_t? > { > struct mlx5_modify_cq_mbox_out out; > int err; > > memset(&out, 0, sizeof(out)); > in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MODIFY_CQ); > - err = mlx5_cmd_exec(dev, in, sizeof(*in), &out, sizeof(out)); > + err = mlx5_cmd_exec(dev, in, in_sz, &out, sizeof(out)); > if (err) > return err; > > @@ -158,7 +166,7 @@ int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq); > int mlx5_core_query_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, > struct mlx5_query_cq_mbox_out *out); > int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, > - struct mlx5_modify_cq_mbox_in *in); > + struct mlx5_modify_cq_mbox_in *in, int in_sz); ^^^^^^^^^^ same here. diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h > index dbb03ca..87e2371 100644 > --- a/include/linux/mlx5/device.h > +++ b/include/linux/mlx5/device.h > @@ -710,6 +711,7 @@ struct mlx5_modify_cq_mbox_in { > > struct mlx5_modify_cq_mbox_out { > struct mlx5_outbox_hdr hdr; > + u8 rsvd[8]; > }; > > struct mlx5_enable_hca_mbox_in { > It's not clear why 8 bytes are needed here? Regards.
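For readers following the thread, the two review suggestions above combine into a pattern like the following sketch. The helper name `resize_cq_parse_ucmd()` is hypothetical; the field names come from `struct mlx5_ib_resize_cq` as extended by this patch (`buf_addr`, `cqe_size`, `reserved0`, `reserved1`).

```c
/*
 * Hypothetical helper sketching the two suggestions from the review:
 * propagate the error code returned by ib_copy_from_udata() instead of
 * a hard-coded -EFAULT, and reject commands whose reserved fields are
 * not zero so they stay usable for future ABI extensions.
 */
static int resize_cq_parse_ucmd(struct ib_udata *udata,
				struct mlx5_ib_resize_cq *ucmd)
{
	int err;

	err = ib_copy_from_udata(ucmd, udata, sizeof(*ucmd));
	if (err)
		return err;

	/* Reserved fields must be zero; anything else is rejected. */
	if (ucmd->reserved0 || ucmd->reserved1)
		return -EINVAL;

	return 0;
}
```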
On Tue, Jan 14, 2014 at 05:36:50PM +0100, Yann Droneaud wrote: > > + if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) > > + return -EFAULT; > > + > > You might also write > > err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); > if (err) > return err; > > Then you should check reserved fields being set to the default value: > As noted by Daniel Vetter in his article "Botching up ioctls"[1] > "Check *all* unused fields and flags and all the padding for whether > it's 0, and reject the ioctl if that's not the case. Otherwise your > nice plan for future extensions is going right down the gutters > since someone *will* submit an ioctl struct with random stack > garbage in the yet unused parts. Which then bakes in the ABI that > those fields can never be used for anything else but garbage." > It's important to ensure that reserved fields are set to a known value, > so that it will be possible to use them later to extend the ABI. > > [1] http://blog.ffwll.ch/2013/11/botching-up-ioctls.html > > if (ucmd.reserved0 || ucmd.reserved1) > return -EINVAL; > It is not likely that someone will pass non-zero values here since libmlx5 clears it and most apps will use it. But I agree with your comment - thanks for pointing this out. Probably there are other places that need to be checked. > > + } > > + mutex_unlock(&cq->resize_mutex); > ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ > > Is everything in this section really critical? > For example, allocating and setting the 'in' structure or releasing the > resources could probably move outside the mutex protected section? > Well, you could move things around to shorten the overall time the lock is held but that might require structural changes in the code that will not necessarily fit nicely. Resizing a CQ is not a frequent operation and this lock is used to avoid concurrent attempts to resize the same CQ so I would not invest more effort here. > > > > > > int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, > > - struct mlx5_modify_cq_mbox_in *in) > > + struct mlx5_modify_cq_mbox_in *in, int in_sz) > ^^^^^^^^^^ > > Should probably be 'unsigned'? size_t? > > same here. > The resize value is defined as an int at the ib core layer, so I chose to follow the same type to avoid the need for casting. Maybe a future patch could change the type all over. > diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h > > index dbb03ca..87e2371 100644 > > --- a/include/linux/mlx5/device.h > > +++ b/include/linux/mlx5/device.h > > @@ -710,6 +711,7 @@ struct mlx5_modify_cq_mbox_in { > > > > struct mlx5_modify_cq_mbox_out { > > struct mlx5_outbox_hdr hdr; > > + u8 rsvd[8]; > > }; > > > > struct mlx5_enable_hca_mbox_in { > > > > It's not clear why 8 bytes are needed here? > This is a requirement of the driver/firmware interface.
Hi Eli, Le mercredi 15 janvier 2014 à 09:33 +0200, Eli Cohen a écrit : > On Tue, Jan 14, 2014 at 05:36:50PM +0100, Yann Droneaud wrote: > > > + if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) > > > + return -EFAULT; > > > + > > > > You might also write > > > > err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); > > if (err) > > return err; I'd like to have your opinion on this change. I'm going to patch every call to ib_copy_from_udata() to follow my proposed scheme (e.g. return the error code from ib_copy_from_udata() instead of a hard-coded error value). > > > > Then you should check reserved fields being set to the default value: > > As noted by Daniel Vetter in his article "Botching up ioctls"[1] > > "Check *all* unused fields and flags and all the padding for whether > > it's 0, and reject the ioctl if that's not the case. Otherwise your > > nice plan for future extensions is going right down the gutters > > since someone *will* submit an ioctl struct with random stack > > garbage in the yet unused parts. Which then bakes in the ABI that > > those fields can never be used for anything else but garbage." > > It's important to ensure that reserved fields are set to a known value, > > so that it will be possible to use them later to extend the ABI. > > > > [1] http://blog.ffwll.ch/2013/11/botching-up-ioctls.html > > > > if (ucmd.reserved0 || ucmd.reserved1) > > return -EINVAL; > > > It is not likely that someone will pass non-zero values here since > libmlx5 clears it and most apps will use it. Is libmlx5/libibverbs the ABI? Unfortunately, anybody is allowed to access the kernel uverbs API directly, so we must take care of the kernel ABI, just in case. > But I agree with your > comment - thanks for pointing this out. Probably there are other > places that need to be checked. > For code not yet part of a released kernel version, we can fix that. But for all the others, it would require proper checking/thinking before rejecting reserved fields not set to 0 since it might theoretically break existing userspace programs: it will be a departure from the previous ABI. > > > > + } > > > + mutex_unlock(&cq->resize_mutex); > > ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ > > > > Is everything in this section really critical? > > For example, allocating and setting the 'in' structure or releasing the > > resources could probably move outside the mutex protected section? > > > > Well, you could move things around to shorten the overall time the > lock is held but that might require structural changes in the code > that will not necessarily fit nicely. Resizing a CQ is not a frequent > operation and this lock is used to avoid concurrent attempts to > resize the same CQ so I would not invest more effort here. > I agree. I would find it more readable to have the two locks held next to each other. YMMV. > > > > > > > > > int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, > > > - struct mlx5_modify_cq_mbox_in *in) > > > + struct mlx5_modify_cq_mbox_in *in, int in_sz) > > ^^^^^^^^^^ > > > > Should probably be 'unsigned'? size_t? > > > > same here. > > > > The resize value is defined as an int at the ib core layer, so I chose to > follow the same type to avoid the need for casting. Maybe a future patch > could change the type all over. > But it's the size of struct mlx5_modify_cq_mbox_in, not the number of 'cqe' to resize the cq to.
> > diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h > > > index dbb03ca..87e2371 100644 > > > --- a/include/linux/mlx5/device.h > > > +++ b/include/linux/mlx5/device.h > > > @@ -710,6 +711,7 @@ struct mlx5_modify_cq_mbox_in { > > > > > > struct mlx5_modify_cq_mbox_out { > > > struct mlx5_outbox_hdr hdr; > > > + u8 rsvd[8]; > > > }; > > > > > > struct mlx5_enable_hca_mbox_in { > > > > > > > It's not clear why 8 bytes are needed here? > > > This is a requirement of the driver/firmware interface. It's a bit of magic :( Regards.
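To make the type discussion above concrete: Yann's point is that `in_sz` describes the size of the mailbox command (header plus the variable-length `pas[]` array), not a CQE count, so an unsigned type would arguably fit better. A purely illustrative alternative prototype, not what the patch merges, might read:

```c
/*
 * Illustrative only: the same interface with a size_t length, as
 * suggested in the review. The merged patch keeps int to match the
 * types used by the ib core layer.
 */
int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
			struct mlx5_modify_cq_mbox_in *in, size_t in_sz);
```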
On Wed, Jan 15, 2014 at 11:02:55AM +0100, Yann Droneaud wrote: > > I'd like to have your opinion on this change. > > I'm going to patch every call to ib_copy_from_udata() to follow my > proposed scheme (e.g. return the error code from ib_copy_from_udata() > instead of a hard-coded error value). > Yes, makes sense to me. > > Is libmlx5/libibverbs the ABI? > > Unfortunately, anybody is allowed to access the kernel uverbs API > directly, so we must take care of the kernel ABI, just in case. > I am not sure I understand the question, but the libibverbs-to-libmlx5 relation is somewhat like the relationship between ib_core and mlx5_ib, e.g. libibverbs sets the API and libmlx5 implements it. > > For code not yet part of a released kernel version, we can fix that. > > But for all the others, it would require proper checking/thinking before > > rejecting reserved fields not set to 0 since it might theoretically break > > existing userspace programs: it will be a departure from the previous ABI. > > Yes, this needs to be done carefully for each driver. We need to check if either libibverbs or any other provider library clears the struct before sending. If this is the case then it should be safe to fix the kernel part. > > > > I would find it more readable to have the two locks held next to each other. > > YMMV. > > > I am not following you here. Please be more specific. > > > But it's the size of struct mlx5_modify_cq_mbox_in, not the number of > 'cqe' to resize the cq to. > Oh I see. Well, I don't think it will ever matter that I'm using int. I use it in other places too.
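Eli's point about provider libraries clearing the command structure can be illustrated with a small userspace-side sketch. The struct mirrors the `mlx5_ib_resize_cq` layout added by the patch; the function and the way libmlx5 actually wires this into its resize path are assumptions, not the real library code.

```c
#include <stdint.h>
#include <string.h>

/* Mirrors the kernel's struct mlx5_ib_resize_cq from this patch. */
struct resize_cq_cmd {
	uint64_t buf_addr;
	uint16_t cqe_size;
	uint16_t reserved0;
	uint32_t reserved1;
};

/* Hypothetical provider-library helper: zero the whole command first,
 * so reserved fields reach the kernel as 0 even if the ABI grows. */
static void prepare_resize_cmd(struct resize_cq_cmd *cmd,
			       uint64_t buf_addr, uint16_t cqe_size)
{
	memset(cmd, 0, sizeof(*cmd));
	cmd->buf_addr = buf_addr;
	cmd->cqe_size = cqe_size;
}
```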
Hi, Le mercredi 15 janvier 2014 à 13:24 +0200, Eli Cohen a écrit : > On Wed, Jan 15, 2014 at 11:02:55AM +0100, Yann Droneaud wrote: > > > > Is libmlx5/libibverbs the ABI? > > > > Unfortunately, anybody is allowed to access the kernel uverbs API > > directly, so we must take care of the kernel ABI, just in case. > > > > I am not sure I understand the question, but the libibverbs-to-libmlx5 > relation is somewhat like the relationship between ib_core and mlx5_ib, > e.g. libibverbs sets the API and libmlx5 implements it. It's a theoretical question: where to draw the line of the kernel ABI when it comes to uverbs? Is it write() + all the data structures for request and response? Or is it libibverbs + a provider-specific library? In the second case, we can change the behavior of the uverbs provided that it doesn't interfere with libibverbs: modifying corner-case behaviors which were never hit by libibverbs is then safe: it won't break userspace applications. But in the first case, we cannot attest for sure that modifying behavior will not break a hypothetical userspace application issuing uverbs commands without going through libibverbs: such a specific/specialized application might call uverbs in a way different from libibverbs, which could break. I'm not aware of such an unknown application, but I believe it could exist in the "proprietary" world. > > > > For code not yet part of a released kernel version, we can fix that. > > But for all the others, it would require proper checking/thinking before > > rejecting reserved fields not set to 0 since it might theoretically break > > existing userspace programs: it will be a departure from the previous ABI. > > > > Yes, this needs to be done carefully for each driver. We need to check > if either libibverbs or any other provider library clears the struct > before sending. If this is the case then it should be safe to fix the > kernel part. > > > > > I would find it more readable to have the two locks held next to each other. > > YMMV. > > > I am not following you here. Please be more specific. I thought that reworking the code to move mutex_lock(&cq->resize_mutex) and spin_lock_irqsave(&cq->lock, flags); next to each other could show that the first one would not be needed: perhaps it is possible to do: - prepare everything, - take the lock, - swap, - release the lock, - clean up (oh, that reminds me of RCU). I agree that resizing the CQ is not a frequent operation, so trying to rewrite the code as I suggest won't give much advantage. Regards.
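Independent of the mlx5 specifics, the "prepare, lock, swap, unlock, clean up" shape Yann describes looks roughly like the sketch below. All names here are hypothetical; it is a generic illustration of the pattern, not a proposal for the actual driver code.

```c
#include <linux/slab.h>
#include <linux/spinlock.h>

struct cq_buf {
	void *data;
	int nent;
};

static struct cq_buf *cq_active_buf;		/* protected by cq_lock */
static DEFINE_SPINLOCK(cq_lock);

static int cq_resize(int entries)
{
	struct cq_buf *new_buf, *old_buf;
	unsigned long flags;

	/* 1. Prepare the new buffer with no lock held (may sleep). */
	new_buf = kzalloc(sizeof(*new_buf), GFP_KERNEL);
	if (!new_buf)
		return -ENOMEM;
	new_buf->nent = entries;

	/* 2./3./4. Take the lock only for the pointer swap. */
	spin_lock_irqsave(&cq_lock, flags);
	old_buf = cq_active_buf;
	cq_active_buf = new_buf;
	spin_unlock_irqrestore(&cq_lock, flags);

	/* 5. Clean up the old buffer outside the lock. */
	kfree(old_buf);
	return 0;
}
```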
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index b4c122e..50b03a8 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -73,14 +73,24 @@ static void *get_cqe(struct mlx5_ib_cq *cq, int n) return get_cqe_from_buf(&cq->buf, n, cq->mcq.cqe_sz); } +static u8 sw_ownership_bit(int n, int nent) +{ + return (n & nent) ? 1 : 0; +} + static void *get_sw_cqe(struct mlx5_ib_cq *cq, int n) { void *cqe = get_cqe(cq, n & cq->ibcq.cqe); struct mlx5_cqe64 *cqe64; cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; - return ((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ - !!(n & (cq->ibcq.cqe + 1))) ? NULL : cqe; + + if (likely((cqe64->op_own) >> 4 != MLX5_CQE_INVALID) && + !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ibcq.cqe + 1)))) { + return cqe; + } else { + return NULL; + } } static void *next_cqe_sw(struct mlx5_ib_cq *cq) @@ -351,6 +361,11 @@ static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64, qp->sq.last_poll = tail; } +static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf) +{ + mlx5_buf_free(&dev->mdev, &buf->buf); +} + static int mlx5_poll_one(struct mlx5_ib_cq *cq, struct mlx5_ib_qp **cur_qp, struct ib_wc *wc) @@ -366,6 +381,7 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq, void *cqe; int idx; +repoll: cqe = next_cqe_sw(cq); if (!cqe) return -EAGAIN; @@ -379,7 +395,18 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq, */ rmb(); - /* TBD: resize CQ */ + opcode = cqe64->op_own >> 4; + if (unlikely(opcode == MLX5_CQE_RESIZE_CQ)) { + if (likely(cq->resize_buf)) { + free_cq_buf(dev, &cq->buf); + cq->buf = *cq->resize_buf; + kfree(cq->resize_buf); + cq->resize_buf = NULL; + goto repoll; + } else { + mlx5_ib_warn(dev, "unexpected resize cqe\n"); + } + } qpn = ntohl(cqe64->sop_drop_qpn) & 0xffffff; if (!*cur_qp || (qpn != (*cur_qp)->ibqp.qp_num)) { @@ -398,7 +425,6 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq, } wc->qp = &(*cur_qp)->ibqp; - opcode = cqe64->op_own >> 4; switch (opcode) { case MLX5_CQE_REQ: wq = &(*cur_qp)->sq; @@ -503,15 +529,11 @@ static int alloc_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf, return err; buf->cqe_size = cqe_size; + buf->nent = nent; return 0; } -static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf) -{ - mlx5_buf_free(&dev->mdev, &buf->buf); -} - static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, struct ib_ucontext *context, struct mlx5_ib_cq *cq, int entries, struct mlx5_create_cq_mbox_in **cqb, @@ -576,16 +598,16 @@ static void destroy_cq_user(struct mlx5_ib_cq *cq, struct ib_ucontext *context) ib_umem_release(cq->buf.umem); } -static void init_cq_buf(struct mlx5_ib_cq *cq, int nent) +static void init_cq_buf(struct mlx5_ib_cq *cq, struct mlx5_ib_cq_buf *buf) { int i; void *cqe; struct mlx5_cqe64 *cqe64; - for (i = 0; i < nent; i++) { - cqe = get_cqe(cq, i); - cqe64 = (cq->buf.cqe_size == 64) ? cqe : cqe + 64; - cqe64->op_own = 0xf1; + for (i = 0; i < buf->nent; i++) { + cqe = get_cqe_from_buf(buf, i, buf->cqe_size); + cqe64 = buf->cqe_size == 64 ? 
cqe : cqe + 64; + cqe64->op_own = MLX5_CQE_INVALID << 4; } } @@ -610,7 +632,7 @@ static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, if (err) goto err_db; - init_cq_buf(cq, entries); + init_cq_buf(cq, &cq->buf); *inlen = sizeof(**cqb) + sizeof(*(*cqb)->pas) * cq->buf.buf.npages; *cqb = mlx5_vzalloc(*inlen); @@ -836,7 +858,7 @@ int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) in->ctx.cq_period = cpu_to_be16(cq_period); in->ctx.cq_max_count = cpu_to_be16(cq_count); in->field_select = cpu_to_be32(fsel); - err = mlx5_core_modify_cq(&dev->mdev, &mcq->mcq, in); + err = mlx5_core_modify_cq(&dev->mdev, &mcq->mcq, in, sizeof(*in)); kfree(in); if (err) @@ -845,9 +867,235 @@ int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) return err; } +static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, struct ib_udata *udata, int *npas, + int *page_shift, int *cqe_size) +{ + struct mlx5_ib_resize_cq ucmd; + struct ib_umem *umem; + int err; + int npages; + struct ib_ucontext *context = cq->buf.umem->context; + + if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) + return -EFAULT; + + umem = ib_umem_get(context, ucmd.buf_addr, entries * ucmd.cqe_size, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(umem)) { + err = PTR_ERR(umem); + return err; + } + + mlx5_ib_cont_pages(umem, ucmd.buf_addr, &npages, page_shift, + npas, NULL); + + cq->resize_umem = umem; + *cqe_size = ucmd.cqe_size; + + return 0; +} + +static void un_resize_user(struct mlx5_ib_cq *cq) +{ + ib_umem_release(cq->resize_umem); +} + +static int resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, + int entries, int cqe_size) +{ + int err; + + cq->resize_buf = kzalloc(sizeof(*cq->resize_buf), GFP_KERNEL); + if (!cq->resize_buf) + return -ENOMEM; + + err = alloc_cq_buf(dev, cq->resize_buf, entries, cqe_size); + if (err) + goto ex; + + init_cq_buf(cq, cq->resize_buf); + + return 0; + +ex: + kfree(cq->resize_buf); + return err; +} + +static void un_resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq) +{ + free_cq_buf(dev, cq->resize_buf); + cq->resize_buf = NULL; +} + +static int copy_resize_cqes(struct mlx5_ib_cq *cq) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_cqe64 *scqe64; + struct mlx5_cqe64 *dcqe64; + void *start_cqe; + void *scqe; + void *dcqe; + int ssize; + int dsize; + int i; + u8 sw_own; + + ssize = cq->buf.cqe_size; + dsize = cq->resize_buf->cqe_size; + if (ssize != dsize) { + mlx5_ib_warn(dev, "resize from different cqe size is not supported\n"); + return -EINVAL; + } + + i = cq->mcq.cons_index; + scqe = get_sw_cqe(cq, i); + scqe64 = ssize == 64 ? scqe : scqe + 64; + start_cqe = scqe; + if (!scqe) { + mlx5_ib_warn(dev, "expected cqe in sw ownership\n"); + return -EINVAL; + } + + while ((scqe64->op_own >> 4) != MLX5_CQE_RESIZE_CQ) { + dcqe = get_cqe_from_buf(cq->resize_buf, + (i + 1) & (cq->resize_buf->nent), + dsize); + dcqe64 = dsize == 64 ? dcqe : dcqe + 64; + sw_own = sw_ownership_bit(i + 1, cq->resize_buf->nent); + memcpy(dcqe, scqe, dsize); + dcqe64->op_own = (dcqe64->op_own & ~MLX5_CQE_OWNER_MASK) | sw_own; + + ++i; + scqe = get_sw_cqe(cq, i); + scqe64 = ssize == 64 ? 
scqe : scqe + 64; + if (!scqe) { + mlx5_ib_warn(dev, "expected cqe in sw ownership\n"); + return -EINVAL; + } + + if (scqe == start_cqe) { + pr_warn("resize CQ failed to get resize CQE, CQN 0x%x\n", + cq->mcq.cqn); + return -ENOMEM; + } + } + ++cq->mcq.cons_index; + return 0; +} + int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) { - return -ENOSYS; + struct mlx5_ib_dev *dev = to_mdev(ibcq->device); + struct mlx5_ib_cq *cq = to_mcq(ibcq); + struct mlx5_modify_cq_mbox_in *in; + int err; + int npas; + int page_shift; + int inlen; + int uninitialized_var(cqe_size); + unsigned long flags; + + if (!(dev->mdev.caps.flags & MLX5_DEV_CAP_FLAG_RESIZE_CQ)) { + pr_info("Firmware does not support resize CQ\n"); + return -ENOSYS; + } + + if (entries < 1) + return -EINVAL; + + entries = roundup_pow_of_two(entries + 1); + if (entries > dev->mdev.caps.max_cqes + 1) + return -EINVAL; + + if (entries == ibcq->cqe + 1) + return 0; + + mutex_lock(&cq->resize_mutex); + if (udata) { + err = resize_user(dev, cq, entries, udata, &npas, &page_shift, + &cqe_size); + } else { + cqe_size = 64; + err = resize_kernel(dev, cq, entries, cqe_size); + if (!err) { + npas = cq->resize_buf->buf.npages; + page_shift = cq->resize_buf->buf.page_shift; + } + } + + if (err) + goto ex; + + inlen = sizeof(*in) + npas * sizeof(in->pas[0]); + in = mlx5_vzalloc(inlen); + if (!in) { + err = -ENOMEM; + goto ex_resize; + } + + if (udata) + mlx5_ib_populate_pas(dev, cq->resize_umem, page_shift, + in->pas, 0); + else + mlx5_fill_page_array(&cq->resize_buf->buf, in->pas); + + in->field_select = cpu_to_be32(MLX5_MODIFY_CQ_MASK_LOG_SIZE | + MLX5_MODIFY_CQ_MASK_PG_OFFSET | + MLX5_MODIFY_CQ_MASK_PG_SIZE); + in->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + in->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5; + in->ctx.page_offset = 0; + in->ctx.log_sz_usr_page = cpu_to_be32(ilog2(entries) << 24); + in->hdr.opmod = cpu_to_be16(MLX5_CQ_OPMOD_RESIZE); + in->cqn = cpu_to_be32(cq->mcq.cqn); + + err = mlx5_core_modify_cq(&dev->mdev, &cq->mcq, in, inlen); + if (err) + goto ex_alloc; + + if (udata) { + cq->ibcq.cqe = entries - 1; + ib_umem_release(cq->buf.umem); + cq->buf.umem = cq->resize_umem; + cq->resize_umem = NULL; + } else { + struct mlx5_ib_cq_buf tbuf; + int resized = 0; + + spin_lock_irqsave(&cq->lock, flags); + if (cq->resize_buf) { + err = copy_resize_cqes(cq); + if (!err) { + tbuf = cq->buf; + cq->buf = *cq->resize_buf; + kfree(cq->resize_buf); + cq->resize_buf = NULL; + resized = 1; + } + } + cq->ibcq.cqe = entries - 1; + spin_unlock_irqrestore(&cq->lock, flags); + if (resized) + free_cq_buf(dev, &tbuf); + } + mutex_unlock(&cq->resize_mutex); + + mlx5_vfree(in); + return 0; + +ex_alloc: + mlx5_vfree(in); + +ex_resize: + if (udata) + un_resize_user(cq); + else + un_resize_kernel(dev, cq); +ex: + mutex_unlock(&cq->resize_mutex); + return err; } int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 5acef30..389e319 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -195,6 +195,7 @@ struct mlx5_ib_cq_buf { struct mlx5_buf buf; struct ib_umem *umem; int cqe_size; + int nent; }; enum mlx5_ib_qp_flags { @@ -220,7 +221,7 @@ struct mlx5_ib_cq { /* protect resize cq */ struct mutex resize_mutex; - struct mlx5_ib_cq_resize *resize_buf; + struct mlx5_ib_cq_buf *resize_buf; struct ib_umem *resize_umem; int cqe_size; }; diff --git 
a/drivers/infiniband/hw/mlx5/user.h b/drivers/infiniband/hw/mlx5/user.h index a886de3..32a2a5d 100644 --- a/drivers/infiniband/hw/mlx5/user.h +++ b/drivers/infiniband/hw/mlx5/user.h @@ -93,6 +93,9 @@ struct mlx5_ib_create_cq_resp { struct mlx5_ib_resize_cq { __u64 buf_addr; + __u16 cqe_size; + __u16 reserved0; + __u32 reserved1; }; struct mlx5_ib_create_srq { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c index e6fedcf..43c5f48 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -201,14 +201,14 @@ EXPORT_SYMBOL(mlx5_core_query_cq); int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, - struct mlx5_modify_cq_mbox_in *in) + struct mlx5_modify_cq_mbox_in *in, int in_sz) { struct mlx5_modify_cq_mbox_out out; int err; memset(&out, 0, sizeof(out)); in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MODIFY_CQ); - err = mlx5_cmd_exec(dev, in, sizeof(*in), &out, sizeof(out)); + err = mlx5_cmd_exec(dev, in, in_sz, &out, sizeof(out)); if (err) return err; diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index c3cf5a4..2202c7f 100644 --- a/include/linux/mlx5/cq.h +++ b/include/linux/mlx5/cq.h @@ -79,9 +79,10 @@ enum { MLX5_CQE_RESP_SEND = 2, MLX5_CQE_RESP_SEND_IMM = 3, MLX5_CQE_RESP_SEND_INV = 4, - MLX5_CQE_RESIZE_CQ = 0xff, /* TBD */ + MLX5_CQE_RESIZE_CQ = 5, MLX5_CQE_REQ_ERR = 13, MLX5_CQE_RESP_ERR = 14, + MLX5_CQE_INVALID = 15, }; enum { @@ -90,6 +91,13 @@ enum { MLX5_CQ_MODIFY_OVERRUN = 1 << 2, }; +enum { + MLX5_CQ_OPMOD_RESIZE = 1, + MLX5_MODIFY_CQ_MASK_LOG_SIZE = 1 << 0, + MLX5_MODIFY_CQ_MASK_PG_OFFSET = 1 << 1, + MLX5_MODIFY_CQ_MASK_PG_SIZE = 1 << 2, +}; + struct mlx5_cq_modify_params { int type; union { @@ -158,7 +166,7 @@ int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq); int mlx5_core_query_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, struct mlx5_query_cq_mbox_out *out); int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, - struct mlx5_modify_cq_mbox_in *in); + struct mlx5_modify_cq_mbox_in *in, int in_sz); int mlx5_debug_cq_add(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq); void mlx5_debug_cq_remove(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq); diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index dbb03ca..87e2371 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -178,6 +178,7 @@ enum { MLX5_DEV_CAP_FLAG_ATOMIC = 1LL << 18, MLX5_DEV_CAP_FLAG_ON_DMND_PG = 1LL << 24, MLX5_DEV_CAP_FLAG_CQ_MODER = 1LL << 29, + MLX5_DEV_CAP_FLAG_RESIZE_CQ = 1LL << 30, MLX5_DEV_CAP_FLAG_RESIZE_SRQ = 1LL << 32, MLX5_DEV_CAP_FLAG_REMOTE_FENCE = 1LL << 38, MLX5_DEV_CAP_FLAG_TLP_HINTS = 1LL << 39, @@ -710,6 +711,7 @@ struct mlx5_modify_cq_mbox_in { struct mlx5_modify_cq_mbox_out { struct mlx5_outbox_hdr hdr; + u8 rsvd[8]; }; struct mlx5_enable_hca_mbox_in {
Implement resize CQ which is a mandatory verb in mlx5. Signed-off-by: Eli Cohen <eli@mellanox.com> Conflicts: include/linux/mlx5/device.h --- drivers/infiniband/hw/mlx5/cq.c | 282 +++++++++++++++++++++++++-- drivers/infiniband/hw/mlx5/mlx5_ib.h | 3 +- drivers/infiniband/hw/mlx5/user.h | 3 + drivers/net/ethernet/mellanox/mlx5/core/cq.c | 4 +- include/linux/mlx5/cq.h | 12 +- include/linux/mlx5/device.h | 2 + 6 files changed, 284 insertions(+), 22 deletions(-)