Message ID | 1678868215-23626-1-git-send-email-selvin.xavier@broadcom.com (mailing list archive) |
---|---|
State | Accepted |
Headers | show |
Series | [for-next] RDMA/bnxt_re: Add resize_cq support | expand |
On 2023/3/15 16:16, Selvin Xavier wrote: > Add resize_cq verb support for user space CQs. Resize operation for > kernel CQs are not supported now. > > Driver should free the current CQ only after user library polls > for all the completions and switch to new CQ. So after the resize_cq > is returned from the driver, user libray polls for existing completions libray -> library > and store it as temporary data. Once library reaps all completions in the > current CQ, it invokes the ibv_cmd_poll_cq to inform the driver about > the resize_cq completion. Adding a check for user CQs in driver's > poll_cq and complete the resize operation for user CQs. > Updating uverbs_cmd_mask with poll_cq to support this. > > User library changes are available in this pull request. > https://github.com/linux-rdma/rdma-core/pull/1315 > > Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com> > --- > drivers/infiniband/hw/bnxt_re/ib_verbs.c | 109 +++++++++++++++++++++++++++++++ > drivers/infiniband/hw/bnxt_re/ib_verbs.h | 3 + > drivers/infiniband/hw/bnxt_re/main.c | 2 + > drivers/infiniband/hw/bnxt_re/qplib_fp.c | 44 +++++++++++++ > drivers/infiniband/hw/bnxt_re/qplib_fp.h | 5 ++ > include/uapi/rdma/bnxt_re-abi.h | 4 ++ > 6 files changed, 167 insertions(+) > > diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c > index 989edc7..e86afec 100644 > --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c > +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c > @@ -2912,6 +2912,106 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, > return rc; > } > > +static void bnxt_re_resize_cq_complete(struct bnxt_re_cq *cq) > +{ > + struct bnxt_re_dev *rdev = cq->rdev; > + > + bnxt_qplib_resize_cq_complete(&rdev->qplib_res, &cq->qplib_cq); > + > + cq->qplib_cq.max_wqe = cq->resize_cqe; > + if (cq->resize_umem) { > + ib_umem_release(cq->umem); > + cq->umem = cq->resize_umem; > + cq->resize_umem = NULL; > + cq->resize_cqe = 0; > + } > +} > + > +int bnxt_re_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) > +{ > + struct bnxt_qplib_sg_info sg_info = {}; > + struct bnxt_qplib_dpi *orig_dpi = NULL; > + struct bnxt_qplib_dev_attr *dev_attr; > + struct bnxt_re_ucontext *uctx = NULL; > + struct bnxt_re_resize_cq_req req; > + struct bnxt_re_dev *rdev; > + struct bnxt_re_cq *cq; > + int rc, entries; > + > + cq = container_of(ibcq, struct bnxt_re_cq, ib_cq); > + rdev = cq->rdev; > + dev_attr = &rdev->dev_attr; > + if (!ibcq->uobject) { > + ibdev_err(&rdev->ibdev, "Kernel CQ Resize not supported"); > + return -EOPNOTSUPP; > + } > + > + if (cq->resize_umem) { > + ibdev_err(&rdev->ibdev, "Resize CQ %#x failed - Busy", > + cq->qplib_cq.id); > + return -EBUSY; > + } Does above cq->resize_umem checking has any conconcurrent protection again the bnxt_re_resize_cq_complete() called by bnxt_re_poll_cq()? bnxt_re_resize_cq() seems like a control path operation, while bnxt_re_poll_cq() seems like a data path operation, I am not sure there is any conconcurrent protection between them. > + > + /* Check the requested cq depth out of supported depth */ > + if (cqe < 1 || cqe > dev_attr->max_cq_wqes) { > + ibdev_err(&rdev->ibdev, "Resize CQ %#x failed - out of range cqe %d", > + cq->qplib_cq.id, cqe); > + return -EINVAL; > + } > + > + entries = roundup_pow_of_two(cqe + 1); > + if (entries > dev_attr->max_cq_wqes + 1) > + entries = dev_attr->max_cq_wqes + 1; > + > + uctx = rdma_udata_to_drv_context(udata, struct bnxt_re_ucontext, > + ib_uctx); > + /* uverbs consumer */ > + if (ib_copy_from_udata(&req, udata, sizeof(req))) { > + rc = -EFAULT; > + goto fail; > + } > + > + cq->resize_umem = ib_umem_get(&rdev->ibdev, req.cq_va, > + entries * sizeof(struct cq_base), > + IB_ACCESS_LOCAL_WRITE); > + if (IS_ERR(cq->resize_umem)) { > + rc = PTR_ERR(cq->resize_umem); > + cq->resize_umem = NULL; > + ibdev_err(&rdev->ibdev, "%s: ib_umem_get failed! rc = %d\n", > + __func__, rc); > + goto fail; > + } > + cq->resize_cqe = entries; > + memcpy(&sg_info, &cq->qplib_cq.sg_info, sizeof(sg_info)); > + orig_dpi = cq->qplib_cq.dpi; > + > + cq->qplib_cq.sg_info.umem = cq->resize_umem; > + cq->qplib_cq.sg_info.pgsize = PAGE_SIZE; > + cq->qplib_cq.sg_info.pgshft = PAGE_SHIFT; > + cq->qplib_cq.dpi = &uctx->dpi; > + > + rc = bnxt_qplib_resize_cq(&rdev->qplib_res, &cq->qplib_cq, entries); > + if (rc) { > + ibdev_err(&rdev->ibdev, "Resize HW CQ %#x failed!", > + cq->qplib_cq.id); > + goto fail; > + } > + > + cq->ib_cq.cqe = cq->resize_cqe; > + > + return 0; > + > +fail: > + if (cq->resize_umem) { > + ib_umem_release(cq->resize_umem); > + cq->resize_umem = NULL; > + cq->resize_cqe = 0; > + memcpy(&cq->qplib_cq.sg_info, &sg_info, sizeof(sg_info)); > + cq->qplib_cq.dpi = orig_dpi; > + } > + return rc; > +} > + > static u8 __req_to_ib_wc_status(u8 qstatus) > { > switch (qstatus) { > @@ -3425,6 +3525,15 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc) > struct bnxt_re_sqp_entries *sqp_entry = NULL; > unsigned long flags; > > + /* User CQ; the only processing we do is to > + * complete any pending CQ resize operation. > + */ > + if (cq->umem) { > + if (cq->resize_umem) > + bnxt_re_resize_cq_complete(cq); > + return 0; > + } > + > spin_lock_irqsave(&cq->cq_lock, flags); > budget = min_t(u32, num_entries, cq->max_cql); > num_entries = budget; > diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h > index 9432626..31f7e34 100644 > --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h > +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h > @@ -104,6 +104,8 @@ struct bnxt_re_cq { > #define MAX_CQL_PER_POLL 1024 > u32 max_cql; > struct ib_umem *umem; > + struct ib_umem *resize_umem; > + int resize_cqe; > }; > > struct bnxt_re_mr { > @@ -191,6 +193,7 @@ int bnxt_re_post_recv(struct ib_qp *qp, const struct ib_recv_wr *recv_wr, > const struct ib_recv_wr **bad_recv_wr); > int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, > struct ib_udata *udata); > +int bnxt_re_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); > int bnxt_re_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); > int bnxt_re_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc); > int bnxt_re_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); > diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c > index c5867e7..48bbba7 100644 > --- a/drivers/infiniband/hw/bnxt_re/main.c > +++ b/drivers/infiniband/hw/bnxt_re/main.c > @@ -553,6 +553,7 @@ static const struct ib_device_ops bnxt_re_dev_ops = { > .query_srq = bnxt_re_query_srq, > .reg_user_mr = bnxt_re_reg_user_mr, > .req_notify_cq = bnxt_re_req_notify_cq, > + .resize_cq = bnxt_re_resize_cq, > INIT_RDMA_OBJ_SIZE(ib_ah, bnxt_re_ah, ib_ah), > INIT_RDMA_OBJ_SIZE(ib_cq, bnxt_re_cq, ib_cq), > INIT_RDMA_OBJ_SIZE(ib_pd, bnxt_re_pd, ib_pd), > @@ -584,6 +585,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) > return ret; > > dma_set_max_seg_size(&rdev->en_dev->pdev->dev, UINT_MAX); > + ibdev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POLL_CQ); > return ib_register_device(ibdev, "bnxt_re%d", &rdev->en_dev->pdev->dev); > } > > diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c > index 96e581c..1d769a3 100644 > --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c > +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c > @@ -2100,6 +2100,50 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) > return rc; > } > > +void bnxt_qplib_resize_cq_complete(struct bnxt_qplib_res *res, > + struct bnxt_qplib_cq *cq) > +{ > + bnxt_qplib_free_hwq(res, &cq->hwq); > + memcpy(&cq->hwq, &cq->resize_hwq, sizeof(cq->hwq)); > +} > + > +int bnxt_qplib_resize_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq, > + int new_cqes) > +{ > + struct bnxt_qplib_hwq_attr hwq_attr = {}; > + struct bnxt_qplib_rcfw *rcfw = res->rcfw; > + struct creq_resize_cq_resp resp = {}; > + struct cmdq_resize_cq req = {}; > + struct bnxt_qplib_pbl *pbl; > + u32 pg_sz, lvl, new_sz; > + u16 cmd_flags = 0; > + int rc; > + > + RCFW_CMD_PREP(req, RESIZE_CQ, cmd_flags); > + hwq_attr.sginfo = &cq->sg_info; > + hwq_attr.res = res; > + hwq_attr.depth = new_cqes; > + hwq_attr.stride = sizeof(struct cq_base); > + hwq_attr.type = HWQ_TYPE_QUEUE; > + rc = bnxt_qplib_alloc_init_hwq(&cq->resize_hwq, &hwq_attr); > + if (rc) > + return rc; > + > + req.cq_cid = cpu_to_le32(cq->id); > + pbl = &cq->resize_hwq.pbl[PBL_LVL_0]; > + pg_sz = bnxt_qplib_base_pg_size(&cq->resize_hwq); > + lvl = (cq->resize_hwq.level << CMDQ_RESIZE_CQ_LVL_SFT) & > + CMDQ_RESIZE_CQ_LVL_MASK; > + new_sz = (new_cqes << CMDQ_RESIZE_CQ_NEW_CQ_SIZE_SFT) & > + CMDQ_RESIZE_CQ_NEW_CQ_SIZE_MASK; > + req.new_cq_size_pg_size_lvl = cpu_to_le32(new_sz | pg_sz | lvl); > + req.new_pbl = cpu_to_le64(pbl->pg_map_arr[0]); > + > + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, > + (void *)&resp, NULL, 0); > + return rc; return bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp, NULL, 0); > +} > + > int bnxt_qplib_destroy_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) > { > struct bnxt_qplib_rcfw *rcfw = res->rcfw; > diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h > index 0375019..d74d5ea 100644 > --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h > +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h > @@ -400,6 +400,7 @@ struct bnxt_qplib_cq { > u16 count; > u16 period; > struct bnxt_qplib_hwq hwq; > + struct bnxt_qplib_hwq resize_hwq; > u32 cnq_hw_ring_id; > struct bnxt_qplib_nq *nq; > bool resize_in_progress; > @@ -532,6 +533,10 @@ void bnxt_qplib_post_recv_db(struct bnxt_qplib_qp *qp); > int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp, > struct bnxt_qplib_swqe *wqe); > int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq); > +int bnxt_qplib_resize_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq, > + int new_cqes); > +void bnxt_qplib_resize_cq_complete(struct bnxt_qplib_res *res, > + struct bnxt_qplib_cq *cq); > int bnxt_qplib_destroy_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq); > int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe, > int num, struct bnxt_qplib_qp **qp); > diff --git a/include/uapi/rdma/bnxt_re-abi.h b/include/uapi/rdma/bnxt_re-abi.h > index b1de99b..f761e75 100644 > --- a/include/uapi/rdma/bnxt_re-abi.h > +++ b/include/uapi/rdma/bnxt_re-abi.h > @@ -96,6 +96,10 @@ struct bnxt_re_cq_resp { > __u32 rsvd; > }; > > +struct bnxt_re_resize_cq_req { > + __u64 cq_va; > +}; > + > struct bnxt_re_qp_req { > __aligned_u64 qpsva; > __aligned_u64 qprva; >
On Fri, Mar 17, 2023 at 2:40 PM Yunsheng Lin <linyunsheng@huawei.com> wrote: > > On 2023/3/15 16:16, Selvin Xavier wrote: > > Add resize_cq verb support for user space CQs. Resize operation for > > kernel CQs are not supported now. > > > > Driver should free the current CQ only after user library polls > > for all the completions and switch to new CQ. So after the resize_cq > > is returned from the driver, user libray polls for existing completions > > libray -> library > > > and store it as temporary data. Once library reaps all completions in the > > current CQ, it invokes the ibv_cmd_poll_cq to inform the driver about > > the resize_cq completion. Adding a check for user CQs in driver's > > poll_cq and complete the resize operation for user CQs. > > Updating uverbs_cmd_mask with poll_cq to support this. > > > > User library changes are available in this pull request. > > https://github.com/linux-rdma/rdma-core/pull/1315 > > > > Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com> > > --- > > drivers/infiniband/hw/bnxt_re/ib_verbs.c | 109 +++++++++++++++++++++++++++++++ > > drivers/infiniband/hw/bnxt_re/ib_verbs.h | 3 + > > drivers/infiniband/hw/bnxt_re/main.c | 2 + > > drivers/infiniband/hw/bnxt_re/qplib_fp.c | 44 +++++++++++++ > > drivers/infiniband/hw/bnxt_re/qplib_fp.h | 5 ++ > > include/uapi/rdma/bnxt_re-abi.h | 4 ++ > > 6 files changed, 167 insertions(+) > > > > diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c > > index 989edc7..e86afec 100644 > > --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c > > +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c > > @@ -2912,6 +2912,106 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, > > return rc; > > } > > > > +static void bnxt_re_resize_cq_complete(struct bnxt_re_cq *cq) > > +{ > > + struct bnxt_re_dev *rdev = cq->rdev; > > + > > + bnxt_qplib_resize_cq_complete(&rdev->qplib_res, &cq->qplib_cq); > > + > > + cq->qplib_cq.max_wqe = cq->resize_cqe; > > + if (cq->resize_umem) { > > + ib_umem_release(cq->umem); > > + cq->umem = cq->resize_umem; > > + cq->resize_umem = NULL; > > + cq->resize_cqe = 0; > > + } > > +} > > + > > +int bnxt_re_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) > > +{ > > + struct bnxt_qplib_sg_info sg_info = {}; > > + struct bnxt_qplib_dpi *orig_dpi = NULL; > > + struct bnxt_qplib_dev_attr *dev_attr; > > + struct bnxt_re_ucontext *uctx = NULL; > > + struct bnxt_re_resize_cq_req req; > > + struct bnxt_re_dev *rdev; > > + struct bnxt_re_cq *cq; > > + int rc, entries; > > + > > + cq = container_of(ibcq, struct bnxt_re_cq, ib_cq); > > + rdev = cq->rdev; > > + dev_attr = &rdev->dev_attr; > > + if (!ibcq->uobject) { > > + ibdev_err(&rdev->ibdev, "Kernel CQ Resize not supported"); > > + return -EOPNOTSUPP; > > + } > > + > > + if (cq->resize_umem) { > > + ibdev_err(&rdev->ibdev, "Resize CQ %#x failed - Busy", > > + cq->qplib_cq.id); > > + return -EBUSY; > > + } > > Does above cq->resize_umem checking has any conconcurrent protection > again the bnxt_re_resize_cq_complete() called by bnxt_re_poll_cq()? > > bnxt_re_resize_cq() seems like a control path operation, while > bnxt_re_poll_cq() seems like a data path operation, I am not sure > there is any conconcurrent protection between them. > The previous check is to prevent simultaneous resize_cq context from the user application. if you see the library implementation (PR https://github.com/linux-rdma/rdma-core/pull/1315), entire operation is done in the single resize_cq context from application i.e. bnxt_re_resize_cq -> ibv_cmd_resize_cq call driver bnxt_re_resize_cq and return -> poll out the current completions and store it in a user lib list -> issue an ibv_cmd_poll_cq. This will invoke bnxt_re_poll_cq in the kernel driver. We free the previous cq resources. So the synchronization between resize_cq and poll_cq is happening in the user lib. We can free the old CQ only after user land sees the final Completion on the previous CQ. So we return from the driver's bnxt_re_resize_cq and then poll for all completions and then use ibv_cmd_poll_cq as a hook to reach the kernel driver and free the old CQ's resource. Summarize, driver's bnxt_re_poll_cq for user space CQs will be called only from resize_cq context and no concurrent protection required in the driver. > > + > > + /* Check the requested cq depth out of supported depth */ > > + if (cqe < 1 || cqe > dev_attr->max_cq_wqes) { > > + ibdev_err(&rdev->ibdev, "Resize CQ %#x failed - out of range cqe %d", > > + cq->qplib_cq.id, cqe); > > + return -EINVAL; > > + } > > + > > + entries = roundup_pow_of_two(cqe + 1); > > + if (entries > dev_attr->max_cq_wqes + 1) > > + entries = dev_attr->max_cq_wqes + 1; > > + > > + uctx = rdma_udata_to_drv_context(udata, struct bnxt_re_ucontext, > > + ib_uctx); > > + /* uverbs consumer */ > > + if (ib_copy_from_udata(&req, udata, sizeof(req))) { > > + rc = -EFAULT; > > + goto fail; > > + } > > + > > + cq->resize_umem = ib_umem_get(&rdev->ibdev, req.cq_va, > > + entries * sizeof(struct cq_base), > > + IB_ACCESS_LOCAL_WRITE); > > + if (IS_ERR(cq->resize_umem)) { > > + rc = PTR_ERR(cq->resize_umem); > > + cq->resize_umem = NULL; > > + ibdev_err(&rdev->ibdev, "%s: ib_umem_get failed! rc = %d\n", > > + __func__, rc); > > + goto fail; > > + } > > + cq->resize_cqe = entries; > > + memcpy(&sg_info, &cq->qplib_cq.sg_info, sizeof(sg_info)); > > + orig_dpi = cq->qplib_cq.dpi; > > + > > + cq->qplib_cq.sg_info.umem = cq->resize_umem; > > + cq->qplib_cq.sg_info.pgsize = PAGE_SIZE; > > + cq->qplib_cq.sg_info.pgshft = PAGE_SHIFT; > > + cq->qplib_cq.dpi = &uctx->dpi; > > + > > + rc = bnxt_qplib_resize_cq(&rdev->qplib_res, &cq->qplib_cq, entries); > > + if (rc) { > > + ibdev_err(&rdev->ibdev, "Resize HW CQ %#x failed!", > > + cq->qplib_cq.id); > > + goto fail; > > + } > > + > > + cq->ib_cq.cqe = cq->resize_cqe; > > + > > + return 0; > > + > > +fail: > > + if (cq->resize_umem) { > > + ib_umem_release(cq->resize_umem); > > + cq->resize_umem = NULL; > > + cq->resize_cqe = 0; > > + memcpy(&cq->qplib_cq.sg_info, &sg_info, sizeof(sg_info)); > > + cq->qplib_cq.dpi = orig_dpi; > > + } > > + return rc; > > +} > > + > > static u8 __req_to_ib_wc_status(u8 qstatus) > > { > > switch (qstatus) { > > @@ -3425,6 +3525,15 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc) > > struct bnxt_re_sqp_entries *sqp_entry = NULL; > > unsigned long flags; > > > > + /* User CQ; the only processing we do is to > > + * complete any pending CQ resize operation. > > + */ > > + if (cq->umem) { > > + if (cq->resize_umem) > > + bnxt_re_resize_cq_complete(cq); > > + return 0; > > + } > > + > > spin_lock_irqsave(&cq->cq_lock, flags); > > budget = min_t(u32, num_entries, cq->max_cql); > > num_entries = budget; > > diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h > > index 9432626..31f7e34 100644 > > --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h > > +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h > > @@ -104,6 +104,8 @@ struct bnxt_re_cq { > > #define MAX_CQL_PER_POLL 1024 > > u32 max_cql; > > struct ib_umem *umem; > > + struct ib_umem *resize_umem; > > + int resize_cqe; > > }; > > > > struct bnxt_re_mr { > > @@ -191,6 +193,7 @@ int bnxt_re_post_recv(struct ib_qp *qp, const struct ib_recv_wr *recv_wr, > > const struct ib_recv_wr **bad_recv_wr); > > int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, > > struct ib_udata *udata); > > +int bnxt_re_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); > > int bnxt_re_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); > > int bnxt_re_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc); > > int bnxt_re_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); > > diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c > > index c5867e7..48bbba7 100644 > > --- a/drivers/infiniband/hw/bnxt_re/main.c > > +++ b/drivers/infiniband/hw/bnxt_re/main.c > > @@ -553,6 +553,7 @@ static const struct ib_device_ops bnxt_re_dev_ops = { > > .query_srq = bnxt_re_query_srq, > > .reg_user_mr = bnxt_re_reg_user_mr, > > .req_notify_cq = bnxt_re_req_notify_cq, > > + .resize_cq = bnxt_re_resize_cq, > > INIT_RDMA_OBJ_SIZE(ib_ah, bnxt_re_ah, ib_ah), > > INIT_RDMA_OBJ_SIZE(ib_cq, bnxt_re_cq, ib_cq), > > INIT_RDMA_OBJ_SIZE(ib_pd, bnxt_re_pd, ib_pd), > > @@ -584,6 +585,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) > > return ret; > > > > dma_set_max_seg_size(&rdev->en_dev->pdev->dev, UINT_MAX); > > + ibdev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POLL_CQ); > > return ib_register_device(ibdev, "bnxt_re%d", &rdev->en_dev->pdev->dev); > > } > > > > diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c > > index 96e581c..1d769a3 100644 > > --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c > > +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c > > @@ -2100,6 +2100,50 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) > > return rc; > > } > > > > +void bnxt_qplib_resize_cq_complete(struct bnxt_qplib_res *res, > > + struct bnxt_qplib_cq *cq) > > +{ > > + bnxt_qplib_free_hwq(res, &cq->hwq); > > + memcpy(&cq->hwq, &cq->resize_hwq, sizeof(cq->hwq)); > > +} > > + > > +int bnxt_qplib_resize_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq, > > + int new_cqes) > > +{ > > + struct bnxt_qplib_hwq_attr hwq_attr = {}; > > + struct bnxt_qplib_rcfw *rcfw = res->rcfw; > > + struct creq_resize_cq_resp resp = {}; > > + struct cmdq_resize_cq req = {}; > > + struct bnxt_qplib_pbl *pbl; > > + u32 pg_sz, lvl, new_sz; > > + u16 cmd_flags = 0; > > + int rc; > > + > > + RCFW_CMD_PREP(req, RESIZE_CQ, cmd_flags); > > + hwq_attr.sginfo = &cq->sg_info; > > + hwq_attr.res = res; > > + hwq_attr.depth = new_cqes; > > + hwq_attr.stride = sizeof(struct cq_base); > > + hwq_attr.type = HWQ_TYPE_QUEUE; > > + rc = bnxt_qplib_alloc_init_hwq(&cq->resize_hwq, &hwq_attr); > > + if (rc) > > + return rc; > > + > > + req.cq_cid = cpu_to_le32(cq->id); > > + pbl = &cq->resize_hwq.pbl[PBL_LVL_0]; > > + pg_sz = bnxt_qplib_base_pg_size(&cq->resize_hwq); > > + lvl = (cq->resize_hwq.level << CMDQ_RESIZE_CQ_LVL_SFT) & > > + CMDQ_RESIZE_CQ_LVL_MASK; > > + new_sz = (new_cqes << CMDQ_RESIZE_CQ_NEW_CQ_SIZE_SFT) & > > + CMDQ_RESIZE_CQ_NEW_CQ_SIZE_MASK; > > + req.new_cq_size_pg_size_lvl = cpu_to_le32(new_sz | pg_sz | lvl); > > + req.new_pbl = cpu_to_le64(pbl->pg_map_arr[0]); > > + > > + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, > > + (void *)&resp, NULL, 0); > > + return rc; > > return bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, > (void *)&resp, NULL, 0); > > > +} > > + > > int bnxt_qplib_destroy_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) > > { > > struct bnxt_qplib_rcfw *rcfw = res->rcfw; > > diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h > > index 0375019..d74d5ea 100644 > > --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h > > +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h > > @@ -400,6 +400,7 @@ struct bnxt_qplib_cq { > > u16 count; > > u16 period; > > struct bnxt_qplib_hwq hwq; > > + struct bnxt_qplib_hwq resize_hwq; > > u32 cnq_hw_ring_id; > > struct bnxt_qplib_nq *nq; > > bool resize_in_progress; > > @@ -532,6 +533,10 @@ void bnxt_qplib_post_recv_db(struct bnxt_qplib_qp *qp); > > int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp, > > struct bnxt_qplib_swqe *wqe); > > int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq); > > +int bnxt_qplib_resize_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq, > > + int new_cqes); > > +void bnxt_qplib_resize_cq_complete(struct bnxt_qplib_res *res, > > + struct bnxt_qplib_cq *cq); > > int bnxt_qplib_destroy_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq); > > int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe, > > int num, struct bnxt_qplib_qp **qp); > > diff --git a/include/uapi/rdma/bnxt_re-abi.h b/include/uapi/rdma/bnxt_re-abi.h > > index b1de99b..f761e75 100644 > > --- a/include/uapi/rdma/bnxt_re-abi.h > > +++ b/include/uapi/rdma/bnxt_re-abi.h > > @@ -96,6 +96,10 @@ struct bnxt_re_cq_resp { > > __u32 rsvd; > > }; > > > > +struct bnxt_re_resize_cq_req { > > + __u64 cq_va; > > +}; > > + > > struct bnxt_re_qp_req { > > __aligned_u64 qpsva; > > __aligned_u64 qprva; > >
On 2023/3/17 18:17, Selvin Xavier wrote: > On Fri, Mar 17, 2023 at 2:40 PM Yunsheng Lin <linyunsheng@huawei.com> wrote: >> >> On 2023/3/15 16:16, Selvin Xavier wrote: >>> Add resize_cq verb support for user space CQs. Resize operation for >>> kernel CQs are not supported now. >>> >>> Driver should free the current CQ only after user library polls >>> for all the completions and switch to new CQ. So after the resize_cq >>> is returned from the driver, user libray polls for existing completions >> >> libray -> library >> >>> and store it as temporary data. Once library reaps all completions in the >>> current CQ, it invokes the ibv_cmd_poll_cq to inform the driver about >>> the resize_cq completion. Adding a check for user CQs in driver's >>> poll_cq and complete the resize operation for user CQs. >>> Updating uverbs_cmd_mask with poll_cq to support this. >>> >>> User library changes are available in this pull request. >>> https://github.com/linux-rdma/rdma-core/pull/1315 >>> >>> Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com> >>> --- >>> drivers/infiniband/hw/bnxt_re/ib_verbs.c | 109 +++++++++++++++++++++++++++++++ >>> drivers/infiniband/hw/bnxt_re/ib_verbs.h | 3 + >>> drivers/infiniband/hw/bnxt_re/main.c | 2 + >>> drivers/infiniband/hw/bnxt_re/qplib_fp.c | 44 +++++++++++++ >>> drivers/infiniband/hw/bnxt_re/qplib_fp.h | 5 ++ >>> include/uapi/rdma/bnxt_re-abi.h | 4 ++ >>> 6 files changed, 167 insertions(+) >>> >>> diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c >>> index 989edc7..e86afec 100644 >>> --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c >>> +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c >>> @@ -2912,6 +2912,106 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, >>> return rc; >>> } >>> >>> +static void bnxt_re_resize_cq_complete(struct bnxt_re_cq *cq) >>> +{ >>> + struct bnxt_re_dev *rdev = cq->rdev; >>> + >>> + bnxt_qplib_resize_cq_complete(&rdev->qplib_res, &cq->qplib_cq); >>> + >>> + cq->qplib_cq.max_wqe = cq->resize_cqe; >>> + if (cq->resize_umem) { >>> + ib_umem_release(cq->umem); >>> + cq->umem = cq->resize_umem; >>> + cq->resize_umem = NULL; >>> + cq->resize_cqe = 0; >>> + } >>> +} >>> + >>> +int bnxt_re_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) >>> +{ >>> + struct bnxt_qplib_sg_info sg_info = {}; >>> + struct bnxt_qplib_dpi *orig_dpi = NULL; >>> + struct bnxt_qplib_dev_attr *dev_attr; >>> + struct bnxt_re_ucontext *uctx = NULL; >>> + struct bnxt_re_resize_cq_req req; >>> + struct bnxt_re_dev *rdev; >>> + struct bnxt_re_cq *cq; >>> + int rc, entries; >>> + >>> + cq = container_of(ibcq, struct bnxt_re_cq, ib_cq); >>> + rdev = cq->rdev; >>> + dev_attr = &rdev->dev_attr; >>> + if (!ibcq->uobject) { >>> + ibdev_err(&rdev->ibdev, "Kernel CQ Resize not supported"); >>> + return -EOPNOTSUPP; >>> + } >>> + >>> + if (cq->resize_umem) { >>> + ibdev_err(&rdev->ibdev, "Resize CQ %#x failed - Busy", >>> + cq->qplib_cq.id); >>> + return -EBUSY; >>> + } >> >> Does above cq->resize_umem checking has any conconcurrent protection >> again the bnxt_re_resize_cq_complete() called by bnxt_re_poll_cq()? >> >> bnxt_re_resize_cq() seems like a control path operation, while >> bnxt_re_poll_cq() seems like a data path operation, I am not sure >> there is any conconcurrent protection between them. >> > The previous check is to prevent simultaneous resize_cq context from > the user application. > > if you see the library implementation (PR > https://github.com/linux-rdma/rdma-core/pull/1315), entire operation > is done in the single resize_cq context from application > i.e. > bnxt_re_resize_cq > -> ibv_cmd_resize_cq > call driver bnxt_re_resize_cq and return > -> poll out the current completions and store it in a user lib list > -> issue an ibv_cmd_poll_cq. > This will invoke bnxt_re_poll_cq in the kernel driver. We free > the previous cq resources. > > So the synchronization between resize_cq and poll_cq is happening in > the user lib. We can free the old CQ only after user land sees the > final Completion on the previous CQ. So we return from the driver's > bnxt_re_resize_cq and then poll for all completions and then use > ibv_cmd_poll_cq as a hook to reach the kernel driver and free the old > CQ's resource. > > Summarize, driver's bnxt_re_poll_cq for user space CQs will be called > only from resize_cq context and no concurrent protection required in > the driver. Is it acceptable to depend on the user space code to ensure the correct order in the kernel space? Isn't that may utilized by some malicious user to crash the kernel?
On Fri, Mar 17, 2023 at 5:53 PM Yunsheng Lin <linyunsheng@huawei.com> wrote: > > On 2023/3/17 18:17, Selvin Xavier wrote: > > On Fri, Mar 17, 2023 at 2:40 PM Yunsheng Lin <linyunsheng@huawei.com> wrote: > >> > >> On 2023/3/15 16:16, Selvin Xavier wrote: > >>> Add resize_cq verb support for user space CQs. Resize operation for > >>> kernel CQs are not supported now. > >>> > >>> Driver should free the current CQ only after user library polls > >>> for all the completions and switch to new CQ. So after the resize_cq > >>> is returned from the driver, user libray polls for existing completions > >> > >> libray -> library > >> > >>> and store it as temporary data. Once library reaps all completions in the > >>> current CQ, it invokes the ibv_cmd_poll_cq to inform the driver about > >>> the resize_cq completion. Adding a check for user CQs in driver's > >>> poll_cq and complete the resize operation for user CQs. > >>> Updating uverbs_cmd_mask with poll_cq to support this. > >>> > >>> User library changes are available in this pull request. > >>> https://github.com/linux-rdma/rdma-core/pull/1315 > >>> > >>> Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com> > >>> --- > >>> drivers/infiniband/hw/bnxt_re/ib_verbs.c | 109 +++++++++++++++++++++++++++++++ > >>> drivers/infiniband/hw/bnxt_re/ib_verbs.h | 3 + > >>> drivers/infiniband/hw/bnxt_re/main.c | 2 + > >>> drivers/infiniband/hw/bnxt_re/qplib_fp.c | 44 +++++++++++++ > >>> drivers/infiniband/hw/bnxt_re/qplib_fp.h | 5 ++ > >>> include/uapi/rdma/bnxt_re-abi.h | 4 ++ > >>> 6 files changed, 167 insertions(+) > >>> > >>> diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c > >>> index 989edc7..e86afec 100644 > >>> --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c > >>> +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c > >>> @@ -2912,6 +2912,106 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, > >>> return rc; > >>> } > >>> > >>> +static void bnxt_re_resize_cq_complete(struct bnxt_re_cq *cq) > >>> +{ > >>> + struct bnxt_re_dev *rdev = cq->rdev; > >>> + > >>> + bnxt_qplib_resize_cq_complete(&rdev->qplib_res, &cq->qplib_cq); > >>> + > >>> + cq->qplib_cq.max_wqe = cq->resize_cqe; > >>> + if (cq->resize_umem) { > >>> + ib_umem_release(cq->umem); > >>> + cq->umem = cq->resize_umem; > >>> + cq->resize_umem = NULL; > >>> + cq->resize_cqe = 0; > >>> + } > >>> +} > >>> + > >>> +int bnxt_re_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) > >>> +{ > >>> + struct bnxt_qplib_sg_info sg_info = {}; > >>> + struct bnxt_qplib_dpi *orig_dpi = NULL; > >>> + struct bnxt_qplib_dev_attr *dev_attr; > >>> + struct bnxt_re_ucontext *uctx = NULL; > >>> + struct bnxt_re_resize_cq_req req; > >>> + struct bnxt_re_dev *rdev; > >>> + struct bnxt_re_cq *cq; > >>> + int rc, entries; > >>> + > >>> + cq = container_of(ibcq, struct bnxt_re_cq, ib_cq); > >>> + rdev = cq->rdev; > >>> + dev_attr = &rdev->dev_attr; > >>> + if (!ibcq->uobject) { > >>> + ibdev_err(&rdev->ibdev, "Kernel CQ Resize not supported"); > >>> + return -EOPNOTSUPP; > >>> + } > >>> + > >>> + if (cq->resize_umem) { > >>> + ibdev_err(&rdev->ibdev, "Resize CQ %#x failed - Busy", > >>> + cq->qplib_cq.id); > >>> + return -EBUSY; > >>> + } > >> > >> Does above cq->resize_umem checking has any conconcurrent protection > >> again the bnxt_re_resize_cq_complete() called by bnxt_re_poll_cq()? > >> > >> bnxt_re_resize_cq() seems like a control path operation, while > >> bnxt_re_poll_cq() seems like a data path operation, I am not sure > >> there is any conconcurrent protection between them. > >> > > The previous check is to prevent simultaneous resize_cq context from > > the user application. > > > > if you see the library implementation (PR > > https://github.com/linux-rdma/rdma-core/pull/1315), entire operation > > is done in the single resize_cq context from application > > i.e. > > bnxt_re_resize_cq > > -> ibv_cmd_resize_cq > > call driver bnxt_re_resize_cq and return > > -> poll out the current completions and store it in a user lib list > > -> issue an ibv_cmd_poll_cq. > > This will invoke bnxt_re_poll_cq in the kernel driver. We free > > the previous cq resources. > > > > So the synchronization between resize_cq and poll_cq is happening in > > the user lib. We can free the old CQ only after user land sees the > > final Completion on the previous CQ. So we return from the driver's > > bnxt_re_resize_cq and then poll for all completions and then use > > ibv_cmd_poll_cq as a hook to reach the kernel driver and free the old > > CQ's resource. > > > > Summarize, driver's bnxt_re_poll_cq for user space CQs will be called > > only from resize_cq context and no concurrent protection required in > > the driver. > > Is it acceptable to depend on the user space code to ensure the correct > order in the kernel space? > Isn't that may utilized by some malicious user to crash the kernel? > The user space code I mentioned is implemented in the provider/bnxt_re user library just like any other verb interface. Apps shall always go through this library to send request to the provider driver.
On Sun, Mar 19, 2023 at 07:37:45AM +0530, Selvin Xavier wrote: > On Fri, Mar 17, 2023 at 5:53 PM Yunsheng Lin <linyunsheng@huawei.com> wrote: > > > > On 2023/3/17 18:17, Selvin Xavier wrote: > > > On Fri, Mar 17, 2023 at 2:40 PM Yunsheng Lin <linyunsheng@huawei.com> wrote: > > >> > > >> On 2023/3/15 16:16, Selvin Xavier wrote: > > >>> Add resize_cq verb support for user space CQs. Resize operation for > > >>> kernel CQs are not supported now. > > >>> > > >>> Driver should free the current CQ only after user library polls > > >>> for all the completions and switch to new CQ. So after the resize_cq > > >>> is returned from the driver, user libray polls for existing completions > > >> > > >> libray -> library > > >> > > >>> and store it as temporary data. Once library reaps all completions in the > > >>> current CQ, it invokes the ibv_cmd_poll_cq to inform the driver about > > >>> the resize_cq completion. Adding a check for user CQs in driver's > > >>> poll_cq and complete the resize operation for user CQs. > > >>> Updating uverbs_cmd_mask with poll_cq to support this. > > >>> > > >>> User library changes are available in this pull request. > > >>> https://github.com/linux-rdma/rdma-core/pull/1315 > > >>> > > >>> Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com> > > >>> --- > > >>> drivers/infiniband/hw/bnxt_re/ib_verbs.c | 109 +++++++++++++++++++++++++++++++ > > >>> drivers/infiniband/hw/bnxt_re/ib_verbs.h | 3 + > > >>> drivers/infiniband/hw/bnxt_re/main.c | 2 + > > >>> drivers/infiniband/hw/bnxt_re/qplib_fp.c | 44 +++++++++++++ > > >>> drivers/infiniband/hw/bnxt_re/qplib_fp.h | 5 ++ > > >>> include/uapi/rdma/bnxt_re-abi.h | 4 ++ > > >>> 6 files changed, 167 insertions(+) > > >>> > > >>> diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c > > >>> index 989edc7..e86afec 100644 > > >>> --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c > > >>> +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c > > >>> @@ -2912,6 +2912,106 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, > > >>> return rc; > > >>> } > > >>> > > >>> +static void bnxt_re_resize_cq_complete(struct bnxt_re_cq *cq) > > >>> +{ > > >>> + struct bnxt_re_dev *rdev = cq->rdev; > > >>> + > > >>> + bnxt_qplib_resize_cq_complete(&rdev->qplib_res, &cq->qplib_cq); > > >>> + > > >>> + cq->qplib_cq.max_wqe = cq->resize_cqe; > > >>> + if (cq->resize_umem) { > > >>> + ib_umem_release(cq->umem); > > >>> + cq->umem = cq->resize_umem; > > >>> + cq->resize_umem = NULL; > > >>> + cq->resize_cqe = 0; > > >>> + } > > >>> +} > > >>> + > > >>> +int bnxt_re_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) > > >>> +{ > > >>> + struct bnxt_qplib_sg_info sg_info = {}; > > >>> + struct bnxt_qplib_dpi *orig_dpi = NULL; > > >>> + struct bnxt_qplib_dev_attr *dev_attr; > > >>> + struct bnxt_re_ucontext *uctx = NULL; > > >>> + struct bnxt_re_resize_cq_req req; > > >>> + struct bnxt_re_dev *rdev; > > >>> + struct bnxt_re_cq *cq; > > >>> + int rc, entries; > > >>> + > > >>> + cq = container_of(ibcq, struct bnxt_re_cq, ib_cq); > > >>> + rdev = cq->rdev; > > >>> + dev_attr = &rdev->dev_attr; > > >>> + if (!ibcq->uobject) { > > >>> + ibdev_err(&rdev->ibdev, "Kernel CQ Resize not supported"); > > >>> + return -EOPNOTSUPP; > > >>> + } > > >>> + > > >>> + if (cq->resize_umem) { > > >>> + ibdev_err(&rdev->ibdev, "Resize CQ %#x failed - Busy", > > >>> + cq->qplib_cq.id); > > >>> + return -EBUSY; > > >>> + } > > >> > > >> Does above cq->resize_umem checking has any conconcurrent protection > > >> again the bnxt_re_resize_cq_complete() called by bnxt_re_poll_cq()? > > >> > > >> bnxt_re_resize_cq() seems like a control path operation, while > > >> bnxt_re_poll_cq() seems like a data path operation, I am not sure > > >> there is any conconcurrent protection between them. > > >> > > > The previous check is to prevent simultaneous resize_cq context from > > > the user application. > > > > > > if you see the library implementation (PR > > > https://github.com/linux-rdma/rdma-core/pull/1315), entire operation > > > is done in the single resize_cq context from application > > > i.e. > > > bnxt_re_resize_cq > > > -> ibv_cmd_resize_cq > > > call driver bnxt_re_resize_cq and return > > > -> poll out the current completions and store it in a user lib list > > > -> issue an ibv_cmd_poll_cq. > > > This will invoke bnxt_re_poll_cq in the kernel driver. We free > > > the previous cq resources. > > > > > > So the synchronization between resize_cq and poll_cq is happening in > > > the user lib. We can free the old CQ only after user land sees the > > > final Completion on the previous CQ. So we return from the driver's > > > bnxt_re_resize_cq and then poll for all completions and then use > > > ibv_cmd_poll_cq as a hook to reach the kernel driver and free the old > > > CQ's resource. > > > > > > Summarize, driver's bnxt_re_poll_cq for user space CQs will be called > > > only from resize_cq context and no concurrent protection required in > > > the driver. > > > > Is it acceptable to depend on the user space code to ensure the correct > > order in the kernel space? > > Isn't that may utilized by some malicious user to crash the kernel? > > > The user space code I mentioned is implemented in the provider/bnxt_re > user library just > like any other verb interface. Apps shall always go through this > library to send request > to the provider driver. And if I use my own version of provider/bnxt_re, will kernel crash? Thanks
On Sun, Mar 19, 2023 at 5:32 PM Leon Romanovsky <leon@kernel.org> wrote: > > On Sun, Mar 19, 2023 at 07:37:45AM +0530, Selvin Xavier wrote: > > On Fri, Mar 17, 2023 at 5:53 PM Yunsheng Lin <linyunsheng@huawei.com> wrote: > > > > > > On 2023/3/17 18:17, Selvin Xavier wrote: > > > > On Fri, Mar 17, 2023 at 2:40 PM Yunsheng Lin <linyunsheng@huawei.com> wrote: > > > >> > > > >> On 2023/3/15 16:16, Selvin Xavier wrote: > > > >>> Add resize_cq verb support for user space CQs. Resize operation for > > > >>> kernel CQs are not supported now. > > > >>> > > > >>> Driver should free the current CQ only after user library polls > > > >>> for all the completions and switch to new CQ. So after the resize_cq > > > >>> is returned from the driver, user libray polls for existing completions > > > >> > > > >> libray -> library > > > >> > > > >>> and store it as temporary data. Once library reaps all completions in the > > > >>> current CQ, it invokes the ibv_cmd_poll_cq to inform the driver about > > > >>> the resize_cq completion. Adding a check for user CQs in driver's > > > >>> poll_cq and complete the resize operation for user CQs. > > > >>> Updating uverbs_cmd_mask with poll_cq to support this. > > > >>> > > > >>> User library changes are available in this pull request. > > > >>> https://github.com/linux-rdma/rdma-core/pull/1315 > > > >>> > > > >>> Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com> > > > >>> --- > > > >>> drivers/infiniband/hw/bnxt_re/ib_verbs.c | 109 +++++++++++++++++++++++++++++++ > > > >>> drivers/infiniband/hw/bnxt_re/ib_verbs.h | 3 + > > > >>> drivers/infiniband/hw/bnxt_re/main.c | 2 + > > > >>> drivers/infiniband/hw/bnxt_re/qplib_fp.c | 44 +++++++++++++ > > > >>> drivers/infiniband/hw/bnxt_re/qplib_fp.h | 5 ++ > > > >>> include/uapi/rdma/bnxt_re-abi.h | 4 ++ > > > >>> 6 files changed, 167 insertions(+) > > > >>> > > > >>> diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c > > > >>> index 989edc7..e86afec 100644 > > > >>> --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c > > > >>> +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c > > > >>> @@ -2912,6 +2912,106 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, > > > >>> return rc; > > > >>> } > > > >>> > > > >>> +static void bnxt_re_resize_cq_complete(struct bnxt_re_cq *cq) > > > >>> +{ > > > >>> + struct bnxt_re_dev *rdev = cq->rdev; > > > >>> + > > > >>> + bnxt_qplib_resize_cq_complete(&rdev->qplib_res, &cq->qplib_cq); > > > >>> + > > > >>> + cq->qplib_cq.max_wqe = cq->resize_cqe; > > > >>> + if (cq->resize_umem) { > > > >>> + ib_umem_release(cq->umem); > > > >>> + cq->umem = cq->resize_umem; > > > >>> + cq->resize_umem = NULL; > > > >>> + cq->resize_cqe = 0; > > > >>> + } > > > >>> +} > > > >>> + > > > >>> +int bnxt_re_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) > > > >>> +{ > > > >>> + struct bnxt_qplib_sg_info sg_info = {}; > > > >>> + struct bnxt_qplib_dpi *orig_dpi = NULL; > > > >>> + struct bnxt_qplib_dev_attr *dev_attr; > > > >>> + struct bnxt_re_ucontext *uctx = NULL; > > > >>> + struct bnxt_re_resize_cq_req req; > > > >>> + struct bnxt_re_dev *rdev; > > > >>> + struct bnxt_re_cq *cq; > > > >>> + int rc, entries; > > > >>> + > > > >>> + cq = container_of(ibcq, struct bnxt_re_cq, ib_cq); > > > >>> + rdev = cq->rdev; > > > >>> + dev_attr = &rdev->dev_attr; > > > >>> + if (!ibcq->uobject) { > > > >>> + ibdev_err(&rdev->ibdev, "Kernel CQ Resize not supported"); > > > >>> + return -EOPNOTSUPP; > > > >>> + } > > > >>> + > > > >>> + if (cq->resize_umem) { > > > >>> + ibdev_err(&rdev->ibdev, "Resize CQ %#x failed - Busy", > > > >>> + cq->qplib_cq.id); > > > >>> + return -EBUSY; > > > >>> + } > > > >> > > > >> Does above cq->resize_umem checking has any conconcurrent protection > > > >> again the bnxt_re_resize_cq_complete() called by bnxt_re_poll_cq()? > > > >> > > > >> bnxt_re_resize_cq() seems like a control path operation, while > > > >> bnxt_re_poll_cq() seems like a data path operation, I am not sure > > > >> there is any conconcurrent protection between them. > > > >> > > > > The previous check is to prevent simultaneous resize_cq context from > > > > the user application. > > > > > > > > if you see the library implementation (PR > > > > https://github.com/linux-rdma/rdma-core/pull/1315), entire operation > > > > is done in the single resize_cq context from application > > > > i.e. > > > > bnxt_re_resize_cq > > > > -> ibv_cmd_resize_cq > > > > call driver bnxt_re_resize_cq and return > > > > -> poll out the current completions and store it in a user lib list > > > > -> issue an ibv_cmd_poll_cq. > > > > This will invoke bnxt_re_poll_cq in the kernel driver. We free > > > > the previous cq resources. > > > > > > > > So the synchronization between resize_cq and poll_cq is happening in > > > > the user lib. We can free the old CQ only after user land sees the > > > > final Completion on the previous CQ. So we return from the driver's > > > > bnxt_re_resize_cq and then poll for all completions and then use > > > > ibv_cmd_poll_cq as a hook to reach the kernel driver and free the old > > > > CQ's resource. > > > > > > > > Summarize, driver's bnxt_re_poll_cq for user space CQs will be called > > > > only from resize_cq context and no concurrent protection required in > > > > the driver. > > > > > > Is it acceptable to depend on the user space code to ensure the correct > > > order in the kernel space? > > > Isn't that may utilized by some malicious user to crash the kernel? > > > > > The user space code I mentioned is implemented in the provider/bnxt_re > > user library just > > like any other verb interface. Apps shall always go through this > > library to send request > > to the provider driver. > > And if I use my own version of provider/bnxt_re, will kernel crash? No.. it will not. It can cause a memory leak if the provider/bnxt_re doesn't call ibv_cmd_poll_cq after the ibv_cmd_resize_cq. As per the HW design, freeing the old CQ resources can happen only after the HW generates a cut off completion entry on the CQ which is getting resized. This needs to be polled by the user library. Once the user lib sees the cut off completion, we issue an ibv_cmd_poll_cq from the library to invoke the driver and free the old CQ resource. > > Thanks
On Wed, Mar 15, 2023 at 01:16:55AM -0700, Selvin Xavier wrote: > Add resize_cq verb support for user space CQs. Resize operation for > kernel CQs are not supported now. > > Driver should free the current CQ only after user library polls > for all the completions and switch to new CQ. So after the resize_cq > is returned from the driver, user libray polls for existing completions > and store it as temporary data. Once library reaps all completions in the > current CQ, it invokes the ibv_cmd_poll_cq to inform the driver about > the resize_cq completion. Adding a check for user CQs in driver's > poll_cq and complete the resize operation for user CQs. > Updating uverbs_cmd_mask with poll_cq to support this. > > User library changes are available in this pull request. > https://github.com/linux-rdma/rdma-core/pull/1315 > > Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com> > --- > drivers/infiniband/hw/bnxt_re/ib_verbs.c | 109 +++++++++++++++++++++++++++++++ > drivers/infiniband/hw/bnxt_re/ib_verbs.h | 3 + > drivers/infiniband/hw/bnxt_re/main.c | 2 + > drivers/infiniband/hw/bnxt_re/qplib_fp.c | 44 +++++++++++++ > drivers/infiniband/hw/bnxt_re/qplib_fp.h | 5 ++ > include/uapi/rdma/bnxt_re-abi.h | 4 ++ > 6 files changed, 167 insertions(+) <...> > +struct bnxt_re_resize_cq_req { > + __u64 cq_va; This should be __aligned_u64, see commit: 26b9906612c3 ("RDMA: Change all uapi headers to use __aligned_u64 instead of __u64") Fixed locally and applied. Thanks > +}; > + > struct bnxt_re_qp_req { > __aligned_u64 qpsva; > __aligned_u64 qprva; > -- > 2.5.5 >
On Wed, 15 Mar 2023 01:16:55 -0700, Selvin Xavier wrote: > Add resize_cq verb support for user space CQs. Resize operation for > kernel CQs are not supported now. > > Driver should free the current CQ only after user library polls > for all the completions and switch to new CQ. So after the resize_cq > is returned from the driver, user libray polls for existing completions > and store it as temporary data. Once library reaps all completions in the > current CQ, it invokes the ibv_cmd_poll_cq to inform the driver about > the resize_cq completion. Adding a check for user CQs in driver's > poll_cq and complete the resize operation for user CQs. > Updating uverbs_cmd_mask with poll_cq to support this. > > [...] Applied, thanks! [1/1] RDMA/bnxt_re: Add resize_cq support https://git.kernel.org/rdma/rdma/c/d54bd5abf4d26e Best regards,
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 989edc7..e86afec 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -2912,6 +2912,106 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, return rc; } +static void bnxt_re_resize_cq_complete(struct bnxt_re_cq *cq) +{ + struct bnxt_re_dev *rdev = cq->rdev; + + bnxt_qplib_resize_cq_complete(&rdev->qplib_res, &cq->qplib_cq); + + cq->qplib_cq.max_wqe = cq->resize_cqe; + if (cq->resize_umem) { + ib_umem_release(cq->umem); + cq->umem = cq->resize_umem; + cq->resize_umem = NULL; + cq->resize_cqe = 0; + } +} + +int bnxt_re_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) +{ + struct bnxt_qplib_sg_info sg_info = {}; + struct bnxt_qplib_dpi *orig_dpi = NULL; + struct bnxt_qplib_dev_attr *dev_attr; + struct bnxt_re_ucontext *uctx = NULL; + struct bnxt_re_resize_cq_req req; + struct bnxt_re_dev *rdev; + struct bnxt_re_cq *cq; + int rc, entries; + + cq = container_of(ibcq, struct bnxt_re_cq, ib_cq); + rdev = cq->rdev; + dev_attr = &rdev->dev_attr; + if (!ibcq->uobject) { + ibdev_err(&rdev->ibdev, "Kernel CQ Resize not supported"); + return -EOPNOTSUPP; + } + + if (cq->resize_umem) { + ibdev_err(&rdev->ibdev, "Resize CQ %#x failed - Busy", + cq->qplib_cq.id); + return -EBUSY; + } + + /* Check the requested cq depth out of supported depth */ + if (cqe < 1 || cqe > dev_attr->max_cq_wqes) { + ibdev_err(&rdev->ibdev, "Resize CQ %#x failed - out of range cqe %d", + cq->qplib_cq.id, cqe); + return -EINVAL; + } + + entries = roundup_pow_of_two(cqe + 1); + if (entries > dev_attr->max_cq_wqes + 1) + entries = dev_attr->max_cq_wqes + 1; + + uctx = rdma_udata_to_drv_context(udata, struct bnxt_re_ucontext, + ib_uctx); + /* uverbs consumer */ + if (ib_copy_from_udata(&req, udata, sizeof(req))) { + rc = -EFAULT; + goto fail; + } + + cq->resize_umem = ib_umem_get(&rdev->ibdev, req.cq_va, + entries * sizeof(struct cq_base), + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(cq->resize_umem)) { + rc = PTR_ERR(cq->resize_umem); + cq->resize_umem = NULL; + ibdev_err(&rdev->ibdev, "%s: ib_umem_get failed! rc = %d\n", + __func__, rc); + goto fail; + } + cq->resize_cqe = entries; + memcpy(&sg_info, &cq->qplib_cq.sg_info, sizeof(sg_info)); + orig_dpi = cq->qplib_cq.dpi; + + cq->qplib_cq.sg_info.umem = cq->resize_umem; + cq->qplib_cq.sg_info.pgsize = PAGE_SIZE; + cq->qplib_cq.sg_info.pgshft = PAGE_SHIFT; + cq->qplib_cq.dpi = &uctx->dpi; + + rc = bnxt_qplib_resize_cq(&rdev->qplib_res, &cq->qplib_cq, entries); + if (rc) { + ibdev_err(&rdev->ibdev, "Resize HW CQ %#x failed!", + cq->qplib_cq.id); + goto fail; + } + + cq->ib_cq.cqe = cq->resize_cqe; + + return 0; + +fail: + if (cq->resize_umem) { + ib_umem_release(cq->resize_umem); + cq->resize_umem = NULL; + cq->resize_cqe = 0; + memcpy(&cq->qplib_cq.sg_info, &sg_info, sizeof(sg_info)); + cq->qplib_cq.dpi = orig_dpi; + } + return rc; +} + static u8 __req_to_ib_wc_status(u8 qstatus) { switch (qstatus) { @@ -3425,6 +3525,15 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc) struct bnxt_re_sqp_entries *sqp_entry = NULL; unsigned long flags; + /* User CQ; the only processing we do is to + * complete any pending CQ resize operation. + */ + if (cq->umem) { + if (cq->resize_umem) + bnxt_re_resize_cq_complete(cq); + return 0; + } + spin_lock_irqsave(&cq->cq_lock, flags); budget = min_t(u32, num_entries, cq->max_cql); num_entries = budget; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index 9432626..31f7e34 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -104,6 +104,8 @@ struct bnxt_re_cq { #define MAX_CQL_PER_POLL 1024 u32 max_cql; struct ib_umem *umem; + struct ib_umem *resize_umem; + int resize_cqe; }; struct bnxt_re_mr { @@ -191,6 +193,7 @@ int bnxt_re_post_recv(struct ib_qp *qp, const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr); int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct ib_udata *udata); +int bnxt_re_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); int bnxt_re_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int bnxt_re_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc); int bnxt_re_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index c5867e7..48bbba7 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -553,6 +553,7 @@ static const struct ib_device_ops bnxt_re_dev_ops = { .query_srq = bnxt_re_query_srq, .reg_user_mr = bnxt_re_reg_user_mr, .req_notify_cq = bnxt_re_req_notify_cq, + .resize_cq = bnxt_re_resize_cq, INIT_RDMA_OBJ_SIZE(ib_ah, bnxt_re_ah, ib_ah), INIT_RDMA_OBJ_SIZE(ib_cq, bnxt_re_cq, ib_cq), INIT_RDMA_OBJ_SIZE(ib_pd, bnxt_re_pd, ib_pd), @@ -584,6 +585,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) return ret; dma_set_max_seg_size(&rdev->en_dev->pdev->dev, UINT_MAX); + ibdev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POLL_CQ); return ib_register_device(ibdev, "bnxt_re%d", &rdev->en_dev->pdev->dev); } diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 96e581c..1d769a3 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -2100,6 +2100,50 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) return rc; } +void bnxt_qplib_resize_cq_complete(struct bnxt_qplib_res *res, + struct bnxt_qplib_cq *cq) +{ + bnxt_qplib_free_hwq(res, &cq->hwq); + memcpy(&cq->hwq, &cq->resize_hwq, sizeof(cq->hwq)); +} + +int bnxt_qplib_resize_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq, + int new_cqes) +{ + struct bnxt_qplib_hwq_attr hwq_attr = {}; + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct creq_resize_cq_resp resp = {}; + struct cmdq_resize_cq req = {}; + struct bnxt_qplib_pbl *pbl; + u32 pg_sz, lvl, new_sz; + u16 cmd_flags = 0; + int rc; + + RCFW_CMD_PREP(req, RESIZE_CQ, cmd_flags); + hwq_attr.sginfo = &cq->sg_info; + hwq_attr.res = res; + hwq_attr.depth = new_cqes; + hwq_attr.stride = sizeof(struct cq_base); + hwq_attr.type = HWQ_TYPE_QUEUE; + rc = bnxt_qplib_alloc_init_hwq(&cq->resize_hwq, &hwq_attr); + if (rc) + return rc; + + req.cq_cid = cpu_to_le32(cq->id); + pbl = &cq->resize_hwq.pbl[PBL_LVL_0]; + pg_sz = bnxt_qplib_base_pg_size(&cq->resize_hwq); + lvl = (cq->resize_hwq.level << CMDQ_RESIZE_CQ_LVL_SFT) & + CMDQ_RESIZE_CQ_LVL_MASK; + new_sz = (new_cqes << CMDQ_RESIZE_CQ_NEW_CQ_SIZE_SFT) & + CMDQ_RESIZE_CQ_NEW_CQ_SIZE_MASK; + req.new_cq_size_pg_size_lvl = cpu_to_le32(new_sz | pg_sz | lvl); + req.new_pbl = cpu_to_le64(pbl->pg_map_arr[0]); + + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + (void *)&resp, NULL, 0); + return rc; +} + int bnxt_qplib_destroy_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) { struct bnxt_qplib_rcfw *rcfw = res->rcfw; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index 0375019..d74d5ea 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -400,6 +400,7 @@ struct bnxt_qplib_cq { u16 count; u16 period; struct bnxt_qplib_hwq hwq; + struct bnxt_qplib_hwq resize_hwq; u32 cnq_hw_ring_id; struct bnxt_qplib_nq *nq; bool resize_in_progress; @@ -532,6 +533,10 @@ void bnxt_qplib_post_recv_db(struct bnxt_qplib_qp *qp); int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp, struct bnxt_qplib_swqe *wqe); int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq); +int bnxt_qplib_resize_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq, + int new_cqes); +void bnxt_qplib_resize_cq_complete(struct bnxt_qplib_res *res, + struct bnxt_qplib_cq *cq); int bnxt_qplib_destroy_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq); int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe, int num, struct bnxt_qplib_qp **qp); diff --git a/include/uapi/rdma/bnxt_re-abi.h b/include/uapi/rdma/bnxt_re-abi.h index b1de99b..f761e75 100644 --- a/include/uapi/rdma/bnxt_re-abi.h +++ b/include/uapi/rdma/bnxt_re-abi.h @@ -96,6 +96,10 @@ struct bnxt_re_cq_resp { __u32 rsvd; }; +struct bnxt_re_resize_cq_req { + __u64 cq_va; +}; + struct bnxt_re_qp_req { __aligned_u64 qpsva; __aligned_u64 qprva;
Add resize_cq verb support for user space CQs. Resize operation for kernel CQs are not supported now. Driver should free the current CQ only after user library polls for all the completions and switch to new CQ. So after the resize_cq is returned from the driver, user libray polls for existing completions and store it as temporary data. Once library reaps all completions in the current CQ, it invokes the ibv_cmd_poll_cq to inform the driver about the resize_cq completion. Adding a check for user CQs in driver's poll_cq and complete the resize operation for user CQs. Updating uverbs_cmd_mask with poll_cq to support this. User library changes are available in this pull request. https://github.com/linux-rdma/rdma-core/pull/1315 Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com> --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 109 +++++++++++++++++++++++++++++++ drivers/infiniband/hw/bnxt_re/ib_verbs.h | 3 + drivers/infiniband/hw/bnxt_re/main.c | 2 + drivers/infiniband/hw/bnxt_re/qplib_fp.c | 44 +++++++++++++ drivers/infiniband/hw/bnxt_re/qplib_fp.h | 5 ++ include/uapi/rdma/bnxt_re-abi.h | 4 ++ 6 files changed, 167 insertions(+)