| Message ID | 20230103060855.644516-1-yanjun.zhu@intel.com (mailing list archive) |
|---|---|
| State | Superseded |
| Delegated to | Jason Gunthorpe |
| Series | [PATCHv2,1/1] RDMA/irdma: Add support for dmabuf pin memory regions |
> Subject: [PATCHv2 1/1] RDMA/irdma: Add support for dmabuf pin memory regions
>
> From: Zhu Yanjun <yanjun.zhu@linux.dev>
>
> This is a followup to the EFA dmabuf[1]. Irdma driver currently does not
> support on-demand-paging(ODP). So it uses habanalabs as the dmabuf exporter,
> and irdma as the importer to allow for peer2peer access through libibverbs.
>
> In this commit, the function ib_umem_dmabuf_get_pinned() is used.
> This function is introduced in EFA dmabuf[1] which allows the driver to get
> a dmabuf umem which is pinned and does not require move_notify callback
> implementation. The returned umem is pinned and DMA mapped like standard
> cpu umems, and is released through ib_umem_release().
>
> [1] https://lore.kernel.org/lkml/20211007114018.GD2688930@ziepe.ca/t/
>
> Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>

Is there a corresponding user-space patch?

> ---
> V1->V2: Fix the build warning by adding a static
> ---
>  drivers/infiniband/hw/irdma/verbs.c | 158 ++++++++++++++++++++++++++++
>  1 file changed, 158 insertions(+)

[......]

> +	switch (req.reg_type) {
> +	case IRDMA_MEMREG_TYPE_QP:

[......]

> +	case IRDMA_MEMREG_TYPE_CQ:

[......]

> +		spin_lock_irqsave(&ucontext->cq_reg_mem_list_lock, flags);
> +		list_add_tail(&iwpbl->list, &ucontext->cq_reg_mem_list);
> +		iwpbl->on_list = true;
> +		spin_unlock_irqrestore(&ucontext->cq_reg_mem_list_lock, flags);
> +		break;

I don't think we want to do this for user QP, CQ pinned memory. In fact, it
will just be dead-code.

The irdma provider implementation of the ibv_reg_dmabuf_mr will just default
to IRDMA_MEMREG_TYPE_MEM type similar to how irdma_ureg_mr is implemented.

https://github.com/linux-rdma/rdma-core/blob/master/providers/irdma/uverbs.c#L128

It should simplify this function a lot.

[......]

> +error:
> +	if (palloc->level != PBLE_LEVEL_0 && iwpbl->pbl_allocated)
> +		irdma_free_pble(iwdev->rf->pble_rsrc, palloc);
> +	ib_umem_release(iwmr->region);
> +	kfree(iwmr);

Ideally we want unwind in the reverse order of allocation.

[......]
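For context on the mechanism the commit message relies on, here is a minimal sketch (not part of the patch) of the pinned dmabuf import/release pattern around ib_umem_dmabuf_get_pinned(): the core pins and DMA-maps the exporter's buffer up front, so the importer needs no move_notify() callback. The example_* helper names are placeholders, not irdma code.

```c
#include <linux/err.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_verbs.h>

static struct ib_umem *example_import_pinned_dmabuf(struct ib_device *ibdev,
                                                    u64 offset, u64 len,
                                                    int dmabuf_fd, int access)
{
        struct ib_umem_dmabuf *umem_dmabuf;

        /* Pins the exporter's pages and DMA-maps them for ibdev. */
        umem_dmabuf = ib_umem_dmabuf_get_pinned(ibdev, offset, len,
                                                dmabuf_fd, access);
        if (IS_ERR(umem_dmabuf))
                return ERR_CAST(umem_dmabuf);

        /* The embedded umem behaves like a regular pinned CPU umem from here. */
        return &umem_dmabuf->umem;
}

static void example_release_pinned_dmabuf(struct ib_umem *umem)
{
        /* Unpins and detaches the dmabuf as part of the normal release. */
        ib_umem_release(umem);
}
```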
> Subject: RE: [PATCHv2 1/1] RDMA/irdma: Add support for dmabuf pin memory regions

[......]

> > +	switch (req.reg_type) {
> > +	case IRDMA_MEMREG_TYPE_QP:

[......]

> > +	case IRDMA_MEMREG_TYPE_CQ:

[......]

> > +		spin_unlock_irqrestore(&ucontext->cq_reg_mem_list_lock, flags);
> > +		break;
>
> I don't think we want to do this for user QP, CQ pinned memory. In fact, it
> will just be dead-code.
>
> The irdma provider implementation of the ibv_reg_dmabuf_mr will just default
> to IRDMA_MEMREG_TYPE_MEM type similar to how irdma_ureg_mr is implemented.
>
> https://github.com/linux-rdma/rdma-core/blob/master/providers/irdma/uverbs.c#L128
>
> It should simplify this function a lot.

Actually I don't see a need even to use the irdma_mem_reg_req ABI struct to
pass any info from user-space like reg_type.

Shiraz
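Taken literally against the code quoted in this thread, the suggestion could look roughly like the sketch below: reg_type is hard-wired to IRDMA_MEMREG_TYPE_MEM, the irdma_mem_reg_req copy from udata disappears, and (picking up the earlier review comment) the error path unwinds in the reverse order of allocation. This is an illustration only, not the posted v3 patch; the goto label names are made up here.

```c
static struct ib_mr *irdma_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start,
                                              u64 len, u64 virt, int fd,
                                              int access,
                                              struct ib_udata *udata)
{
        struct irdma_device *iwdev = to_iwdev(pd->device);
        struct ib_umem_dmabuf *umem_dmabuf;
        struct irdma_pble_alloc *palloc;
        struct irdma_pbl *iwpbl;
        struct irdma_mr *iwmr;
        bool use_pbles;
        u32 stag;
        int err;

        if (len > iwdev->rf->sc_dev.hw_attrs.max_mr_size)
                return ERR_PTR(-EINVAL);

        /* Pinned, DMA-mapped umem; no move_notify implementation needed. */
        umem_dmabuf = ib_umem_dmabuf_get_pinned(pd->device, start, len, fd, access);
        if (IS_ERR(umem_dmabuf))
                return ERR_CAST(umem_dmabuf);

        iwmr = kzalloc(sizeof(*iwmr), GFP_KERNEL);
        if (!iwmr) {
                err = -ENOMEM;
                goto free_umem;
        }

        iwpbl = &iwmr->iwpbl;
        iwpbl->iwmr = iwmr;
        iwpbl->user_base = virt;
        iwmr->region = &umem_dmabuf->umem;
        iwmr->ibmr.pd = pd;
        iwmr->ibmr.device = pd->device;
        iwmr->ibmr.iova = virt;
        iwmr->type = IRDMA_MEMREG_TYPE_MEM;     /* always a plain MR */
        iwmr->page_size = ib_umem_find_best_pgsz(iwmr->region,
                                                 iwdev->rf->sc_dev.hw_attrs.page_size_cap,
                                                 virt);
        if (!iwmr->page_size) {
                err = -EOPNOTSUPP;
                goto free_iwmr;
        }
        iwmr->len = iwmr->region->length;
        iwmr->page_cnt = ib_umem_num_dma_blocks(iwmr->region, iwmr->page_size);
        palloc = &iwpbl->pble_alloc;

        use_pbles = (iwmr->page_cnt != 1);
        err = irdma_setup_pbles(iwdev->rf, iwmr, use_pbles, false);
        if (err)
                goto free_iwmr;

        if (use_pbles) {
                /* Same fallback as the quoted IRDMA_MEMREG_TYPE_MEM path. */
                err = irdma_check_mr_contiguous(palloc, iwmr->page_size);
                if (err) {
                        irdma_free_pble(iwdev->rf->pble_rsrc, palloc);
                        iwpbl->pbl_allocated = false;
                }
        }

        stag = irdma_create_stag(iwdev);
        if (!stag) {
                err = -ENOMEM;
                goto free_pble;
        }

        iwmr->stag = stag;
        iwmr->ibmr.rkey = stag;
        iwmr->ibmr.lkey = stag;
        err = irdma_hwreg_mr(iwdev, iwmr, access);
        if (err)
                goto free_stag;

        return &iwmr->ibmr;

free_stag:
        irdma_free_stag(iwdev, stag);
free_pble:
        if (palloc->level != PBLE_LEVEL_0 && iwpbl->pbl_allocated)
                irdma_free_pble(iwdev->rf->pble_rsrc, palloc);
free_iwmr:
        kfree(iwmr);
free_umem:
        ib_umem_release(&umem_dmabuf->umem);
        return ERR_PTR(err);
}
```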
On 2023/1/4 7:35, Saleem, Shiraz wrote:
>> Subject: [PATCHv2 1/1] RDMA/irdma: Add support for dmabuf pin memory regions
>>
>> From: Zhu Yanjun <yanjun.zhu@linux.dev>

[......]

>> Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
>
> Is there a corresponding user-space patch?

Yes. I will send it out very soon.

Zhu Yanjun

[......]
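For the user-space side referenced above, the application-facing call is the existing libibverbs verb ibv_reg_dmabuf_mr(); below is a minimal, hedged usage sketch. The dmabuf fd is assumed to come from an exporter such as habanalabs, and register_dmabuf() is a made-up helper, not part of the pending rdma-core patch.

```c
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <infiniband/verbs.h>

/* Register a dmabuf-backed buffer as an MR on an already-allocated PD. */
static struct ibv_mr *register_dmabuf(struct ibv_pd *pd, int dmabuf_fd,
                                      size_t len)
{
        /* offset 0, iova 0: expose the whole buffer starting at its origin */
        struct ibv_mr *mr = ibv_reg_dmabuf_mr(pd, 0, len, 0, dmabuf_fd,
                                              IBV_ACCESS_LOCAL_WRITE |
                                              IBV_ACCESS_REMOTE_READ |
                                              IBV_ACCESS_REMOTE_WRITE);

        if (!mr)
                fprintf(stderr, "ibv_reg_dmabuf_mr failed: %s\n",
                        strerror(errno));
        return mr;
}
```

Once registered, mr->lkey and mr->rkey are used exactly as with a regular ibv_reg_mr() region, and the MR is torn down with ibv_dereg_mr().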
On 2023/1/4 7:35, Saleem, Shiraz wrote:
>> Subject: [PATCHv2 1/1] RDMA/irdma: Add support for dmabuf pin memory regions

[......]

>> +		spin_lock_irqsave(&ucontext->cq_reg_mem_list_lock, flags);
>> +		list_add_tail(&iwpbl->list, &ucontext->cq_reg_mem_list);
>> +		iwpbl->on_list = true;
>> +		spin_unlock_irqrestore(&ucontext->cq_reg_mem_list_lock, flags);
>> +		break;
>
> I don't think we want to do this for user QP, CQ pinned memory. In fact, it
> will just be dead-code.
>
> The irdma provider implementation of the ibv_reg_dmabuf_mr will just default
> to IRDMA_MEMREG_TYPE_MEM type similar to how irdma_ureg_mr is implemented.
>
> https://github.com/linux-rdma/rdma-core/blob/master/providers/irdma/uverbs.c#L128
>
> It should simplify this function a lot.

Got it. Thanks a lot for your advice. I will send out the latest commit very soon.

[......]

>> +error:
>> +	if (palloc->level != PBLE_LEVEL_0 && iwpbl->pbl_allocated)
>> +		irdma_free_pble(iwdev->rf->pble_rsrc, palloc);
>> +	ib_umem_release(iwmr->region);
>> +	kfree(iwmr);
>
> Ideally we want unwind in the reverse order of allocation.

Good catch. Thanks

Zhu Yanjun

[......]
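To make the "reverse order" point concrete, one way the quoted error block could be reordered is sketched below: the resources are released opposite to the order they were obtained (umem first, iwmr second, pbles last). This is only a fragment of the quoted function, shown for illustration, not the posted fix.

```c
error:
        /* pbles were set up last, so tear them down first */
        if (palloc->level != PBLE_LEVEL_0 && iwpbl->pbl_allocated)
                irdma_free_pble(iwdev->rf->pble_rsrc, palloc);
        /* the iwmr was allocated after the umem */
        kfree(iwmr);
        /* the pinned dmabuf umem was obtained first, so release it last */
        ib_umem_release(&umem_dmabuf->umem);

        return ERR_PTR(err);
```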
On 2023/1/4 8:21, Saleem, Shiraz wrote:
>> Subject: RE: [PATCHv2 1/1] RDMA/irdma: Add support for dmabuf pin memory regions
>
> [......]
>
>> I don't think we want to do this for user QP, CQ pinned memory. In fact, it
>> will just be dead-code.
>>
>> The irdma provider implementation of the ibv_reg_dmabuf_mr will just default
>> to IRDMA_MEMREG_TYPE_MEM type similar to how irdma_ureg_mr is implemented.
>>
>> https://github.com/linux-rdma/rdma-core/blob/master/providers/irdma/uverbs.c#L128
>>
>> It should simplify this function a lot.
>
> Actually I don't see a need even to use the irdma_mem_reg_req ABI struct to
> pass any info from user-space like reg_type.

In the latest commit, the irdma_mem_reg_req ABI struct is removed. I will
send it out very soon.

Zhu Yanjun

> Shiraz
```diff
diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c
index f6973ea55eda..1572baa93856 100644
--- a/drivers/infiniband/hw/irdma/verbs.c
+++ b/drivers/infiniband/hw/irdma/verbs.c
@@ -2912,6 +2912,163 @@ static struct ib_mr *irdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
 	return ERR_PTR(err);
 }
 
+static struct ib_mr *irdma_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start,
+					      u64 len, u64 virt,
+					      int fd, int access,
+					      struct ib_udata *udata)
+{
+	struct irdma_device *iwdev = to_iwdev(pd->device);
+	struct irdma_ucontext *ucontext;
+	struct irdma_pble_alloc *palloc;
+	struct irdma_pbl *iwpbl;
+	struct irdma_mr *iwmr;
+	struct irdma_mem_reg_req req;
+	u32 total, stag = 0;
+	u8 shadow_pgcnt = 1;
+	bool use_pbles = false;
+	unsigned long flags;
+	int err = -EINVAL;
+	struct ib_umem_dmabuf *umem_dmabuf;
+
+	if (len > iwdev->rf->sc_dev.hw_attrs.max_mr_size)
+		return ERR_PTR(-EINVAL);
+
+	if (udata->inlen < IRDMA_MEM_REG_MIN_REQ_LEN)
+		return ERR_PTR(-EINVAL);
+
+	umem_dmabuf = ib_umem_dmabuf_get_pinned(pd->device, start, len, fd,
+						access);
+	if (IS_ERR(umem_dmabuf)) {
+		err = PTR_ERR(umem_dmabuf);
+		ibdev_dbg(&iwdev->ibdev, "Failed to get dmabuf umem[%d]\n", err);
+		return ERR_PTR(err);
+	}
+
+	if (ib_copy_from_udata(&req, udata, min(sizeof(req), udata->inlen))) {
+		ib_umem_release(&umem_dmabuf->umem);
+		return ERR_PTR(-EFAULT);
+	}
+
+	iwmr = kzalloc(sizeof(*iwmr), GFP_KERNEL);
+	if (!iwmr) {
+		ib_umem_release(&umem_dmabuf->umem);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	iwpbl = &iwmr->iwpbl;
+	iwpbl->iwmr = iwmr;
+	iwmr->region = &umem_dmabuf->umem;
+	iwmr->ibmr.pd = pd;
+	iwmr->ibmr.device = pd->device;
+	iwmr->ibmr.iova = virt;
+	iwmr->page_size = PAGE_SIZE;
+
+	if (req.reg_type == IRDMA_MEMREG_TYPE_MEM) {
+		iwmr->page_size = ib_umem_find_best_pgsz(iwmr->region,
+							 iwdev->rf->sc_dev.hw_attrs.page_size_cap,
+							 virt);
+		if (unlikely(!iwmr->page_size)) {
+			kfree(iwmr);
+			ib_umem_release(iwmr->region);
+			return ERR_PTR(-EOPNOTSUPP);
+		}
+	}
+	iwmr->len = iwmr->region->length;
+	iwpbl->user_base = virt;
+	palloc = &iwpbl->pble_alloc;
+	iwmr->type = req.reg_type;
+	iwmr->page_cnt = ib_umem_num_dma_blocks(iwmr->region, iwmr->page_size);
+
+	switch (req.reg_type) {
+	case IRDMA_MEMREG_TYPE_QP:
+		total = req.sq_pages + req.rq_pages + shadow_pgcnt;
+		if (total > iwmr->page_cnt) {
+			err = -EINVAL;
+			goto error;
+		}
+		total = req.sq_pages + req.rq_pages;
+		use_pbles = (total > 2);
+		err = irdma_handle_q_mem(iwdev, &req, iwpbl, use_pbles);
+		if (err)
+			goto error;
+
+		ucontext = rdma_udata_to_drv_context(udata, struct irdma_ucontext,
+						     ibucontext);
+		spin_lock_irqsave(&ucontext->qp_reg_mem_list_lock, flags);
+		list_add_tail(&iwpbl->list, &ucontext->qp_reg_mem_list);
+		iwpbl->on_list = true;
+		spin_unlock_irqrestore(&ucontext->qp_reg_mem_list_lock, flags);
+		break;
+	case IRDMA_MEMREG_TYPE_CQ:
+		if (iwdev->rf->sc_dev.hw_attrs.uk_attrs.feature_flags & IRDMA_FEATURE_CQ_RESIZE)
+			shadow_pgcnt = 0;
+		total = req.cq_pages + shadow_pgcnt;
+		if (total > iwmr->page_cnt) {
+			err = -EINVAL;
+			goto error;
+		}
+
+		use_pbles = (req.cq_pages > 1);
+		err = irdma_handle_q_mem(iwdev, &req, iwpbl, use_pbles);
+		if (err)
+			goto error;
+
+		ucontext = rdma_udata_to_drv_context(udata, struct irdma_ucontext,
+						     ibucontext);
+		spin_lock_irqsave(&ucontext->cq_reg_mem_list_lock, flags);
+		list_add_tail(&iwpbl->list, &ucontext->cq_reg_mem_list);
+		iwpbl->on_list = true;
+		spin_unlock_irqrestore(&ucontext->cq_reg_mem_list_lock, flags);
+		break;
+	case IRDMA_MEMREG_TYPE_MEM:
+		use_pbles = (iwmr->page_cnt != 1);
+
+		err = irdma_setup_pbles(iwdev->rf, iwmr, use_pbles, false);
+		if (err)
+			goto error;
+
+		if (use_pbles) {
+			err = irdma_check_mr_contiguous(palloc,
+							iwmr->page_size);
+			if (err) {
+				irdma_free_pble(iwdev->rf->pble_rsrc, palloc);
+				iwpbl->pbl_allocated = false;
+			}
+		}
+
+		stag = irdma_create_stag(iwdev);
+		if (!stag) {
+			err = -ENOMEM;
+			goto error;
+		}
+
+		iwmr->stag = stag;
+		iwmr->ibmr.rkey = stag;
+		iwmr->ibmr.lkey = stag;
+		err = irdma_hwreg_mr(iwdev, iwmr, access);
+		if (err) {
+			irdma_free_stag(iwdev, stag);
+			goto error;
+		}
+
+		break;
+	default:
+		goto error;
+	}
+
+	iwmr->type = req.reg_type;
+
+	return &iwmr->ibmr;
+
+error:
+	if (palloc->level != PBLE_LEVEL_0 && iwpbl->pbl_allocated)
+		irdma_free_pble(iwdev->rf->pble_rsrc, palloc);
+	ib_umem_release(iwmr->region);
+	kfree(iwmr);
+
+	return ERR_PTR(err);
+}
+
 /**
  * irdma_reg_phys_mr - register kernel physical memory
  * @pd: ibpd pointer
@@ -4418,6 +4575,7 @@ static const struct ib_device_ops irdma_dev_ops = {
 	.query_port = irdma_query_port,
 	.query_qp = irdma_query_qp,
 	.reg_user_mr = irdma_reg_user_mr,
+	.reg_user_mr_dmabuf = irdma_reg_user_mr_dmabuf,
 	.req_notify_cq = irdma_req_notify_cq,
 	.resize_cq = irdma_resize_cq,
 	INIT_RDMA_OBJ_SIZE(ib_pd, irdma_pd, ibpd),
```