| Message ID | 20211007104301.76693-3-galpress@amazon.com (mailing list archive) |
|------------|--------------------------------------------------------------------|
| State      | Superseded                                                         |
| Series     | EFA dmabuf memory regions                                          |
On Thu, Oct 07, 2021 at 01:43:00PM +0300, Gal Pressman wrote:
> @@ -1491,26 +1493,29 @@ static int efa_create_pbl(struct efa_dev *dev,
>  	return 0;
>  }
>  
> -struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
> -			 u64 virt_addr, int access_flags,
> -			 struct ib_udata *udata)
> +static void efa_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
> +{
> +	WARN_ON_ONCE(1,
> +		     "Invalidate callback should not be called when memory is pinned\n");
> +}
> +
> +static struct dma_buf_attach_ops efa_dmabuf_attach_ops = {
> +	.allow_peer2peer = true,
> +	.move_notify = efa_dmabuf_invalidate_cb,
> +};

Shouldn't move_notify really just be left as NULL? I mean fixing
whatever is preventing that?

> +struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start,
> +				     u64 length, u64 virt_addr,
> +				     int fd, int access_flags,
> +				     struct ib_udata *udata)
> +{
> +	struct efa_dev *dev = to_edev(ibpd->device);
> +	struct ib_umem_dmabuf *umem_dmabuf;
> +	struct efa_mr *mr;
> +	int err;
> +
> +	mr = efa_alloc_mr(ibpd, access_flags, udata);
> +	if (IS_ERR(mr)) {
> +		err = PTR_ERR(mr);
> +		goto err_out;
> +	}
> +
> +	umem_dmabuf = ib_umem_dmabuf_get(ibpd->device, start, length, fd,
> +					 access_flags, &efa_dmabuf_attach_ops);
> +	if (IS_ERR(umem_dmabuf)) {
> +		ibdev_dbg(&dev->ibdev, "Failed to get dmabuf[%d]\n", err);
> +		err = PTR_ERR(umem_dmabuf);
> +		goto err_free;
> +	}
> +
> +	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
> +	err = dma_buf_pin(umem_dmabuf->attach);
> +	if (err) {
> +		ibdev_dbg(&dev->ibdev, "Failed to pin dmabuf memory\n");
> +		goto err_release;
> +	}
> +
> +	err = ib_umem_dmabuf_map_pages(umem_dmabuf);
> +	if (err) {
> +		ibdev_dbg(&dev->ibdev, "Failed to map dmabuf pages\n");
> +		goto err_unpin;
> +	}
> +	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);

If it is really this simple the core code should have this logic,
'ib_umem_dmabuf_get_pinned()' or something

Jason
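The obstacle behind Jason's question sits in the dma-buf core: importer ops are all-or-nothing, so an attachment is only treated as dynamic when ops are supplied, ops without a move_notify are rejected outright, and allow_peer2peer lives in those same ops. A paraphrased sketch of the relevant check in dma_buf_dynamic_attach() (drivers/dma-buf/dma-buf.c of this era; the exact wording may differ):

```c
/*
 * Paraphrased sketch of the dma_buf_dynamic_attach() sanity check being
 * worked around here: ops with a NULL move_notify are refused, so an
 * importer that wants .allow_peer2peer = true must supply some callback,
 * even one that can never legitimately fire for pinned memory.
 */
if (WARN_ON(importer_ops && !importer_ops->move_notify))
	return ERR_PTR(-EINVAL);
```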
On 07/10/2021 14:40, Jason Gunthorpe wrote:
> On Thu, Oct 07, 2021 at 01:43:00PM +0300, Gal Pressman wrote:
> 
>> @@ -1491,26 +1493,29 @@ static int efa_create_pbl(struct efa_dev *dev,
>>  	return 0;
>>  }
>>  
>> -struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
>> -			 u64 virt_addr, int access_flags,
>> -			 struct ib_udata *udata)
>> +static void efa_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
>> +{
>> +	WARN_ON_ONCE(1,
>> +		     "Invalidate callback should not be called when memory is pinned\n");
>> +}
>> +
>> +static struct dma_buf_attach_ops efa_dmabuf_attach_ops = {
>> +	.allow_peer2peer = true,
>> +	.move_notify = efa_dmabuf_invalidate_cb,
>> +};
> 
> Shouldn't move_notify really just be left as NULL? I mean fixing
> whatever is preventing that?

That's what I had in the previous RFC and I think Christian didn't
really like it.

>> +struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start,
>> +				     u64 length, u64 virt_addr,
>> +				     int fd, int access_flags,
>> +				     struct ib_udata *udata)
>> +{
>> +	struct efa_dev *dev = to_edev(ibpd->device);
>> +	struct ib_umem_dmabuf *umem_dmabuf;
>> +	struct efa_mr *mr;
>> +	int err;
>> +
>> +	mr = efa_alloc_mr(ibpd, access_flags, udata);
>> +	if (IS_ERR(mr)) {
>> +		err = PTR_ERR(mr);
>> +		goto err_out;
>> +	}
>> +
>> +	umem_dmabuf = ib_umem_dmabuf_get(ibpd->device, start, length, fd,
>> +					 access_flags, &efa_dmabuf_attach_ops);
>> +	if (IS_ERR(umem_dmabuf)) {
>> +		ibdev_dbg(&dev->ibdev, "Failed to get dmabuf[%d]\n", err);
>> +		err = PTR_ERR(umem_dmabuf);
>> +		goto err_free;
>> +	}
>> +
>> +	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
>> +	err = dma_buf_pin(umem_dmabuf->attach);
>> +	if (err) {
>> +		ibdev_dbg(&dev->ibdev, "Failed to pin dmabuf memory\n");
>> +		goto err_release;
>> +	}
>> +
>> +	err = ib_umem_dmabuf_map_pages(umem_dmabuf);
>> +	if (err) {
>> +		ibdev_dbg(&dev->ibdev, "Failed to map dmabuf pages\n");
>> +		goto err_unpin;
>> +	}
>> +	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
> 
> If it is really this simple the core code should have this logic,
> 'ib_umem_dmabuf_get_pinned()' or something

Should get_pinned do just get + dma_buf_pin, or should it do
ib_umem_dmabuf_map_pages as well?
On Sun, Oct 10, 2021 at 09:55:49AM +0300, Gal Pressman wrote:
> On 07/10/2021 14:40, Jason Gunthorpe wrote:
> > On Thu, Oct 07, 2021 at 01:43:00PM +0300, Gal Pressman wrote:
> > 
> >> @@ -1491,26 +1493,29 @@ static int efa_create_pbl(struct efa_dev *dev,
> >>  	return 0;
> >>  }
> >>  
> >> -struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
> >> -			 u64 virt_addr, int access_flags,
> >> -			 struct ib_udata *udata)
> >> +static void efa_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
> >> +{
> >> +	WARN_ON_ONCE(1,
> >> +		     "Invalidate callback should not be called when memory is pinned\n");
> >> +}
> >> +
> >> +static struct dma_buf_attach_ops efa_dmabuf_attach_ops = {
> >> +	.allow_peer2peer = true,
> >> +	.move_notify = efa_dmabuf_invalidate_cb,
> >> +};
> > 
> > Shouldn't move_notify really just be left as NULL? I mean fixing
> > whatever is preventing that?
> 
> That's what I had in the previous RFC and I think Christian didn't
> really like it.

Well, having drivers define a dummy function that only fails looks
a lot worse to me. If not null then it should be a general
'dmabuf_unsupported_move_notify' shared function

> >> +	err = ib_umem_dmabuf_map_pages(umem_dmabuf);
> >> +	if (err) {
> >> +		ibdev_dbg(&dev->ibdev, "Failed to map dmabuf pages\n");
> >> +		goto err_unpin;
> >> +	}
> >> +	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
> > 
> > If it is really this simple the core code should have this logic,
> > 'ib_umem_dmabuf_get_pinned()' or something
> 
> Should get_pinned do just get + dma_buf_pin, or should it do
> ib_umem_dmabuf_map_pages as well?

Yes the map_pages too, a umem is supposed to be dma mapped after
creation.

Jason
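A minimal sketch of the shared callback Jason is asking for, placed in the core umem code so pinned-only importers stop defining per-driver dummies. The names and the ratelimited-warning choice are illustrative assumptions, not code from this thread:

```c
/*
 * Sketch of a shared "unsupported" move_notify for pinned dmabuf umems.
 * It assumes ib_umem_dmabuf_get() stores the umem as importer_priv on
 * the attachment; all identifiers are illustrative.
 */
static void
ib_umem_dmabuf_unsupported_move_notify(struct dma_buf_attachment *attach)
{
	struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;

	ibdev_warn_ratelimited(umem_dmabuf->umem.ibdev,
			       "Invalidate callback should not be called when memory is pinned\n");
}

/* One core ops instance that every pinned importer can point at. */
static struct dma_buf_attach_ops ib_umem_dmabuf_attach_pinned_ops = {
	.allow_peer2peer = true,
	.move_notify = ib_umem_dmabuf_unsupported_move_notify,
};
```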
On 12/10/2021 2:28, Jason Gunthorpe wrote:
> On Sun, Oct 10, 2021 at 09:55:49AM +0300, Gal Pressman wrote:
>> On 07/10/2021 14:40, Jason Gunthorpe wrote:
>>> On Thu, Oct 07, 2021 at 01:43:00PM +0300, Gal Pressman wrote:
>>>
>>>> @@ -1491,26 +1493,29 @@ static int efa_create_pbl(struct efa_dev *dev,
>>>>  	return 0;
>>>>  }
>>>>  
>>>> -struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
>>>> -			 u64 virt_addr, int access_flags,
>>>> -			 struct ib_udata *udata)
>>>> +static void efa_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
>>>> +{
>>>> +	WARN_ON_ONCE(1,
>>>> +		     "Invalidate callback should not be called when memory is pinned\n");
>>>> +}
>>>> +
>>>> +static struct dma_buf_attach_ops efa_dmabuf_attach_ops = {
>>>> +	.allow_peer2peer = true,
>>>> +	.move_notify = efa_dmabuf_invalidate_cb,
>>>> +};
>>>
>>> Shouldn't move_notify really just be left as NULL? I mean fixing
>>> whatever is preventing that?
>>
>> That's what I had in the previous RFC and I think Christian didn't
>> really like it.
> 
> Well, having drivers define a dummy function that only fails looks
> a lot worse to me. If not null then it should be a general
> 'dmabuf_unsupported_move_notify' shared function

Will do.

>>>> +	err = ib_umem_dmabuf_map_pages(umem_dmabuf);
>>>> +	if (err) {
>>>> +		ibdev_dbg(&dev->ibdev, "Failed to map dmabuf pages\n");
>>>> +		goto err_unpin;
>>>> +	}
>>>> +	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
>>>
>>> If it is really this simple the core code should have this logic,
>>> 'ib_umem_dmabuf_get_pinned()' or something
>>
>> Should get_pinned do just get + dma_buf_pin, or should it do
>> ib_umem_dmabuf_map_pages as well?
> 
> Yes the map_pages too, a umem is supposed to be dma mapped after
> creation.

Will do, thanks Jason.
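Taken together, the agreement above amounts to a core helper along these lines: get the dmabuf umem, then pin and DMA-map it under the reservation lock so the umem comes back fully mapped, unwinding in reverse order on failure. A sketch assuming the pinned attach ops from the previous note; the version that eventually lands may differ in detail:

```c
/*
 * Sketch of the core helper being converged on: allocate the dmabuf
 * umem, pin it, and DMA-map it under the reservation lock, so drivers
 * get back a umem that is ready to register. Illustrative only.
 */
struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device,
						 unsigned long offset,
						 size_t size, int fd,
						 int access)
{
	struct ib_umem_dmabuf *umem_dmabuf;
	int err;

	umem_dmabuf = ib_umem_dmabuf_get(device, offset, size, fd, access,
					 &ib_umem_dmabuf_attach_pinned_ops);
	if (IS_ERR(umem_dmabuf))
		return umem_dmabuf;

	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
	err = dma_buf_pin(umem_dmabuf->attach);
	if (err)
		goto err_release;

	err = ib_umem_dmabuf_map_pages(umem_dmabuf);
	if (err)
		goto err_unpin;
	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);

	return umem_dmabuf;

err_unpin:
	dma_buf_unpin(umem_dmabuf->attach);
err_release:
	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
	ib_umem_release(&umem_dmabuf->umem);
	return ERR_PTR(err);
}
```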
```diff
diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h
index 2b8ca099b381..407d7c4baa16 100644
--- a/drivers/infiniband/hw/efa/efa.h
+++ b/drivers/infiniband/hw/efa/efa.h
@@ -141,6 +141,10 @@ int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
 			 u64 virt_addr, int access_flags,
 			 struct ib_udata *udata);
+struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start,
+				     u64 length, u64 virt_addr,
+				     int fd, int access_flags,
+				     struct ib_udata *udata);
 int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
 int efa_get_port_immutable(struct ib_device *ibdev, u32 port_num,
 			   struct ib_port_immutable *immutable);
diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c
index 203e6ddcacbc..72cd7d952a07 100644
--- a/drivers/infiniband/hw/efa/efa_main.c
+++ b/drivers/infiniband/hw/efa/efa_main.c
@@ -267,6 +267,7 @@ static const struct ib_device_ops efa_dev_ops = {
 	.query_port = efa_query_port,
 	.query_qp = efa_query_qp,
 	.reg_user_mr = efa_reg_mr,
+	.reg_user_mr_dmabuf = efa_reg_user_mr_dmabuf,
 
 	INIT_RDMA_OBJ_SIZE(ib_ah, efa_ah, ibah),
 	INIT_RDMA_OBJ_SIZE(ib_cq, efa_cq, ibcq),
diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
index be6d3ff0f1be..ca907853a84f 100644
--- a/drivers/infiniband/hw/efa/efa_verbs.c
+++ b/drivers/infiniband/hw/efa/efa_verbs.c
@@ -3,6 +3,8 @@
  * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
+#include <linux/dma-buf.h>
+#include <linux/dma-resv.h>
 #include <linux/vmalloc.h>
 #include <linux/log2.h>
 
@@ -1491,26 +1493,29 @@ static int efa_create_pbl(struct efa_dev *dev,
 	return 0;
 }
 
-struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
-			 u64 virt_addr, int access_flags,
-			 struct ib_udata *udata)
+static void efa_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
+{
+	WARN_ON_ONCE(1,
+		     "Invalidate callback should not be called when memory is pinned\n");
+}
+
+static struct dma_buf_attach_ops efa_dmabuf_attach_ops = {
+	.allow_peer2peer = true,
+	.move_notify = efa_dmabuf_invalidate_cb,
+};
+
+static struct efa_mr *efa_alloc_mr(struct ib_pd *ibpd, int access_flags,
+				   struct ib_udata *udata)
 {
 	struct efa_dev *dev = to_edev(ibpd->device);
-	struct efa_com_reg_mr_params params = {};
-	struct efa_com_reg_mr_result result = {};
-	struct pbl_context pbl;
 	int supp_access_flags;
-	unsigned int pg_sz;
 	struct efa_mr *mr;
-	int inline_size;
-	int err;
 
 	if (udata && udata->inlen &&
 	    !ib_is_udata_cleared(udata, 0, sizeof(udata->inlen))) {
 		ibdev_dbg(&dev->ibdev,
 			  "Incompatible ABI params, udata not cleared\n");
-		err = -EINVAL;
-		goto err_out;
+		return ERR_PTR(-EINVAL);
 	}
 
 	supp_access_flags =
@@ -1522,23 +1527,26 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
 		ibdev_dbg(&dev->ibdev,
 			  "Unsupported access flags[%#x], supported[%#x]\n",
 			  access_flags, supp_access_flags);
-		err = -EOPNOTSUPP;
-		goto err_out;
+		return ERR_PTR(-EOPNOTSUPP);
 	}
 
 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
-	if (!mr) {
-		err = -ENOMEM;
-		goto err_out;
-	}
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
 
-	mr->umem = ib_umem_get(ibpd->device, start, length, access_flags);
-	if (IS_ERR(mr->umem)) {
-		err = PTR_ERR(mr->umem);
-		ibdev_dbg(&dev->ibdev,
-			  "Failed to pin and map user space memory[%d]\n", err);
-		goto err_free;
-	}
+	return mr;
+}
+
+static int efa_register_mr(struct ib_pd *ibpd, struct efa_mr *mr, u64 start,
+			   u64 length, u64 virt_addr, int access_flags)
+{
+	struct efa_dev *dev = to_edev(ibpd->device);
+	struct efa_com_reg_mr_params params = {};
+	struct efa_com_reg_mr_result result = {};
+	struct pbl_context pbl;
+	unsigned int pg_sz;
+	int inline_size;
+	int err;
 
 	params.pd = to_epd(ibpd)->pdn;
 	params.iova = virt_addr;
@@ -1549,10 +1557,9 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
 				       dev->dev_attr.page_size_cap,
 				       virt_addr);
 	if (!pg_sz) {
-		err = -EOPNOTSUPP;
 		ibdev_dbg(&dev->ibdev, "Failed to find a suitable page size in page_size_cap %#llx\n",
 			  dev->dev_attr.page_size_cap);
-		goto err_unmap;
+		return -EOPNOTSUPP;
 	}
 
 	params.page_shift = order_base_2(pg_sz);
@@ -1566,21 +1573,21 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
 	if (params.page_num <= inline_size) {
 		err = efa_create_inline_pbl(dev, mr, &params);
 		if (err)
-			goto err_unmap;
+			return err;
 
 		err = efa_com_register_mr(&dev->edev, &params, &result);
 		if (err)
-			goto err_unmap;
+			return err;
 	} else {
 		err = efa_create_pbl(dev, &pbl, mr, &params);
 		if (err)
-			goto err_unmap;
+			return err;
 
 		err = efa_com_register_mr(&dev->edev, &params, &result);
 		pbl_destroy(dev, &pbl);
 		if (err)
-			goto err_unmap;
+			return err;
 	}
 
 	mr->ibmr.lkey = result.l_key;
@@ -1588,9 +1595,98 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
 	mr->ibmr.length = length;
 	ibdev_dbg(&dev->ibdev, "Registered mr[%d]\n", mr->ibmr.lkey);
 
+	return 0;
+}
+
+struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start,
+				     u64 length, u64 virt_addr,
+				     int fd, int access_flags,
+				     struct ib_udata *udata)
+{
+	struct efa_dev *dev = to_edev(ibpd->device);
+	struct ib_umem_dmabuf *umem_dmabuf;
+	struct efa_mr *mr;
+	int err;
+
+	mr = efa_alloc_mr(ibpd, access_flags, udata);
+	if (IS_ERR(mr)) {
+		err = PTR_ERR(mr);
+		goto err_out;
+	}
+
+	umem_dmabuf = ib_umem_dmabuf_get(ibpd->device, start, length, fd,
+					 access_flags, &efa_dmabuf_attach_ops);
+	if (IS_ERR(umem_dmabuf)) {
+		ibdev_dbg(&dev->ibdev, "Failed to get dmabuf[%d]\n", err);
+		err = PTR_ERR(umem_dmabuf);
+		goto err_free;
+	}
+
+	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
+	err = dma_buf_pin(umem_dmabuf->attach);
+	if (err) {
+		ibdev_dbg(&dev->ibdev, "Failed to pin dmabuf memory\n");
+		goto err_release;
+	}
+
+	err = ib_umem_dmabuf_map_pages(umem_dmabuf);
+	if (err) {
+		ibdev_dbg(&dev->ibdev, "Failed to map dmabuf pages\n");
+		goto err_unpin;
+	}
+	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+
+	mr->umem = &umem_dmabuf->umem;
+	err = efa_register_mr(ibpd, mr, start, length, virt_addr, access_flags);
+	if (err)
+		goto err_unmap;
+
 	return &mr->ibmr;
 
 err_unmap:
+	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
+	ib_umem_dmabuf_unmap_pages(umem_dmabuf);
+err_unpin:
+	dma_buf_unpin(umem_dmabuf->attach);
+err_release:
+	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+	ib_umem_release(mr->umem);
+err_free:
+	kfree(mr);
+err_out:
+	atomic64_inc(&dev->stats.reg_mr_err);
+	return ERR_PTR(err);
+}
+
+struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
+			 u64 virt_addr, int access_flags,
+			 struct ib_udata *udata)
+{
+	struct efa_dev *dev = to_edev(ibpd->device);
+	struct efa_mr *mr;
+	int err;
+
+	mr = efa_alloc_mr(ibpd, access_flags, udata);
+	if (IS_ERR(mr)) {
+		err = PTR_ERR(mr);
+		goto err_out;
+	}
+
+	mr->umem = ib_umem_get(ibpd->device, start, length, access_flags);
+	if (IS_ERR(mr->umem)) {
+		err = PTR_ERR(mr->umem);
+		ibdev_dbg(&dev->ibdev,
+			  "Failed to pin and map user space memory[%d]\n", err);
+		goto err_free;
+	}
+
+	err = efa_register_mr(ibpd, mr, start, length, virt_addr, access_flags);
+	if (err)
+		goto err_release;
+
+	return &mr->ibmr;
+
+err_release:
 	ib_umem_release(mr->umem);
 err_free:
 	kfree(mr);
@@ -1603,6 +1699,7 @@ int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 {
 	struct efa_dev *dev = to_edev(ibmr->device);
 	struct efa_com_dereg_mr_params params;
+	struct ib_umem_dmabuf *umem_dmabuf;
 	struct efa_mr *mr = to_emr(ibmr);
 	int err;
 
@@ -1613,6 +1710,15 @@ int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 	if (err)
 		return err;
 
+	if (mr->umem->is_dmabuf) {
+		umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
+
+		dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
+		ib_umem_dmabuf_unmap_pages(umem_dmabuf);
+		dma_buf_unpin(umem_dmabuf->attach);
+		dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+	}
+
 	ib_umem_release(mr->umem);
 	kfree(mr);
```
Implement a dmabuf importer for the EFA driver. As ODP is not supported,
the dmabuf memory regions always pin the buffers to prevent the
move_notify callback from being called.

Signed-off-by: Gal Pressman <galpress@amazon.com>
---
 drivers/infiniband/hw/efa/efa.h       |   4 +
 drivers/infiniband/hw/efa/efa_main.c  |   1 +
 drivers/infiniband/hw/efa/efa_verbs.c | 166 +++++++++++++++++++++-----
 3 files changed, 141 insertions(+), 30 deletions(-)
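For comparison, once a core ib_umem_dmabuf_get_pinned() exists, the driver-side registration path from this patch could collapse to roughly the following. This is a hypothetical follow-up sketch, not part of the series; it additionally assumes the core unpins and unmaps a pinned dmabuf umem inside ib_umem_release():

```c
/*
 * Hypothetical follow-up version of efa_reg_user_mr_dmabuf() once a
 * core ib_umem_dmabuf_get_pinned() helper exists: the resv-lock, pin,
 * and map details all move out of the driver. Sketch only; it assumes
 * ib_umem_release() handles the unpin/unmap for pinned dmabuf umems.
 */
struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start,
				     u64 length, u64 virt_addr,
				     int fd, int access_flags,
				     struct ib_udata *udata)
{
	struct efa_dev *dev = to_edev(ibpd->device);
	struct ib_umem_dmabuf *umem_dmabuf;
	struct efa_mr *mr;
	int err;

	mr = efa_alloc_mr(ibpd, access_flags, udata);
	if (IS_ERR(mr)) {
		err = PTR_ERR(mr);
		goto err_out;
	}

	/* Comes back pinned and DMA-mapped, or as an ERR_PTR. */
	umem_dmabuf = ib_umem_dmabuf_get_pinned(ibpd->device, start, length,
						fd, access_flags);
	if (IS_ERR(umem_dmabuf)) {
		err = PTR_ERR(umem_dmabuf);
		ibdev_dbg(&dev->ibdev, "Failed to get dmabuf[%d]\n", err);
		goto err_free;
	}

	mr->umem = &umem_dmabuf->umem;
	err = efa_register_mr(ibpd, mr, start, length, virt_addr, access_flags);
	if (err)
		goto err_release;

	return &mr->ibmr;

err_release:
	ib_umem_release(mr->umem);
err_free:
	kfree(mr);
err_out:
	atomic64_inc(&dev->stats.reg_mr_err);
	return ERR_PTR(err);
}
```

Note that this also sidesteps the ordering slip in the patch above, where ibdev_dbg() printed err before PTR_ERR() had assigned it.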