[RFC,2/2] RDMA/efa: Add support for dmabuf memory regions

Message ID 20211007104301.76693-3-galpress@amazon.com (mailing list archive)
State Superseded
Series EFA dmabuf memory regions

Commit Message

Gal Pressman Oct. 7, 2021, 10:43 a.m. UTC
Implement a dmabuf importer for the EFA driver. Since the driver does
not support ODP, dmabuf memory regions are always pinned so that the
move_notify callback is never invoked.

Signed-off-by: Gal Pressman <galpress@amazon.com>
---
 drivers/infiniband/hw/efa/efa.h       |   4 +
 drivers/infiniband/hw/efa/efa_main.c  |   1 +
 drivers/infiniband/hw/efa/efa_verbs.c | 166 +++++++++++++++++++++-----
 3 files changed, 141 insertions(+), 30 deletions(-)

Comments

Jason Gunthorpe Oct. 7, 2021, 11:40 a.m. UTC | #1
On Thu, Oct 07, 2021 at 01:43:00PM +0300, Gal Pressman wrote:

> @@ -1491,26 +1493,29 @@ static int efa_create_pbl(struct efa_dev *dev,
>  	return 0;
>  }
>  
> -struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
> -			 u64 virt_addr, int access_flags,
> -			 struct ib_udata *udata)
> +static void efa_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
> +{
> +	WARN_ONCE(1,
> +		  "Invalidate callback should not be called when memory is pinned\n");
> +}
> +
> +static struct dma_buf_attach_ops efa_dmabuf_attach_ops = {
> +	.allow_peer2peer = true,
> +	.move_notify = efa_dmabuf_invalidate_cb,
> +};

Shouldn't move_notify really just be left as NULL? I mean fixing
whatever is preventing that?

> +struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start,
> +				     u64 length, u64 virt_addr,
> +				     int fd, int access_flags,
> +				     struct ib_udata *udata)
> +{
> +	struct efa_dev *dev = to_edev(ibpd->device);
> +	struct ib_umem_dmabuf *umem_dmabuf;
> +	struct efa_mr *mr;
> +	int err;
> +
> +	mr = efa_alloc_mr(ibpd, access_flags, udata);
> +	if (IS_ERR(mr)) {
> +		err = PTR_ERR(mr);
> +		goto err_out;
> +	}
> +
> +	umem_dmabuf = ib_umem_dmabuf_get(ibpd->device, start, length, fd,
> +					 access_flags, &efa_dmabuf_attach_ops);
> +	if (IS_ERR(umem_dmabuf)) {
> +		err = PTR_ERR(umem_dmabuf);
> +		ibdev_dbg(&dev->ibdev, "Failed to get dmabuf[%d]\n", err);
> +		goto err_free;
> +	}
> +
> +	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
> +	err = dma_buf_pin(umem_dmabuf->attach);
> +	if (err) {
> +		ibdev_dbg(&dev->ibdev, "Failed to pin dmabuf memory\n");
> +		goto err_release;
> +	}
> +
> +	err = ib_umem_dmabuf_map_pages(umem_dmabuf);
> +	if (err) {
> +		ibdev_dbg(&dev->ibdev, "Failed to map dmabuf pages\n");
> +		goto err_unpin;
> +	}
> +	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);

If it is really this simple the core code should have this logic,
'ib_umem_dmabuf_get_pinned()' or something

Jason
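
For context on why move_notify can't simply be left NULL today: the
dma-buf core rejects dynamic importers that supply importer_ops without
a move_notify callback. The guard in dma_buf_dynamic_attach() looks
roughly like this (paraphrased from drivers/dma-buf/dma-buf.c):

	if (WARN_ON(importer_ops && !importer_ops->move_notify))
		return ERR_PTR(-EINVAL);

So a pinned-only importer either has to drop importer_ops entirely,
losing allow_peer2peer, or provide a dummy move_notify, which is what
this patch does.
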
Gal Pressman Oct. 10, 2021, 6:55 a.m. UTC | #2
On 07/10/2021 14:40, Jason Gunthorpe wrote:
> On Thu, Oct 07, 2021 at 01:43:00PM +0300, Gal Pressman wrote:
> 
>> @@ -1491,26 +1493,29 @@ static int efa_create_pbl(struct efa_dev *dev,
>>  	return 0;
>>  }
>>  
>> -struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
>> -			 u64 virt_addr, int access_flags,
>> -			 struct ib_udata *udata)
>> +static void efa_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
>> +{
>> +	WARN_ONCE(1,
>> +		  "Invalidate callback should not be called when memory is pinned\n");
>> +}
>> +
>> +static struct dma_buf_attach_ops efa_dmabuf_attach_ops = {
>> +	.allow_peer2peer = true,
>> +	.move_notify = efa_dmabuf_invalidate_cb,
>> +};
> 
> Shouldn't move_notify really just be left as NULL? I mean fixing
> whatever is preventing that?

That's what I had in the previous RFC and I think Christian didn't really like it.

>> +struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start,
>> +				     u64 length, u64 virt_addr,
>> +				     int fd, int access_flags,
>> +				     struct ib_udata *udata)
>> +{
>> +	struct efa_dev *dev = to_edev(ibpd->device);
>> +	struct ib_umem_dmabuf *umem_dmabuf;
>> +	struct efa_mr *mr;
>> +	int err;
>> +
>> +	mr = efa_alloc_mr(ibpd, access_flags, udata);
>> +	if (IS_ERR(mr)) {
>> +		err = PTR_ERR(mr);
>> +		goto err_out;
>> +	}
>> +
>> +	umem_dmabuf = ib_umem_dmabuf_get(ibpd->device, start, length, fd,
>> +					 access_flags, &efa_dmabuf_attach_ops);
>> +	if (IS_ERR(umem_dmabuf)) {
>> +		err = PTR_ERR(umem_dmabuf);
>> +		ibdev_dbg(&dev->ibdev, "Failed to get dmabuf[%d]\n", err);
>> +		goto err_free;
>> +	}
>> +
>> +	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
>> +	err = dma_buf_pin(umem_dmabuf->attach);
>> +	if (err) {
>> +		ibdev_dbg(&dev->ibdev, "Failed to pin dmabuf memory\n");
>> +		goto err_release;
>> +	}
>> +
>> +	err = ib_umem_dmabuf_map_pages(umem_dmabuf);
>> +	if (err) {
>> +		ibdev_dbg(&dev->ibdev, "Failed to map dmabuf pages\n");
>> +		goto err_unpin;
>> +	}
>> +	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
> 
> If it is really this simple the core code should have this logic,
> 'ib_umem_dmabuf_get_pinned()' or something

Should get_pinned do just get + dma_buf_pin, or should it do
ib_umem_dmabuf_map_pages as well?
Jason Gunthorpe Oct. 11, 2021, 11:28 p.m. UTC | #3
On Sun, Oct 10, 2021 at 09:55:49AM +0300, Gal Pressman wrote:
> On 07/10/2021 14:40, Jason Gunthorpe wrote:
> > On Thu, Oct 07, 2021 at 01:43:00PM +0300, Gal Pressman wrote:
> > 
> >> @@ -1491,26 +1493,29 @@ static int efa_create_pbl(struct efa_dev *dev,
> >>  	return 0;
> >>  }
> >>  
> >> -struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
> >> -			 u64 virt_addr, int access_flags,
> >> -			 struct ib_udata *udata)
> >> +static void efa_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
> >> +{
> >> +	WARN_ONCE(1,
> >> +		  "Invalidate callback should not be called when memory is pinned\n");
> >> +}
> >> +
> >> +static struct dma_buf_attach_ops efa_dmabuf_attach_ops = {
> >> +	.allow_peer2peer = true,
> >> +	.move_notify = efa_dmabuf_invalidate_cb,
> >> +};
> > 
> > Shouldn't move_notify really just be left as NULL? I mean fixing
> > whatever is preventing that?
> 
> That's what I had in the previous RFC and I think Christian didn't really like it.

Well, having drivers define a dummy function that only fails looks
a lot worse to me. If it can't be NULL, then it should be a general
'dmabuf_unsupported_move_notify' shared function.

> >> +	err = ib_umem_dmabuf_map_pages(umem_dmabuf);
> >> +	if (err) {
> >> +		ibdev_dbg(&dev->ibdev, "Failed to map dmabuf pages\n");
> >> +		goto err_unpin;
> >> +	}
> >> +	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
> > 
> > If it is really this simple the core code should have this logic,
> > 'ib_umem_dmabuf_get_pinned()' or something
> 
> Should get_pinned do just get + dma_buf_pin, or should it do
> ib_umem_dmabuf_map_pages as well?

Yes, the map_pages too; a umem is supposed to be DMA mapped after
creation.

Jason
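
A minimal sketch of the helper being discussed, assuming it lives next
to ib_umem_dmabuf_get() in drivers/infiniband/core/umem_dmabuf.c (the
names and the shared move_notify stub are illustrative, not final):

	static void
	ib_umem_dmabuf_unsupported_move_notify(struct dma_buf_attachment *attach)
	{
		struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;

		ibdev_warn_ratelimited(umem_dmabuf->umem.ibdev,
				       "Invalidate callback should not be called when memory is pinned\n");
	}

	static struct dma_buf_attach_ops ib_umem_dmabuf_attach_pinned_ops = {
		.allow_peer2peer = true,
		.move_notify = ib_umem_dmabuf_unsupported_move_notify,
	};

	struct ib_umem_dmabuf *
	ib_umem_dmabuf_get_pinned(struct ib_device *device, unsigned long offset,
				  size_t size, int fd, int access)
	{
		struct ib_umem_dmabuf *umem_dmabuf;
		int err;

		umem_dmabuf = ib_umem_dmabuf_get(device, offset, size, fd, access,
						 &ib_umem_dmabuf_attach_pinned_ops);
		if (IS_ERR(umem_dmabuf))
			return umem_dmabuf;

		dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
		err = dma_buf_pin(umem_dmabuf->attach);
		if (err)
			goto err_release;

		/* Map at creation time too, per the comment above. */
		err = ib_umem_dmabuf_map_pages(umem_dmabuf);
		if (err)
			goto err_unpin;
		dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);

		return umem_dmabuf;

	err_unpin:
		dma_buf_unpin(umem_dmabuf->attach);
	err_release:
		dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
		ib_umem_release(&umem_dmabuf->umem);
		return ERR_PTR(err);
	}
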
Gal Pressman Oct. 12, 2021, 11:41 a.m. UTC | #4
On 12/10/2021 2:28, Jason Gunthorpe wrote:
> On Sun, Oct 10, 2021 at 09:55:49AM +0300, Gal Pressman wrote:
>> On 07/10/2021 14:40, Jason Gunthorpe wrote:
>>> On Thu, Oct 07, 2021 at 01:43:00PM +0300, Gal Pressman wrote:
>>>
>>>> @@ -1491,26 +1493,29 @@ static int efa_create_pbl(struct efa_dev *dev,
>>>>    return 0;
>>>>  }
>>>>
>>>> -struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
>>>> -                   u64 virt_addr, int access_flags,
>>>> -                   struct ib_udata *udata)
>>>> +static void efa_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
>>>> +{
>>>> +  WARN_ONCE(1,
>>>> +            "Invalidate callback should not be called when memory is pinned\n");
>>>> +}
>>>> +
>>>> +static struct dma_buf_attach_ops efa_dmabuf_attach_ops = {
>>>> +  .allow_peer2peer = true,
>>>> +  .move_notify = efa_dmabuf_invalidate_cb,
>>>> +};
>>>
>>> Shouldn't move_notify really just be left as NULL? I mean fixing
>>> whatever is preventing that?
>>
>> That's what I had in the previous RFC and I think Christian didn't really like it.
> 
> Well, having drivers define a dummy function that only fails looks
> a lot worse to me. If not null then it should be a general
> 'dmabuf_unsupported_move_notify' shared function

Will do.

>>>> +  err = ib_umem_dmabuf_map_pages(umem_dmabuf);
>>>> +  if (err) {
>>>> +          ibdev_dbg(&dev->ibdev, "Failed to map dmabuf pages\n");
>>>> +          goto err_unpin;
>>>> +  }
>>>> +  dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
>>>
>>> If it is really this simple the core code should have this logic,
>>> 'ib_umem_dmabuf_get_pinned()' or something
>>
>> Should get_pinned do just get + dma_buf_pin, or should it do
>> ib_umem_dmabuf_map_pages as well?
> 
> Yes the map_pages too, a umem is supposed to be dma mapped after
> creation.

Will do, thanks Jason.
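
With such a helper, the dmabuf registration path in the patch below
would collapse to roughly the following (illustrative, using the
hypothetical ib_umem_dmabuf_get_pinned() sketched above):

	struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start,
					     u64 length, u64 virt_addr,
					     int fd, int access_flags,
					     struct ib_udata *udata)
	{
		struct efa_dev *dev = to_edev(ibpd->device);
		struct ib_umem_dmabuf *umem_dmabuf;
		struct efa_mr *mr;
		int err;

		mr = efa_alloc_mr(ibpd, access_flags, udata);
		if (IS_ERR(mr)) {
			err = PTR_ERR(mr);
			goto err_out;
		}

		umem_dmabuf = ib_umem_dmabuf_get_pinned(ibpd->device, start, length,
							fd, access_flags);
		if (IS_ERR(umem_dmabuf)) {
			err = PTR_ERR(umem_dmabuf);
			ibdev_dbg(&dev->ibdev, "Failed to get dmabuf[%d]\n", err);
			goto err_free;
		}

		mr->umem = &umem_dmabuf->umem;
		err = efa_register_mr(ibpd, mr, start, length, virt_addr,
				      access_flags);
		if (err)
			goto err_release;

		return &mr->ibmr;

	err_release:
		/* Assumes ib_umem_release() unmaps and unpins pinned dmabuf umems. */
		ib_umem_release(mr->umem);
	err_free:
		kfree(mr);
	err_out:
		atomic64_inc(&dev->stats.reg_mr_err);
		return ERR_PTR(err);
	}

The error unwind above assumes the unpin/unmap is folded into
ib_umem_release() for pinned dmabuf umems, in which case efa_dereg_mr()
would also lose its dmabuf special-casing.
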

Patch

diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h
index 2b8ca099b381..407d7c4baa16 100644
--- a/drivers/infiniband/hw/efa/efa.h
+++ b/drivers/infiniband/hw/efa/efa.h
@@ -141,6 +141,10 @@  int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
 			 u64 virt_addr, int access_flags,
 			 struct ib_udata *udata);
+struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start,
+				     u64 length, u64 virt_addr,
+				     int fd, int access_flags,
+				     struct ib_udata *udata);
 int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
 int efa_get_port_immutable(struct ib_device *ibdev, u32 port_num,
 			   struct ib_port_immutable *immutable);
diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c
index 203e6ddcacbc..72cd7d952a07 100644
--- a/drivers/infiniband/hw/efa/efa_main.c
+++ b/drivers/infiniband/hw/efa/efa_main.c
@@ -267,6 +267,7 @@  static const struct ib_device_ops efa_dev_ops = {
 	.query_port = efa_query_port,
 	.query_qp = efa_query_qp,
 	.reg_user_mr = efa_reg_mr,
+	.reg_user_mr_dmabuf = efa_reg_user_mr_dmabuf,
 
 	INIT_RDMA_OBJ_SIZE(ib_ah, efa_ah, ibah),
 	INIT_RDMA_OBJ_SIZE(ib_cq, efa_cq, ibcq),
diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
index be6d3ff0f1be..ca907853a84f 100644
--- a/drivers/infiniband/hw/efa/efa_verbs.c
+++ b/drivers/infiniband/hw/efa/efa_verbs.c
@@ -3,6 +3,8 @@ 
  * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
+#include <linux/dma-buf.h>
+#include <linux/dma-resv.h>
 #include <linux/vmalloc.h>
 #include <linux/log2.h>
 
@@ -1491,26 +1493,29 @@  static int efa_create_pbl(struct efa_dev *dev,
 	return 0;
 }
 
-struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
-			 u64 virt_addr, int access_flags,
-			 struct ib_udata *udata)
+static void efa_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
+{
+	WARN_ONCE(1,
+		  "Invalidate callback should not be called when memory is pinned\n");
+}
+
+static struct dma_buf_attach_ops efa_dmabuf_attach_ops = {
+	.allow_peer2peer = true,
+	.move_notify = efa_dmabuf_invalidate_cb,
+};
+
+static struct efa_mr *efa_alloc_mr(struct ib_pd *ibpd, int access_flags,
+				   struct ib_udata *udata)
 {
 	struct efa_dev *dev = to_edev(ibpd->device);
-	struct efa_com_reg_mr_params params = {};
-	struct efa_com_reg_mr_result result = {};
-	struct pbl_context pbl;
 	int supp_access_flags;
-	unsigned int pg_sz;
 	struct efa_mr *mr;
-	int inline_size;
-	int err;
 
 	if (udata && udata->inlen &&
 	    !ib_is_udata_cleared(udata, 0, sizeof(udata->inlen))) {
 		ibdev_dbg(&dev->ibdev,
 			  "Incompatible ABI params, udata not cleared\n");
-		err = -EINVAL;
-		goto err_out;
+		return ERR_PTR(-EINVAL);
 	}
 
 	supp_access_flags =
@@ -1522,23 +1527,26 @@  struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
 		ibdev_dbg(&dev->ibdev,
 			  "Unsupported access flags[%#x], supported[%#x]\n",
 			  access_flags, supp_access_flags);
-		err = -EOPNOTSUPP;
-		goto err_out;
+		return ERR_PTR(-EOPNOTSUPP);
 	}
 
 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
-	if (!mr) {
-		err = -ENOMEM;
-		goto err_out;
-	}
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
 
-	mr->umem = ib_umem_get(ibpd->device, start, length, access_flags);
-	if (IS_ERR(mr->umem)) {
-		err = PTR_ERR(mr->umem);
-		ibdev_dbg(&dev->ibdev,
-			  "Failed to pin and map user space memory[%d]\n", err);
-		goto err_free;
-	}
+	return mr;
+}
+
+static int efa_register_mr(struct ib_pd *ibpd, struct efa_mr *mr, u64 start,
+			   u64 length, u64 virt_addr, int access_flags)
+{
+	struct efa_dev *dev = to_edev(ibpd->device);
+	struct efa_com_reg_mr_params params = {};
+	struct efa_com_reg_mr_result result = {};
+	struct pbl_context pbl;
+	unsigned int pg_sz;
+	int inline_size;
+	int err;
 
 	params.pd = to_epd(ibpd)->pdn;
 	params.iova = virt_addr;
@@ -1549,10 +1557,9 @@  struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
 				       dev->dev_attr.page_size_cap,
 				       virt_addr);
 	if (!pg_sz) {
-		err = -EOPNOTSUPP;
 		ibdev_dbg(&dev->ibdev, "Failed to find a suitable page size in page_size_cap %#llx\n",
 			  dev->dev_attr.page_size_cap);
-		goto err_unmap;
+		return -EOPNOTSUPP;
 	}
 
 	params.page_shift = order_base_2(pg_sz);
@@ -1566,21 +1573,21 @@  struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
 	if (params.page_num <= inline_size) {
 		err = efa_create_inline_pbl(dev, mr, &params);
 		if (err)
-			goto err_unmap;
+			return err;
 
 		err = efa_com_register_mr(&dev->edev, &params, &result);
 		if (err)
-			goto err_unmap;
+			return err;
 	} else {
 		err = efa_create_pbl(dev, &pbl, mr, &params);
 		if (err)
-			goto err_unmap;
+			return err;
 
 		err = efa_com_register_mr(&dev->edev, &params, &result);
 		pbl_destroy(dev, &pbl);
 
 		if (err)
-			goto err_unmap;
+			return err;
 	}
 
 	mr->ibmr.lkey = result.l_key;
@@ -1588,9 +1595,98 @@  struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
 	mr->ibmr.length = length;
 	ibdev_dbg(&dev->ibdev, "Registered mr[%d]\n", mr->ibmr.lkey);
 
+	return 0;
+}
+
+struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start,
+				     u64 length, u64 virt_addr,
+				     int fd, int access_flags,
+				     struct ib_udata *udata)
+{
+	struct efa_dev *dev = to_edev(ibpd->device);
+	struct ib_umem_dmabuf *umem_dmabuf;
+	struct efa_mr *mr;
+	int err;
+
+	mr = efa_alloc_mr(ibpd, access_flags, udata);
+	if (IS_ERR(mr)) {
+		err = PTR_ERR(mr);
+		goto err_out;
+	}
+
+	umem_dmabuf = ib_umem_dmabuf_get(ibpd->device, start, length, fd,
+					 access_flags, &efa_dmabuf_attach_ops);
+	if (IS_ERR(umem_dmabuf)) {
+		err = PTR_ERR(umem_dmabuf);
+		ibdev_dbg(&dev->ibdev, "Failed to get dmabuf[%d]\n", err);
+		goto err_free;
+	}
+
+	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
+	err = dma_buf_pin(umem_dmabuf->attach);
+	if (err) {
+		ibdev_dbg(&dev->ibdev, "Failed to pin dmabuf memory\n");
+		goto err_release;
+	}
+
+	err = ib_umem_dmabuf_map_pages(umem_dmabuf);
+	if (err) {
+		ibdev_dbg(&dev->ibdev, "Failed to map dmabuf pages\n");
+		goto err_unpin;
+	}
+	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+
+	mr->umem = &umem_dmabuf->umem;
+	err = efa_register_mr(ibpd, mr, start, length, virt_addr, access_flags);
+	if (err)
+		goto err_unmap;
+
 	return &mr->ibmr;
 
 err_unmap:
+	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
+	ib_umem_dmabuf_unmap_pages(umem_dmabuf);
+err_unpin:
+	dma_buf_unpin(umem_dmabuf->attach);
+err_release:
+	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+	ib_umem_release(mr->umem);
+err_free:
+	kfree(mr);
+err_out:
+	atomic64_inc(&dev->stats.reg_mr_err);
+	return ERR_PTR(err);
+}
+
+struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
+			 u64 virt_addr, int access_flags,
+			 struct ib_udata *udata)
+{
+	struct efa_dev *dev = to_edev(ibpd->device);
+	struct efa_mr *mr;
+	int err;
+
+	mr = efa_alloc_mr(ibpd, access_flags, udata);
+	if (IS_ERR(mr)) {
+		err = PTR_ERR(mr);
+		goto err_out;
+	}
+
+	mr->umem = ib_umem_get(ibpd->device, start, length, access_flags);
+	if (IS_ERR(mr->umem)) {
+		err = PTR_ERR(mr->umem);
+		ibdev_dbg(&dev->ibdev,
+			  "Failed to pin and map user space memory[%d]\n", err);
+		goto err_free;
+	}
+
+	err = efa_register_mr(ibpd, mr, start, length, virt_addr, access_flags);
+	if (err)
+		goto err_release;
+
+	return &mr->ibmr;
+
+err_release:
 	ib_umem_release(mr->umem);
 err_free:
 	kfree(mr);
@@ -1603,6 +1699,7 @@  int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 {
 	struct efa_dev *dev = to_edev(ibmr->device);
 	struct efa_com_dereg_mr_params params;
+	struct ib_umem_dmabuf *umem_dmabuf;
 	struct efa_mr *mr = to_emr(ibmr);
 	int err;
 
@@ -1613,6 +1710,15 @@  int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 	if (err)
 		return err;
 
+	if (mr->umem->is_dmabuf) {
+		umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
+
+		dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
+		ib_umem_dmabuf_unmap_pages(umem_dmabuf);
+		dma_buf_unpin(umem_dmabuf->attach);
+		dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+	}
+
 	ib_umem_release(mr->umem);
 	kfree(mr);
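
For completeness, userspace reaches this path through rdma-core's
ibv_reg_dmabuf_mr(); a minimal usage sketch, assuming a dma-buf fd
obtained from some exporting device such as a GPU allocator:

	#include <infiniband/verbs.h>

	/* fd is a dma-buf file descriptor exported elsewhere (e.g. by a GPU). */
	static struct ibv_mr *register_dmabuf_mr(struct ibv_pd *pd, int fd,
						 size_t len)
	{
		return ibv_reg_dmabuf_mr(pd, 0 /* offset */, len, 0 /* iova */,
					 fd, IBV_ACCESS_LOCAL_WRITE |
					     IBV_ACCESS_REMOTE_READ);
	}

The returned ibv_mr is used like any other memory region; with this
driver the kernel side pins the buffer for the lifetime of the MR, so
the exporter cannot move it until ibv_dereg_mr() is called.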