diff mbox series

[v7,4/5] RDMA/mlx5: Support dma-buf based userspace memory region

Message ID 1604527595-39736-5-git-send-email-jianxin.xiong@intel.com (mailing list archive)
State New, archived
Headers show
Series RDMA: Add dma-buf support | expand

Commit Message

Xiong, Jianxin Nov. 4, 2020, 10:06 p.m. UTC
Implement the new driver method 'reg_user_mr_dmabuf'.  Utilize the core
functions to import dma-buf based memory region and update the mappings.

Add code to handle dma-buf related page fault.

Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
Reviewed-by: Sean Hefty <sean.hefty@intel.com>
Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Acked-by: Christian Koenig <christian.koenig@amd.com>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/infiniband/hw/mlx5/main.c    |   2 +
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  18 +++++
 drivers/infiniband/hw/mlx5/mr.c      | 123 ++++++++++++++++++++++++++++++++++-
 drivers/infiniband/hw/mlx5/odp.c     | 105 +++++++++++++++++++++++++++---
 4 files changed, 236 insertions(+), 12 deletions(-)

Comments

Jason Gunthorpe Nov. 5, 2020, 12:07 a.m. UTC | #1
On Wed, Nov 04, 2020 at 02:06:34PM -0800, Jianxin Xiong wrote:
> +	umem = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd, access_flags,
> +				  &mlx5_ib_dmabuf_attach_ops);
> +	if (IS_ERR(umem)) {
> +		mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(umem));
> +		return ERR_PTR(PTR_ERR(umem));
> +	}
> +
> +	mr = alloc_mr_from_cache(pd, umem, virt_addr, access_flags);

It is very subtle, but this calls mlx5_umem_find_best_pgsz() which
calls ib_umem_find_best_pgsz() which goes over the SGL to determine
the page size to use.

As part of this it does validation of the IOVA vs first page offset vs
first page dma address. These little details come into play if the
IOVA and offset are not PAGE_SIZE aligned, which is very possible if
the dma buf exporter or system PAGE_SIZE is over 4k.

In other words, the dma_address of the first SGL must be the page
aligned starting point of the MR. Since the 'skip' approach is being
done when breaking the SGL into blocks the ib_umem_find_best_pgsz()
sees an invalid page size.

Slicing it has to be done in a way that gives a properly formed
SGL. 

My suggestion is to just change the SGL in place. Iterate to the
starting SGE in the SGL and assign it to the sg table, modify it to
have an offset dma_address and reduced length.

Count the number of SGEs to span the remaining range and use that as
the new nmap.

Edit the last SGE to have a reduced length

Upon unmap undo the edits so the exporter doesn't see the mangled SGL.

It would be saner if the exporter could do this, but oh well.

Approximately like this:

	struct ib_umem *umem = &umem_p->umem;
	struct scatterlist *sg;
	int i;

	for_each_sg(umem_p->umem.sg_head.sgl, sg, umem_p->umem.nmap, i) {
		if (cur + sg_dma_len(sg) > ALIGN_DOWN(umem->address, PAGE_SIZE)) {
			unsigned long offset;

			umem_p->first_sg = sg;
			umem_p->first_dma_address = sg->dma_address;
			umem_p->first_dma_length = sg_dma_len(sg);
			umem_p->first_length = sg->length;
			offset = ALIGN_DOWN(umem->address, PAGE_SIZE) - cur;
			sg->dma_address += offset;
			sg_dma_len(sg) -= offset;
			sg->length -= offset;
		}
		if (ALIGN(umem->address + umem->length, PAGE_SIZE) < cur + sg_dma_len(sg)) {
			unsigned long trim;

			umem_p->last_sg = sg;
			umem_p->last_dma_length = sg_dma_len(sg);
			umem_p->last_length = sg->length;
			trim =  cur + sg_dma_len(sg) - ALIGN(umem->address + umem->length, PAGE_SIZE);
			sg_dma_len(sg) -= trim;
			sg->length -= trim;
			return npages;
		}
                cur += sg_dma_len(sg);
	}
        /* It is essential that the length of the SGL exactly match
  	   the adjusted page aligned length of umem->length */
	return -EINVAL;

Further, this really only works if the umem->page_size is locked to 4k
because this doesn't have code to resize the MKEY, or change the
underlying page size when the SGL changes.

So, I'd say put something like the above in the core code to validate
and properly form the umem->sgl

Then modify the alloc_mr_from_cache to use only PAGE_SIZE:

 if (umem->is_dma_buf)
        page_size = ib_umem_find_best_pgsz(umem, PAGE_SIZE, iova);
 else
    	page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size, 0, iova);

Jason
Xiong, Jianxin Nov. 5, 2020, 12:36 a.m. UTC | #2
> -----Original Message-----
> From: Jason Gunthorpe <jgg@ziepe.ca>
> Sent: Wednesday, November 04, 2020 4:07 PM
> To: Xiong, Jianxin <jianxin.xiong@intel.com>
> Cc: linux-rdma@vger.kernel.org; dri-devel@lists.freedesktop.org; Doug Ledford <dledford@redhat.com>; Leon Romanovsky
> <leon@kernel.org>; Sumit Semwal <sumit.semwal@linaro.org>; Christian Koenig <christian.koenig@amd.com>; Vetter, Daniel
> <daniel.vetter@intel.com>
> Subject: Re: [PATCH v7 4/5] RDMA/mlx5: Support dma-buf based userspace memory region
> 
> On Wed, Nov 04, 2020 at 02:06:34PM -0800, Jianxin Xiong wrote:
> > +	umem = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd, access_flags,
> > +				  &mlx5_ib_dmabuf_attach_ops);
> > +	if (IS_ERR(umem)) {
> > +		mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(umem));
> > +		return ERR_PTR(PTR_ERR(umem));
> > +	}
> > +
> > +	mr = alloc_mr_from_cache(pd, umem, virt_addr, access_flags);
> 
> It is very subtle, but this calls mlx5_umem_find_best_pgsz() which calls ib_umem_find_best_pgsz() which goes over the SGL to determine
> the page size to use.
> 

When this is called here, the umem sglist is still NULL because dma_buf_map_attachment()
is not called until a page fault occurs. In patch 1/5, the function ib_umem_find_best_pgsz()
has been modified to always return PAGE_SIZE for dma-buf based MR.

> As part of this it does validation of the IOVA vs first page offset vs first page dma address. These little details come into play if the IOVA and
> offset are not PAGE_SIZE aligned, which is very possible if the dma buf exporter or system PAGE_SIZE is over 4k.
> 
> In other words, the dma_address of the first SGL must be the page aligned starting point of the MR. Since the 'skip' approach is being done
> when breaking the SGL into blocks the ib_umem_find_best_pgsz() sees an invalid page size.
> 
> Slicing it has to be done in a way that gives a properly formed SGL.
> 
> My suggestion is to just change the SGL in place. Iterate to the starting SGE in the SGL and assign it to the sg table, modify it to have a offset
> dma_address and reduced length
> 
> Count the number of SGEs to span the remaning range and use that as the new nmaped
> 
> Edit the last SGE to have a reduced length

Do you still think modifying the SGL in place is needed given the above explanation? I do see
some benefits of doing so -- hiding the discrepancy between the sgl and addr/length from the device drivers and avoiding special handling in the code that uses the sgl.

> 
> Upon unmap undo the edits so the exporter doesn't see the mangled SGL.
> 
> It would be saner if the exporter could do this, but oh well.
> 
> Approximately like this:
> 
> 	struct ib_umem *umem = &umem_p->umem;
> 	struct scatterlist *sg;
> 	int i;
> 
> 	for_each_sg(umem_p->umem.sg_head.sgl, sg, umem_p->umem.nmap, i) {
> 		if (cur + sg_dma_len(sg) > ALIGN_DOWN(umem->address, PAGE_SIZE)) {
> 			unsigned long offset;
> 
> 			umem_p->first_sg = sg;
> 			umem_p->first_dma_address = sg->dma_address;
> 			umem_p->first_dma_length = sg_dma_len(sg);
> 			umem_p->first_length = sg->length;
> 			offset = ALIGN_DOWN(umem->addressm PAGE_SIZE) - cur;
> 			sg->dma_address += offset;
> 			sg_dma_len(sg) -= offset;
> 			sg->length -= offset;
> 		}
> 		if (ALIGN(umem->address + umem->length, PAGE_SIZE) < cur + sg_dma_len(sg)) {
> 			unsigned long trim;
> 
> 			umem_p->last_sg = sg;
> 			umem_p->last_dma_length = sg_dma_len(sg);
> 			umem_p->last_length = sg->length;
> 			trim =  cur + sg_dma_len(sg) - ALIGN(umem->address + umem->length, PAGE_SIZE);
> 			sg_dma_len(sg) -= trim;
> 			sg->length -= trim;
> 			return npages;
> 		}
>                 cur += sg_dma_len(sg);
> 	}
>         /* It is essential that the length of the SGL exactly match
>   	   the adjusted page aligned length of umem->length */
> 	return -EINVAL;
> 
> Further, this really only works if the umem->page_size is locked to 4k because this doesn't have code to resize the MKEY, or change the
> underlying page size when the SGL changes.

Yes, now it's locked to 4K. 

> 
> So, I'd say put something like the above in the core code to validate and properly form the umem->sgl
> 
> Then modify the alloc_mr_from_cache to use only PAGE_SIZE:
> 
>  if (umem->is_dma_buf)
>         page_size = ib_umem_find_best_pgsz(umem, PAGE_SIZE, iova);  else
>     	page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size, 0, iova);
> 

This should have been addressed in patch 1/5.

Thanks,
Jianxin

> Jason
Jason Gunthorpe Nov. 5, 2020, 2:22 p.m. UTC | #3
On Thu, Nov 05, 2020 at 12:36:25AM +0000, Xiong, Jianxin wrote:
> > From: Jason Gunthorpe <jgg@ziepe.ca>
> > Sent: Wednesday, November 04, 2020 4:07 PM
> > To: Xiong, Jianxin <jianxin.xiong@intel.com>
> > Cc: linux-rdma@vger.kernel.org; dri-devel@lists.freedesktop.org; Doug Ledford <dledford@redhat.com>; Leon Romanovsky
> > <leon@kernel.org>; Sumit Semwal <sumit.semwal@linaro.org>; Christian Koenig <christian.koenig@amd.com>; Vetter, Daniel
> > <daniel.vetter@intel.com>
> > Subject: Re: [PATCH v7 4/5] RDMA/mlx5: Support dma-buf based userspace memory region
> > 
> > On Wed, Nov 04, 2020 at 02:06:34PM -0800, Jianxin Xiong wrote:
> > > +	umem = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd, access_flags,
> > > +				  &mlx5_ib_dmabuf_attach_ops);
> > > +	if (IS_ERR(umem)) {
> > > +		mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(umem));
> > > +		return ERR_PTR(PTR_ERR(umem));
> > > +	}
> > > +
> > > +	mr = alloc_mr_from_cache(pd, umem, virt_addr, access_flags);
> > 
> > It is very subtle, but this calls mlx5_umem_find_best_pgsz() which calls ib_umem_find_best_pgsz() which goes over the SGL to determine
> > the page size to use.
> > 
> 
> When this is called here, the umem sglist is still NULL because dma_buf_map_attachment()
> is not called until a page fault occurs. In patch 1/5, the function ib_umem_find_best_pgsz()
> has been modified to always return PAGE_SIZE for dma-buf based MR.

Oh.. That isn't a good idea.

ib_umem_find_best_pgsz() must be run on any SGL list to validate it
against the constraints. Making it un-runnable for the dmabuf case
means we can never support large page sizes or even validate that the
SGL is properly formed.

So I think this needs to change alloc_mr_from_cache() to exit early
for dma_buf ones.

And it still needs to call ib_umem_find_best_pgsz(), but
just check the page size.

> > Edit the last SGE to have a reduced length
> 
> Do you still think modifying the SGL in place needed given the above
> explanation? I do see some benefits of doing so -- hiding the
> discrepancy of sgl and addr/length from the device drivers and avoid
> special handling in the code that use the sgl.

Yes, a umem SGL should always be properly formed or I will have a
meltdown trying to keep all the drivers working :\

Jason
diff mbox series

Patch

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 36b15a0..e647ea4 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1,6 +1,7 @@ 
 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
 /*
  * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved.
+ * Copyright (c) 2020, Intel Corporation. All rights reserved.
  */
 
 #include <linux/debugfs.h>
@@ -4055,6 +4056,7 @@  static int mlx5_ib_enable_driver(struct ib_device *dev)
 	.query_srq = mlx5_ib_query_srq,
 	.query_ucontext = mlx5_ib_query_ucontext,
 	.reg_user_mr = mlx5_ib_reg_user_mr,
+	.reg_user_mr_dmabuf = mlx5_ib_reg_user_mr_dmabuf,
 	.req_notify_cq = mlx5_ib_arm_cq,
 	.rereg_user_mr = mlx5_ib_rereg_user_mr,
 	.resize_cq = mlx5_ib_resize_cq,
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index bb44080..3ef6872 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1,6 +1,7 @@ 
 /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
  * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved.
+ * Copyright (c) 2020, Intel Corporation. All rights reserved.
  */
 
 #ifndef MLX5_IB_H
@@ -665,6 +666,12 @@  static inline bool is_odp_mr(struct mlx5_ib_mr *mr)
 	       mr->umem->is_odp;
 }
 
+static inline bool is_dmabuf_mr(struct mlx5_ib_mr *mr)
+{
+	return IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && mr->umem &&
+	       mr->umem->is_dmabuf;
+}
+
 struct mlx5_ib_mw {
 	struct ib_mw		ibmw;
 	struct mlx5_core_mkey	mmkey;
@@ -1200,6 +1207,10 @@  int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				  u64 virt_addr, int access_flags,
 				  struct ib_udata *udata);
+struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start,
+					 u64 length, u64 virt_addr,
+					 int fd, int access_flags,
+					 struct ib_udata *udata);
 int mlx5_ib_advise_mr(struct ib_pd *pd,
 		      enum ib_uverbs_advise_mr_advice advice,
 		      u32 flags,
@@ -1210,11 +1221,13 @@  int mlx5_ib_advise_mr(struct ib_pd *pd,
 int mlx5_ib_dealloc_mw(struct ib_mw *mw);
 int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 		       int page_shift, int flags);
+int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags);
 struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 					     struct ib_udata *udata,
 					     int access_flags);
 void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr);
 void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr);
+void mlx5_ib_fence_dmabuf_mr(struct mlx5_ib_mr *mr);
 int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 			  u64 length, u64 virt_addr, int access_flags,
 			  struct ib_pd *pd, struct ib_udata *udata);
@@ -1306,6 +1319,7 @@  int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
 			       enum ib_uverbs_advise_mr_advice advice,
 			       u32 flags, struct ib_sge *sg_list, u32 num_sge);
 int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr, bool enable);
+int mlx5_ib_init_dmabuf_mr(struct mlx5_ib_mr *mr);
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 {
@@ -1331,6 +1345,10 @@  static inline int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr, bool enable)
 {
 	return -EOPNOTSUPP;
 }
+static inline int mlx5_ib_init_dmabuf_mr(struct mlx5_ib_mr *mr)
+{
+	return -EOPNOTSUPP;
+}
 #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 
 extern const struct mmu_interval_notifier_ops mlx5_mn_ops;
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 9f653b4..f39127b 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1,5 +1,6 @@ 
 /*
  * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2020, Intel Corporation. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -36,6 +37,8 @@ 
 #include <linux/debugfs.h>
 #include <linux/export.h>
 #include <linux/delay.h>
+#include <linux/dma-buf.h>
+#include <linux/dma-resv.h>
 #include <rdma/ib_umem.h>
 #include <rdma/ib_umem_odp.h>
 #include <rdma/ib_verbs.h>
@@ -1212,8 +1215,11 @@  int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 
 /*
  * Send the DMA list to the HW for a normal MR using UMR.
+ * DMABUF MR is handled in a similar way, with two exceptions:
+ * (1) The MR may only cover a sub-range of the scatterlist;
+ * (2) The MLX5_IB_UPD_XLT_ZAP flag can be used.
  */
-static int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
+int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
 {
 	struct mlx5_ib_dev *dev = mr->dev;
 	struct device *ddev = dev->ib_dev.dev.parent;
@@ -1225,6 +1231,8 @@  static int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
 	size_t final_size;
 	struct ib_sge sg;
 	int err = 0;
+	unsigned int skip;
+	unsigned int todo;
 
 	if (WARN_ON(mr->umem->is_odp))
 		return -EINVAL;
@@ -1237,6 +1245,11 @@  static int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
 		return -ENOMEM;
 	orig_sg_length = sg.length;
 
+	if (mr->umem->is_dmabuf) {
+		skip = mr->umem->address >> mr->page_shift;
+		todo = ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift);
+	}
+
 	cur_mtt = mtt;
 	rdma_for_each_block (mr->umem->sg_head.sgl, &biter, mr->umem->nmap,
 			     BIT(mr->page_shift)) {
@@ -1255,6 +1268,19 @@  static int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
 		cur_mtt->ptag =
 			cpu_to_be64(rdma_block_iter_dma_address(&biter) |
 				    MLX5_IB_MTT_PRESENT);
+
+		if (mr->umem->is_dmabuf) {
+			if (flags & MLX5_IB_UPD_XLT_ZAP)
+				cur_mtt->ptag = 0;
+			if (skip) {
+				skip--;
+				continue;
+			}
+			if (!todo)
+				break;
+			todo--;
+		}
+
 		cur_mtt++;
 	}
 
@@ -1581,6 +1607,97 @@  struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	return ERR_PTR(err);
 }
 
+static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
+{
+	struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
+	struct mlx5_ib_mr *mr = umem_dmabuf->private;
+
+	dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
+
+	/* mr could have been destroyed. see mlx5_ib_fence_dmabuf_mr(). */
+	if (!mr)
+		return;
+
+	mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
+	ib_umem_dmabuf_unmap_pages(umem_dmabuf);
+}
+
+static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
+	.allow_peer2peer = 1,
+	.move_notify = mlx5_ib_dmabuf_invalidate_cb,
+};
+
+struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
+					 u64 length, u64 virt_addr,
+					 int fd, int access_flags,
+					 struct ib_udata *udata)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct mlx5_ib_mr *mr = NULL;
+	struct ib_umem *umem;
+	int err;
+
+	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	mlx5_ib_dbg(dev,
+		    "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n",
+		    offset, virt_addr, length, fd, access_flags);
+
+	if (!mlx5_ib_can_load_pas_with_umr(dev, length))
+		return ERR_PTR(-EINVAL);
+
+	umem = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd, access_flags,
+				  &mlx5_ib_dmabuf_attach_ops);
+	if (IS_ERR(umem)) {
+		mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(umem));
+		return ERR_PTR(PTR_ERR(umem));
+	}
+
+	mr = alloc_mr_from_cache(pd, umem, virt_addr, access_flags);
+	if (IS_ERR(mr))
+		mr = NULL;
+
+	if (!mr) {
+		mutex_lock(&dev->slow_path_mutex);
+		mr = reg_create(NULL, pd, umem, virt_addr, access_flags,
+				false);
+		mutex_unlock(&dev->slow_path_mutex);
+	}
+
+	if (IS_ERR(mr)) {
+		err = PTR_ERR(mr);
+		goto error;
+	}
+
+	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
+
+	mr->umem = umem;
+	atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
+	set_mr_fields(dev, mr, length, access_flags);
+
+	to_ib_umem_dmabuf(umem)->private = mr;
+	init_waitqueue_head(&mr->q_deferred_work);
+	atomic_set(&mr->num_deferred_work, 0);
+	err = xa_err(xa_store(&dev->odp_mkeys,
+			      mlx5_base_mkey(mr->mmkey.key), &mr->mmkey,
+			      GFP_KERNEL));
+	if (err) {
+		dereg_mr(dev, mr);
+		return ERR_PTR(err);
+	}
+
+	err = mlx5_ib_init_dmabuf_mr(mr);
+	if (err) {
+		dereg_mr(dev, mr);
+		return ERR_PTR(err);
+	}
+	return &mr->ibmr;
+error:
+	ib_umem_release(umem);
+	return ERR_PTR(err);
+}
+
 /**
  * mlx5_mr_cache_invalidate - Fence all DMA on the MR
  * @mr: The MR to fence
@@ -1649,7 +1766,7 @@  int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 	if (!mr->umem)
 		return -EINVAL;
 
-	if (is_odp_mr(mr))
+	if (is_odp_mr(mr) || is_dmabuf_mr(mr))
 		return -EOPNOTSUPP;
 
 	if (flags & IB_MR_REREG_TRANS) {
@@ -1812,6 +1929,8 @@  static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 	/* Stop all DMA */
 	if (is_odp)
 		mlx5_ib_fence_odp_mr(mr);
+	else if (is_dmabuf_mr(mr))
+		mlx5_ib_fence_dmabuf_mr(mr);
 	else
 		clean_mr(dev, mr);
 
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 5c853ec..546edc6 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -33,6 +33,8 @@ 
 #include <rdma/ib_umem.h>
 #include <rdma/ib_umem_odp.h>
 #include <linux/kernel.h>
+#include <linux/dma-buf.h>
+#include <linux/dma-resv.h>
 
 #include "mlx5_ib.h"
 #include "cmd.h"
@@ -664,6 +666,36 @@  void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr)
 	dma_fence_odp_mr(mr);
 }
 
+/**
+ * mlx5_ib_fence_dmabuf_mr - Stop all access to the dmabuf MR
+ * @mr: to fence
+ *
+ * On return no parallel threads will be touching this MR and no DMA will be
+ * active.
+ */
+void mlx5_ib_fence_dmabuf_mr(struct mlx5_ib_mr *mr)
+{
+	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
+
+	/* Prevent new page faults and prefetch requests from succeeding */
+	xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
+
+	/* Wait for all running page-fault handlers to finish. */
+	synchronize_srcu(&mr->dev->odp_srcu);
+
+	wait_event(mr->q_deferred_work, !atomic_read(&mr->num_deferred_work));
+
+	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
+	mlx5_mr_cache_invalidate(mr);
+	umem_dmabuf->private = NULL;
+	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+
+	if (!mr->cache_ent) {
+		mlx5_core_destroy_mkey(mr->dev->mdev, &mr->mmkey);
+		WARN_ON(mr->descs);
+	}
+}
+
 #define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
 #define MLX5_PF_FLAGS_SNAPSHOT BIT(2)
 #define MLX5_PF_FLAGS_ENABLE BIT(3)
@@ -801,6 +833,43 @@  static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
  * Returns:
  *  -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are
  *           not accessible, or the MR is no longer valid.
+ *  -EAGAIN: The operation should be retried
+ *
+ *  >0: Number of pages mapped
+ */
+static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr,
+			       struct ib_umem_dmabuf *umem_dmabuf,
+			       u64 user_va, size_t bcnt, u32 *bytes_mapped,
+			       u32 flags)
+{
+	int npages;
+	int err;
+	u32 xlt_flags = 0;
+
+	if (flags & MLX5_PF_FLAGS_ENABLE)
+		xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;
+
+	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
+	err = ib_umem_dmabuf_map_pages(umem_dmabuf);
+	if (!err)
+		err = mlx5_ib_update_mr_pas(mr, xlt_flags);
+	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+
+	if (err)
+		return err;
+
+	if (bytes_mapped)
+		*bytes_mapped += bcnt;
+
+	npages = (ALIGN(user_va + bcnt, PAGE_SIZE) -
+		 ALIGN_DOWN(user_va, PAGE_SIZE)) >> PAGE_SHIFT;
+	return npages;
+}
+
+/*
+ * Returns:
+ *  -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are
+ *           not accessible, or the MR is no longer valid.
  *  -EAGAIN/-ENOMEM: The operation should be retried
  *
  *  -EINVAL/others: General internal malfunction
@@ -810,22 +879,31 @@  static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
 			u32 *bytes_mapped, u32 flags)
 {
 	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
 
 	lockdep_assert_held(&mr->dev->odp_srcu);
 	if (unlikely(io_virt < mr->mmkey.iova))
 		return -EFAULT;
 
-	if (!odp->is_implicit_odp) {
+	if (is_dmabuf_mr(mr) || !odp->is_implicit_odp) {
 		u64 user_va;
+		u64 end;
 
 		if (check_add_overflow(io_virt - mr->mmkey.iova,
-				       (u64)odp->umem.address, &user_va))
+				       (u64)mr->umem->address, &user_va))
 			return -EFAULT;
-		if (unlikely(user_va >= ib_umem_end(odp) ||
-			     ib_umem_end(odp) - user_va < bcnt))
+		if (is_dmabuf_mr(mr))
+			end = mr->umem->address + mr->umem->length;
+		else
+			end = ib_umem_end(odp);
+		if (unlikely(user_va >= end || end - user_va < bcnt))
 			return -EFAULT;
-		return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped,
-					 flags);
+		if (is_dmabuf_mr(mr))
+			return pagefault_dmabuf_mr(mr, umem_dmabuf, user_va,
+						   bcnt, bytes_mapped, flags);
+		else
+			return pagefault_real_mr(mr, odp, user_va, bcnt,
+						 bytes_mapped, flags);
 	}
 	return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped,
 				     flags);
@@ -845,6 +923,16 @@  int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr, bool enable)
 	return ret >= 0 ? 0 : ret;
 }
 
+int mlx5_ib_init_dmabuf_mr(struct mlx5_ib_mr *mr)
+{
+	int ret;
+
+	ret = pagefault_dmabuf_mr(mr, to_ib_umem_dmabuf(mr->umem),
+				  mr->umem->address, mr->umem->length, NULL,
+				  MLX5_PF_FLAGS_ENABLE);
+	return ret >= 0 ? 0 : ret;
+}
+
 struct pf_frame {
 	struct pf_frame *next;
 	u32 key;
@@ -1747,7 +1835,6 @@  static void destroy_prefetch_work(struct prefetch_mr_work *work)
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	struct mlx5_core_mkey *mmkey;
-	struct ib_umem_odp *odp;
 	struct mlx5_ib_mr *mr;
 
 	lockdep_assert_held(&dev->odp_srcu);
@@ -1761,11 +1848,9 @@  static void destroy_prefetch_work(struct prefetch_mr_work *work)
 	if (mr->ibmr.pd != pd)
 		return NULL;
 
-	odp = to_ib_umem_odp(mr->umem);
-
 	/* prefetch with write-access must be supported by the MR */
 	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
-	    !odp->umem.writable)
+	    !mr->umem->writable)
 		return NULL;
 
 	return mr;