diff mbox series

[v5,4/5] RDMA/mlx5: Support dma-buf based userspace memory region

Message ID 1602799378-138316-1-git-send-email-jianxin.xiong@intel.com
State New, archived
Headers show
Series [v5,1/5] RDMA/umem: Support importing dma-buf as user memory region | expand

Commit Message

Jianxin Xiong Oct. 15, 2020, 10:02 p.m. UTC
Implement the new driver method 'reg_user_mr_dmabuf'.  Utilize the core
functions to import dma-buf based memory region and update the mappings.

Add code to handle dma-buf related page fault.

Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
Reviewed-by: Sean Hefty <sean.hefty@intel.com>
Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Acked-by: Christian Koenig <christian.koenig@amd.com>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/infiniband/hw/mlx5/main.c    |   2 +
 drivers/infiniband/hw/mlx5/mlx5_ib.h |   5 ++
 drivers/infiniband/hw/mlx5/mr.c      | 119 +++++++++++++++++++++++++++++++++++
 drivers/infiniband/hw/mlx5/odp.c     |  42 +++++++++++++
 4 files changed, 168 insertions(+)

Comments

Jason Gunthorpe Oct. 16, 2020, 12:54 a.m. UTC | #1
On Thu, Oct 15, 2020 at 03:02:58PM -0700, Jianxin Xiong wrote:
> Implement the new driver method 'reg_user_mr_dmabuf'.  Utilize the core
> functions to import dma-buf based memory region and update the mappings.
> 
> Add code to handle dma-buf related page fault.
> 
> Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
> Reviewed-by: Sean Hefty <sean.hefty@intel.com>
> Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> Acked-by: Christian Koenig <christian.koenig@amd.com>
> Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
>  drivers/infiniband/hw/mlx5/main.c    |   2 +
>  drivers/infiniband/hw/mlx5/mlx5_ib.h |   5 ++
>  drivers/infiniband/hw/mlx5/mr.c      | 119 +++++++++++++++++++++++++++++++++++
>  drivers/infiniband/hw/mlx5/odp.c     |  42 +++++++++++++
>  4 files changed, 168 insertions(+)
> 
> diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
> index 89e04ca..ec4ad2f 100644
> +++ b/drivers/infiniband/hw/mlx5/main.c
> @@ -1,6 +1,7 @@
>  // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
>  /*
>   * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved.
> + * Copyright (c) 2020, Intel Corporation. All rights reserved.
>   */
>  
>  #include <linux/debugfs.h>
> @@ -4060,6 +4061,7 @@ static int mlx5_ib_enable_driver(struct ib_device *dev)
>  	.query_srq = mlx5_ib_query_srq,
>  	.query_ucontext = mlx5_ib_query_ucontext,
>  	.reg_user_mr = mlx5_ib_reg_user_mr,
> +	.reg_user_mr_dmabuf = mlx5_ib_reg_user_mr_dmabuf,
>  	.req_notify_cq = mlx5_ib_arm_cq,
>  	.rereg_user_mr = mlx5_ib_rereg_user_mr,
>  	.resize_cq = mlx5_ib_resize_cq,
> diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
> index b1f2b34..65fcc18 100644
> +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
> @@ -1,6 +1,7 @@
>  /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
>  /*
>   * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved.
> + * Copyright (c) 2020, Intel Corporation. All rights reserved.
>   */
>  
>  #ifndef MLX5_IB_H
> @@ -1174,6 +1175,10 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
>  struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
>  				  u64 virt_addr, int access_flags,
>  				  struct ib_udata *udata);
> +struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start,
> +					 u64 length, u64 virt_addr,
> +					 int dmabuf_fd, int access_flags,
> +					 struct ib_udata *udata);
>  int mlx5_ib_advise_mr(struct ib_pd *pd,
>  		      enum ib_uverbs_advise_mr_advice advice,
>  		      u32 flags,
> diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
> index b261797..24750f1 100644
> +++ b/drivers/infiniband/hw/mlx5/mr.c
> @@ -1,5 +1,6 @@
>  /*
>   * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
> + * Copyright (c) 2020, Intel Corporation. All rights reserved.
>   *
>   * This software is available to you under a choice of one of two
>   * licenses.  You may choose to be licensed under the terms of the GNU
> @@ -1462,6 +1463,124 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
>  	return ERR_PTR(err);
>  }
>  
> +static int mlx5_ib_umem_dmabuf_xlt_init(struct ib_umem *umem, void *context)
> +{
> +	struct mlx5_ib_mr *mr = context;
> +	int flags = MLX5_IB_UPD_XLT_ENABLE;
> +
> +	if (!mr)
> +		return -EINVAL;
> +
> +	return mlx5_ib_update_xlt(mr, 0, mr->npages, PAGE_SHIFT, flags);
> +}

> +static int mlx5_ib_umem_dmabuf_xlt_update(struct ib_umem *umem, void *context)
> +{
> +	struct mlx5_ib_mr *mr = context;
> +	int flags = MLX5_IB_UPD_XLT_ATOMIC;

Why are these atomic? Why the strange coding style of declaring a
variable?

> +	if (!mr)
> +		return -EINVAL;

Why can this happen? Will dma_buf call move_notify prior to
dma_buf_map_attachment? There are locking problems if that happens.

> +	return mlx5_ib_update_xlt(mr, 0, mr->npages, PAGE_SHIFT, flags);
> +}
> +
> +static int mlx5_ib_umem_dmabuf_xlt_invalidate(struct ib_umem *umem, void *context)
> +{
> +	struct mlx5_ib_mr *mr = context;
> +	int flags = MLX5_IB_UPD_XLT_ZAP | MLX5_IB_UPD_XLT_ATOMIC;
> +
> +	if (!mr)
> +		return -EINVAL;
> +
> +	return mlx5_ib_update_xlt(mr, 0, mr->npages, PAGE_SHIFT, flags);
> +}
> +
> +static struct ib_umem_dmabuf_ops mlx5_ib_umem_dmabuf_ops = {
> +	.init = mlx5_ib_umem_dmabuf_xlt_init,
> +	.update = mlx5_ib_umem_dmabuf_xlt_update,
> +	.invalidate = mlx5_ib_umem_dmabuf_xlt_invalidate,
> +};

I'm not really convinced these should be ops, this is usually a bad
design pattern. 

Why do I need so much code to extract the sgl from the dma_buf? I
would prefer the dma_buf layer simplify this, not by adding a wrapper
around it in the IB core code...

> +struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start,
> +					 u64 length, u64 virt_addr,
> +					 int dmabuf_fd, int access_flags,
> +					 struct ib_udata *udata)
> +{
> +	struct mlx5_ib_dev *dev = to_mdev(pd->device);
> +	struct mlx5_ib_mr *mr = NULL;
> +	struct ib_umem *umem;
> +	int page_shift;
> +	int npages;
> +	int ncont;
> +	int order;
> +	int err;
> +
> +	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
> +		return ERR_PTR(-EOPNOTSUPP);
> +
> +	mlx5_ib_dbg(dev,
> +		    "start 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n",
> +		    start, virt_addr, length, dmabuf_fd, access_flags);
> +
> +	if (!mlx5_ib_can_load_pas_with_umr(dev, length))
> +		return ERR_PTR(-EINVAL);
> +
> +	umem = ib_umem_dmabuf_get(&dev->ib_dev, start, length, dmabuf_fd,
> +				  access_flags, &mlx5_ib_umem_dmabuf_ops);
> +	if (IS_ERR(umem)) {
> +		mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(umem));
> +		return ERR_PTR(PTR_ERR(umem));
> +	}
> +
> +	npages = ib_umem_num_pages(umem);
> +	if (!npages) {
> +		mlx5_ib_warn(dev, "avoid zero region\n");
> +		ib_umem_release(umem);
> +		return ERR_PTR(-EINVAL);
> +	}
> +
> +	page_shift = PAGE_SHIFT;
> +	ncont = npages;
> +	order = ilog2(roundup_pow_of_two(ncont));

We still need to deal with contiguity here, this ncont/npages is just
obfuscation.

I have a patch series that should get posted soon rewriting all of
this stuff..

> +	mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n",
> +		    npages, ncont, order, page_shift);
> +
> +	mr = alloc_mr_from_cache(pd, umem, virt_addr, length, ncont,
> +				 page_shift, order, access_flags);
> +	if (IS_ERR(mr))
> +		mr = NULL;
> +
> +	if (!mr) {
> +		mutex_lock(&dev->slow_path_mutex);
> +		mr = reg_create(NULL, pd, virt_addr, length, umem, ncont,
> +				page_shift, access_flags, false);
> +		mutex_unlock(&dev->slow_path_mutex);
> +	}
> +
> +	if (IS_ERR(mr)) {
> +		err = PTR_ERR(mr);
> +		goto error;
> +	}
> +
> +	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
> +
> +	mr->umem = umem;
> +	set_mr_fields(dev, mr, npages, length, access_flags);

After another series I have there will be three copies of this
sequence :\

> +	err = ib_umem_dmabuf_init_mapping(umem, mr);
> +	if (err) {
> +		dereg_mr(dev, mr);
> +		return ERR_PTR(err);
> +	}

Did you test the page fault path at all? Looks like some xarray code
is missing here, and this is also missing the related complex teardown
logic.

Does this mean you didn't test the pagefault_dmabuf_mr() at all?

Jason
Jianxin Xiong Oct. 16, 2020, 6:40 a.m. UTC | #2
> -----Original Message-----
> From: Jason Gunthorpe <jgg@ziepe.ca>
> Sent: Thursday, October 15, 2020 5:54 PM
> To: Xiong, Jianxin <jianxin.xiong@intel.com>
> Cc: linux-rdma@vger.kernel.org; dri-devel@lists.freedesktop.org; Doug Ledford <dledford@redhat.com>; Leon Romanovsky
> <leon@kernel.org>; Sumit Semwal <sumit.semwal@linaro.org>; Christian Koenig <christian.koenig@amd.com>; Vetter, Daniel
> <daniel.vetter@intel.com>
> Subject: Re: [PATCH v5 4/5] RDMA/mlx5: Support dma-buf based userspace memory region
> 

//snip

> > +static int mlx5_ib_umem_dmabuf_xlt_init(struct ib_umem *umem, void
> > +*context) {
> > +	struct mlx5_ib_mr *mr = context;
> > +	int flags = MLX5_IB_UPD_XLT_ENABLE;
> > +
> > +	if (!mr)
> > +		return -EINVAL;
> > +
> > +	return mlx5_ib_update_xlt(mr, 0, mr->npages, PAGE_SHIFT, flags); }
> 
> > +static int mlx5_ib_umem_dmabuf_xlt_update(struct ib_umem *umem, void
> > +*context) {
> > +	struct mlx5_ib_mr *mr = context;
> > +	int flags = MLX5_IB_UPD_XLT_ATOMIC;
> 
> Why are these atomic? Why the strange coding style of declaring a variable?

The atomic flag is copied from the odp code. I have verified that it is not necessary.
I can remove the definition of 'flags' here.  

> 
> > +	if (!mr)
> > +		return -EINVAL;
> 
> Why can this happen? Will dma_buf call move_notify prior to dma_buf_map_attachment? There are locking problems if that happens.

I agree this check is unnecessary.

> 
> > +	return mlx5_ib_update_xlt(mr, 0, mr->npages, PAGE_SHIFT, flags); }
> > +
> > +static int mlx5_ib_umem_dmabuf_xlt_invalidate(struct ib_umem *umem,
> > +void *context) {
> > +	struct mlx5_ib_mr *mr = context;
> > +	int flags = MLX5_IB_UPD_XLT_ZAP | MLX5_IB_UPD_XLT_ATOMIC;
> > +
> > +	if (!mr)
> > +		return -EINVAL;
> > +
> > +	return mlx5_ib_update_xlt(mr, 0, mr->npages, PAGE_SHIFT, flags); }
> > +
> > +static struct ib_umem_dmabuf_ops mlx5_ib_umem_dmabuf_ops = {
> > +	.init = mlx5_ib_umem_dmabuf_xlt_init,
> > +	.update = mlx5_ib_umem_dmabuf_xlt_update,
> > +	.invalidate = mlx5_ib_umem_dmabuf_xlt_invalidate,
> > +};
> 
> I'm not really convinced these should be ops, this is usually a bad design pattern.
> 
> Why do I need so much code to extract the sgl from the dma_buf? I would prefer the dma_buf layer simplify this, not by adding a wrapper
> around it in the IB core code...
> 

We just need a way to call a device specific function to update the NIC's translation
table.  I considered three ways: (1) ops registered with ib_umem_get_dmabuf; 
(2) a single function pointer registered with ib_umem_get_dmabuf; (3) a method 
in 'struct ib_device'. Option (1) was chosen here with no strong reason. We could
consolidate the three functions of the ops into one, but then we will need to 
define commands or flags for different update operations.   

> > +struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start,
> > +					 u64 length, u64 virt_addr,
> > +					 int dmabuf_fd, int access_flags,
> > +					 struct ib_udata *udata)
> > +{
> > +	struct mlx5_ib_dev *dev = to_mdev(pd->device);
> > +	struct mlx5_ib_mr *mr = NULL;
> > +	struct ib_umem *umem;
> > +	int page_shift;
> > +	int npages;
> > +	int ncont;
> > +	int order;
> > +	int err;
> > +
> > +	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
> > +		return ERR_PTR(-EOPNOTSUPP);
> > +
> > +	mlx5_ib_dbg(dev,
> > +		    "start 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n",
> > +		    start, virt_addr, length, dmabuf_fd, access_flags);
> > +
> > +	if (!mlx5_ib_can_load_pas_with_umr(dev, length))
> > +		return ERR_PTR(-EINVAL);
> > +
> > +	umem = ib_umem_dmabuf_get(&dev->ib_dev, start, length, dmabuf_fd,
> > +				  access_flags, &mlx5_ib_umem_dmabuf_ops);
> > +	if (IS_ERR(umem)) {
> > +		mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(umem));
> > +		return ERR_PTR(PTR_ERR(umem));
> > +	}
> > +
> > +	npages = ib_umem_num_pages(umem);
> > +	if (!npages) {
> > +		mlx5_ib_warn(dev, "avoid zero region\n");
> > +		ib_umem_release(umem);
> > +		return ERR_PTR(-EINVAL);
> > +	}
> > +
> > +	page_shift = PAGE_SHIFT;
> > +	ncont = npages;
> > +	order = ilog2(roundup_pow_of_two(ncont));
> 
> We still need to deal with contiguity here, this ncont/npages is just obfuscation.

Since the pages can move, we can't take advantage of contiguity here. This handling
is similar to the ODP case. The variables 'ncont' and 'page_shift' here are not necessary.
They are kept just for the sake of signifying the semantics of the following functions that
use them.

> 
> I have a patch series that should get posted soon rewriting all of this stuff..

Great. This can be updated accordingly.

> 
> > +	mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n",
> > +		    npages, ncont, order, page_shift);
> > +
> > +	mr = alloc_mr_from_cache(pd, umem, virt_addr, length, ncont,
> > +				 page_shift, order, access_flags);
> > +	if (IS_ERR(mr))
> > +		mr = NULL;
> > +
> > +	if (!mr) {
> > +		mutex_lock(&dev->slow_path_mutex);
> > +		mr = reg_create(NULL, pd, virt_addr, length, umem, ncont,
> > +				page_shift, access_flags, false);
> > +		mutex_unlock(&dev->slow_path_mutex);
> > +	}
> > +
> > +	if (IS_ERR(mr)) {
> > +		err = PTR_ERR(mr);
> > +		goto error;
> > +	}
> > +
> > +	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
> > +
> > +	mr->umem = umem;
> > +	set_mr_fields(dev, mr, npages, length, access_flags);
> 
> After another series I have there will be three copies of this sequence :\

Maybe this can be made into a utility function?

> 
> > +	err = ib_umem_dmabuf_init_mapping(umem, mr);
> > +	if (err) {
> > +		dereg_mr(dev, mr);
> > +		return ERR_PTR(err);
> > +	}
> 
> Did you test the page fault path at all? Looks like some xarray code is missing here, and this is also missing the related complex teardown
> logic.
> 
> Does this mean you didn't test the pagefault_dmabuf_mr() at all?

Thanks for the hint. I was unable to get the test runs reaching the
pagefault_dmabuf_mr() function. Now I have found the reason. Along
the path of all the page fault handlers, the array "odp_mkeys" is checked
against the mr key. Since the dmabuf mr keys are not in the list the
handler is never called.

On the other hand, it seems that pagefault_dmabuf_mr() is not needed at all.
The pagefault is gracefully handled by retrying until the work thread finished
programming the NIC.
  
> 
> Jason
Jason Gunthorpe Oct. 16, 2020, 6:58 p.m. UTC | #3
On Fri, Oct 16, 2020 at 06:40:01AM +0000, Xiong, Jianxin wrote:
> > > +	if (!mr)
> > > +		return -EINVAL;
> > > +
> > > +	return mlx5_ib_update_xlt(mr, 0, mr->npages, PAGE_SHIFT, flags); }
> > > +
> > > +static struct ib_umem_dmabuf_ops mlx5_ib_umem_dmabuf_ops = {
> > > +	.init = mlx5_ib_umem_dmabuf_xlt_init,
> > > +	.update = mlx5_ib_umem_dmabuf_xlt_update,
> > > +	.invalidate = mlx5_ib_umem_dmabuf_xlt_invalidate,
> > > +};
> > 
> > I'm not really convinced these should be ops, this is usually a bad design pattern.
> > 
> > Why do I need so much code to extract the sgl from the dma_buf? I would prefer the dma_buf layer simplify this, not by adding a wrapper
> > around it in the IB core code...
> > 
> 
> We just need a way to call a device specific function to update the NIC's translation
> table.  I considered three ways: (1) ops registered with ib_umem_get_dmabuf; 
> (2) a single function pointer registered with ib_umem_get_dmabuf; (3) a method 
> in 'struct ib_device'. Option (1) was chosen here with no strong reason. We could
> consolidate the three functions of the ops into one, but then we will need to 
> define commands or flags for different update operations.   

I'd rather the driver directly provide the dma_buf ops.. Inserting
layers that do nothing be call other layers is usually a bad idea. I
didn't look carefully yet at how that would be arranged.

> > > +	ncont = npages;
> > > +	order = ilog2(roundup_pow_of_two(ncont));
> > 
> > We still need to deal with contiguity here, this ncont/npages is just obfuscation.
> 
> Since the pages can move, we can't take advantage of contiguity here. This handling
> is similar to the ODP case. The variables 'ncont' and 'page_shift' here are not necessary.
> They are kept just for the sake of signifying the semantics of the following functions that
> use them.

Well, in this case we can manage it, and the performance boost is high
enough we need to. The work on mlx5 to do it is a bit inovlved though.
 
> > > +	err = ib_umem_dmabuf_init_mapping(umem, mr);
> > > +	if (err) {
> > > +		dereg_mr(dev, mr);
> > > +		return ERR_PTR(err);
> > > +	}
> > 
> > Did you test the page fault path at all? Looks like some xarray code is missing here, and this is also missing the related complex teardown
> > logic.
> > 
> > Does this mean you didn't test the pagefault_dmabuf_mr() at all?
> 
> Thanks for the hint. I was unable to get the test runs reaching the
> pagefault_dmabuf_mr() function. Now I have found the reason. Along
> the path of all the page fault handlers, the array "odp_mkeys" is checked
> against the mr key. Since the dmabuf mr keys are not in the list the
> handler is never called.
>
> On the other hand, it seems that pagefault_dmabuf_mr() is not needed at all.
> The pagefault is gracefully handled by retrying until the work thread finished
> programming the NIC.

This is a bug of some kind, pagefaults that can't find a mkey in the
xarray should cause completion with error.

Jason
Jianxin Xiong Oct. 16, 2020, 8:49 p.m. UTC | #4
> -----Original Message-----
> From: Jason Gunthorpe <jgg@ziepe.ca>
> Sent: Friday, October 16, 2020 11:58 AM
> To: Xiong, Jianxin <jianxin.xiong@intel.com>
> Cc: linux-rdma@vger.kernel.org; dri-devel@lists.freedesktop.org; Doug Ledford <dledford@redhat.com>; Leon Romanovsky
> <leon@kernel.org>; Sumit Semwal <sumit.semwal@linaro.org>; Christian Koenig <christian.koenig@amd.com>; Vetter, Daniel
> <daniel.vetter@intel.com>
> Subject: Re: [PATCH v5 4/5] RDMA/mlx5: Support dma-buf based userspace memory region
> 
> On Fri, Oct 16, 2020 at 06:40:01AM +0000, Xiong, Jianxin wrote:
> > > > +	if (!mr)
> > > > +		return -EINVAL;
> > > > +
> > > > +	return mlx5_ib_update_xlt(mr, 0, mr->npages, PAGE_SHIFT, flags);
> > > > +}
> > > > +
> > > > +static struct ib_umem_dmabuf_ops mlx5_ib_umem_dmabuf_ops = {
> > > > +	.init = mlx5_ib_umem_dmabuf_xlt_init,
> > > > +	.update = mlx5_ib_umem_dmabuf_xlt_update,
> > > > +	.invalidate = mlx5_ib_umem_dmabuf_xlt_invalidate,
> > > > +};
> > >
> > > I'm not really convinced these should be ops, this is usually a bad design pattern.
> > >
> > > Why do I need so much code to extract the sgl from the dma_buf? I
> > > would prefer the dma_buf layer simplify this, not by adding a wrapper around it in the IB core code...
> > >
> >
> > We just need a way to call a device specific function to update the
> > NIC's translation table.  I considered three ways: (1) ops registered
> > with ib_umem_get_dmabuf;
> > (2) a single function pointer registered with ib_umem_get_dmabuf; (3)
> > a method in 'struct ib_device'. Option (1) was chosen here with no
> > strong reason. We could consolidate the three functions of the ops into one, but then we will need to
> > define commands or flags for different update operations.
> 
> I'd rather the driver directly provide the dma_buf ops.. Inserting layers that do nothing be call other layers is usually a bad idea. I didn't look
> carefully yet at how that would be arranged.

I can work along that direction. One change I can see is that the umem_dmabuf structure
will need to be exposed to the device driver (currently it's private to the core).
 
> 
> > > > +	ncont = npages;
> > > > +	order = ilog2(roundup_pow_of_two(ncont));
> > >
> > > We still need to deal with contiguity here, this ncont/npages is just obfuscation.
> >
> > Since the pages can move, we can't take advantage of contiguity here.
> > This handling is similar to the ODP case. The variables 'ncont' and 'page_shift' here are not necessary.
> > They are kept just for the sake of signifying the semantics of the
> > following functions that use them.
> 
> Well, in this case we can manage it, and the performance boost is high enough we need to. The work on mlx5 to do it is a bit inovlved
> though.

Maybe as a future enhancement?

> 
> > > > +	err = ib_umem_dmabuf_init_mapping(umem, mr);
> > > > +	if (err) {
> > > > +		dereg_mr(dev, mr);
> > > > +		return ERR_PTR(err);
> > > > +	}
> > >
> > > Did you test the page fault path at all? Looks like some xarray code
> > > is missing here, and this is also missing the related complex teardown logic.
> > >
> > > Does this mean you didn't test the pagefault_dmabuf_mr() at all?
> >
> > Thanks for the hint. I was unable to get the test runs reaching the
> > pagefault_dmabuf_mr() function. Now I have found the reason. Along the
> > path of all the page fault handlers, the array "odp_mkeys" is checked
> > against the mr key. Since the dmabuf mr keys are not in the list the
> > handler is never called.
> >
> > On the other hand, it seems that pagefault_dmabuf_mr() is not needed at all.
> > The pagefault is gracefully handled by retrying until the work thread
> > finished programming the NIC.
> 
> This is a bug of some kind, pagefaults that can't find a mkey in the xarray should cause completion with error.
> 
> Jason
diff mbox series

Patch

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 89e04ca..ec4ad2f 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1,6 +1,7 @@ 
 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
 /*
  * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved.
+ * Copyright (c) 2020, Intel Corporation. All rights reserved.
  */
 
 #include <linux/debugfs.h>
@@ -4060,6 +4061,7 @@  static int mlx5_ib_enable_driver(struct ib_device *dev)
 	.query_srq = mlx5_ib_query_srq,
 	.query_ucontext = mlx5_ib_query_ucontext,
 	.reg_user_mr = mlx5_ib_reg_user_mr,
+	.reg_user_mr_dmabuf = mlx5_ib_reg_user_mr_dmabuf,
 	.req_notify_cq = mlx5_ib_arm_cq,
 	.rereg_user_mr = mlx5_ib_rereg_user_mr,
 	.resize_cq = mlx5_ib_resize_cq,
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index b1f2b34..65fcc18 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1,6 +1,7 @@ 
 /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
  * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved.
+ * Copyright (c) 2020, Intel Corporation. All rights reserved.
  */
 
 #ifndef MLX5_IB_H
@@ -1174,6 +1175,10 @@  int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				  u64 virt_addr, int access_flags,
 				  struct ib_udata *udata);
+struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start,
+					 u64 length, u64 virt_addr,
+					 int dmabuf_fd, int access_flags,
+					 struct ib_udata *udata);
 int mlx5_ib_advise_mr(struct ib_pd *pd,
 		      enum ib_uverbs_advise_mr_advice advice,
 		      u32 flags,
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index b261797..24750f1 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1,5 +1,6 @@ 
 /*
  * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2020, Intel Corporation. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -1462,6 +1463,124 @@  struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	return ERR_PTR(err);
 }
 
+static int mlx5_ib_umem_dmabuf_xlt_init(struct ib_umem *umem, void *context)
+{
+	struct mlx5_ib_mr *mr = context;
+	int flags = MLX5_IB_UPD_XLT_ENABLE;
+
+	if (!mr)
+		return -EINVAL;
+
+	return mlx5_ib_update_xlt(mr, 0, mr->npages, PAGE_SHIFT, flags);
+}
+
+static int mlx5_ib_umem_dmabuf_xlt_update(struct ib_umem *umem, void *context)
+{
+	struct mlx5_ib_mr *mr = context;
+	int flags = MLX5_IB_UPD_XLT_ATOMIC;
+
+	if (!mr)
+		return -EINVAL;
+
+	return mlx5_ib_update_xlt(mr, 0, mr->npages, PAGE_SHIFT, flags);
+}
+
+static int mlx5_ib_umem_dmabuf_xlt_invalidate(struct ib_umem *umem, void *context)
+{
+	struct mlx5_ib_mr *mr = context;
+	int flags = MLX5_IB_UPD_XLT_ZAP | MLX5_IB_UPD_XLT_ATOMIC;
+
+	if (!mr)
+		return -EINVAL;
+
+	return mlx5_ib_update_xlt(mr, 0, mr->npages, PAGE_SHIFT, flags);
+}
+
+static struct ib_umem_dmabuf_ops mlx5_ib_umem_dmabuf_ops = {
+	.init = mlx5_ib_umem_dmabuf_xlt_init,
+	.update = mlx5_ib_umem_dmabuf_xlt_update,
+	.invalidate = mlx5_ib_umem_dmabuf_xlt_invalidate,
+};
+
+struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start,
+					 u64 length, u64 virt_addr,
+					 int dmabuf_fd, int access_flags,
+					 struct ib_udata *udata)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct mlx5_ib_mr *mr = NULL;
+	struct ib_umem *umem;
+	int page_shift;
+	int npages;
+	int ncont;
+	int order;
+	int err;
+
+	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	mlx5_ib_dbg(dev,
+		    "start 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n",
+		    start, virt_addr, length, dmabuf_fd, access_flags);
+
+	if (!mlx5_ib_can_load_pas_with_umr(dev, length))
+		return ERR_PTR(-EINVAL);
+
+	umem = ib_umem_dmabuf_get(&dev->ib_dev, start, length, dmabuf_fd,
+				  access_flags, &mlx5_ib_umem_dmabuf_ops);
+	if (IS_ERR(umem)) {
+		mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(umem));
+		return ERR_PTR(PTR_ERR(umem));
+	}
+
+	npages = ib_umem_num_pages(umem);
+	if (!npages) {
+		mlx5_ib_warn(dev, "avoid zero region\n");
+		ib_umem_release(umem);
+		return ERR_PTR(-EINVAL);
+	}
+
+	page_shift = PAGE_SHIFT;
+	ncont = npages;
+	order = ilog2(roundup_pow_of_two(ncont));
+
+	mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n",
+		    npages, ncont, order, page_shift);
+
+	mr = alloc_mr_from_cache(pd, umem, virt_addr, length, ncont,
+				 page_shift, order, access_flags);
+	if (IS_ERR(mr))
+		mr = NULL;
+
+	if (!mr) {
+		mutex_lock(&dev->slow_path_mutex);
+		mr = reg_create(NULL, pd, virt_addr, length, umem, ncont,
+				page_shift, access_flags, false);
+		mutex_unlock(&dev->slow_path_mutex);
+	}
+
+	if (IS_ERR(mr)) {
+		err = PTR_ERR(mr);
+		goto error;
+	}
+
+	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
+
+	mr->umem = umem;
+	set_mr_fields(dev, mr, npages, length, access_flags);
+
+	err = ib_umem_dmabuf_init_mapping(umem, mr);
+	if (err) {
+		dereg_mr(dev, mr);
+		return ERR_PTR(err);
+	}
+
+	return &mr->ibmr;
+error:
+	ib_umem_release(umem);
+	return ERR_PTR(err);
+}
+
 /**
  * mlx5_mr_cache_invalidate - Fence all DMA on the MR
  * @mr: The MR to fence
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 5c853ec..16e2e51 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -801,6 +801,44 @@  static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
  * Returns:
  *  -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are
  *           not accessible, or the MR is no longer valid.
+ *  -EAGAIN: The operation should be retried
+ *
+ *  >0: Number of pages mapped
+ */
+static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, struct ib_umem *umem,
+			       u64 io_virt, size_t bcnt, u32 *bytes_mapped,
+			       u32 flags)
+{
+	u64 user_va;
+	u64 end;
+	int npages;
+
+	if (unlikely(io_virt < mr->mmkey.iova))
+		return -EFAULT;
+	if (check_add_overflow(io_virt - mr->mmkey.iova,
+			       (u64)umem->address, &user_va))
+		return -EFAULT;
+
+	/* Overflow has alreddy been checked at the umem creation time */
+	end = umem->address + umem->length;
+	if (unlikely(user_va >= end || end  - user_va < bcnt))
+		return -EFAULT;
+
+	if (!ib_umem_dmabuf_mapping_ready(umem))
+		return -EAGAIN;
+
+	if (bytes_mapped)
+		*bytes_mapped += bcnt;
+
+	npages = (ALIGN(user_va + bcnt, PAGE_SIZE) -
+		 ALIGN_DOWN(user_va, PAGE_SIZE)) >> PAGE_SHIFT;
+	return npages;
+}
+
+/*
+ * Returns:
+ *  -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are
+ *           not accessible, or the MR is no longer valid.
  *  -EAGAIN/-ENOMEM: The operation should be retried
  *
  *  -EINVAL/others: General internal malfunction
@@ -811,6 +849,10 @@  static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
 {
 	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
 
+	if (mr->umem->is_dmabuf)
+		return pagefault_dmabuf_mr(mr, mr->umem, io_virt, bcnt,
+					   bytes_mapped, flags);
+
 	lockdep_assert_held(&mr->dev->odp_srcu);
 	if (unlikely(io_virt < mr->mmkey.iova))
 		return -EFAULT;