
[RFC,v3,1/4] RDMA/umem: Support importing dma-buf as user memory region

Message ID 1601838751-148544-2-git-send-email-jianxin.xiong@intel.com (mailing list archive)
State New, archived
Series RDMA: Add dma-buf support

Commit Message

Xiong, Jianxin Oct. 4, 2020, 7:12 p.m. UTC
Dma-buf is a standard cross-driver buffer sharing mechanism that can be
used to support peer-to-peer access from RDMA devices.

Device memory exported via dma-buf is associated with a file descriptor.
This is passed to the user space as a property associated with the
buffer allocation. When the buffer is registered as a memory region,
the file descriptor is passed to the RDMA driver along with other
parameters.

Implement the common code for importing dma-buf object and mapping
dma-buf pages.

Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
Reviewed-by: Sean Hefty <sean.hefty@intel.com>
Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
---
 drivers/infiniband/core/Makefile      |   2 +-
 drivers/infiniband/core/umem.c        |   4 +
 drivers/infiniband/core/umem_dmabuf.c | 291 ++++++++++++++++++++++++++++++++++
 drivers/infiniband/core/umem_dmabuf.h |  14 ++
 drivers/infiniband/core/umem_odp.c    |  12 ++
 include/rdma/ib_umem.h                |  19 ++-
 6 files changed, 340 insertions(+), 2 deletions(-)
 create mode 100644 drivers/infiniband/core/umem_dmabuf.c
 create mode 100644 drivers/infiniband/core/umem_dmabuf.h
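
For orientation, here is a minimal sketch of how a device driver might use the
new helper when registering a dma-buf memory region. Only ib_umem_dmabuf_get()
and ib_umem_release() come from this series; the my_* structures and
my_mr_notifier_ops are hypothetical placeholders for the driver-specific parts.

/* Hypothetical driver-side registration path built on this patch. */
static int my_reg_dmabuf_mr(struct my_mr *mr, struct ib_device *ibdev,
			    unsigned long addr, size_t length,
			    int dmabuf_fd, int access)
{
	struct ib_umem *umem;

	/* my_mr_notifier_ops->invalidate would zap the HW translation */
	umem = ib_umem_dmabuf_get(ibdev, addr, length, dmabuf_fd, access,
				  &my_mr_notifier_ops);
	if (IS_ERR(umem))
		return PTR_ERR(umem);

	mr->umem = umem;
	/* map the dma-buf pages and program the HW translation table here */
	return 0;
}

static void my_dereg_dmabuf_mr(struct my_mr *mr)
{
	/* detaches the dma-buf attachment and frees the umem */
	ib_umem_release(mr->umem);
}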

Comments

Christian König Oct. 5, 2020, 10:54 a.m. UTC | #1
Hi Jianxin,

Am 04.10.20 um 21:12 schrieb Jianxin Xiong:
> Dma-buf is a standard cross-driver buffer sharing mechanism that can be
> used to support peer-to-peer access from RDMA devices.
>
> Device memory exported via dma-buf is associated with a file descriptor.
> This is passed to the user space as a property associated with the
> buffer allocation. When the buffer is registered as a memory region,
> the file descriptor is passed to the RDMA driver along with other
> parameters.
>
> Implement the common code for importing dma-buf object and mapping
> dma-buf pages.
>
> Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
> Reviewed-by: Sean Hefty <sean.hefty@intel.com>
> Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>

Well, first of all, really nice work you have done here.

Since I'm not an expert on RDMA or its drivers I can't really review any 
of that part.

But at least from the DMA-buf side it looks like you are using the 
interface correctly as intended.

So feel free to add an Acked-by: Christian König 
<christian.koenig@amd.com> if it helps.

Thanks,
Christian.

> ---
>   drivers/infiniband/core/Makefile      |   2 +-
>   drivers/infiniband/core/umem.c        |   4 +
>   drivers/infiniband/core/umem_dmabuf.c | 291 ++++++++++++++++++++++++++++++++++
>   drivers/infiniband/core/umem_dmabuf.h |  14 ++
>   drivers/infiniband/core/umem_odp.c    |  12 ++
>   include/rdma/ib_umem.h                |  19 ++-
>   6 files changed, 340 insertions(+), 2 deletions(-)
>   create mode 100644 drivers/infiniband/core/umem_dmabuf.c
>   create mode 100644 drivers/infiniband/core/umem_dmabuf.h
>
> diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
> index 24cb71a..b8d51a7 100644
> --- a/drivers/infiniband/core/Makefile
> +++ b/drivers/infiniband/core/Makefile
> @@ -40,5 +40,5 @@ ib_uverbs-y :=			uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
>   				uverbs_std_types_srq.o \
>   				uverbs_std_types_wq.o \
>   				uverbs_std_types_qp.o
> -ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
> +ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o umem_dmabuf.o
>   ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o
> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
> index 831bff8..59ec36c 100644
> --- a/drivers/infiniband/core/umem.c
> +++ b/drivers/infiniband/core/umem.c
> @@ -2,6 +2,7 @@
>    * Copyright (c) 2005 Topspin Communications.  All rights reserved.
>    * Copyright (c) 2005 Cisco Systems.  All rights reserved.
>    * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
> + * Copyright (c) 2020 Intel Corporation. All rights reserved.
>    *
>    * This software is available to you under a choice of one of two
>    * licenses.  You may choose to be licensed under the terms of the GNU
> @@ -42,6 +43,7 @@
>   #include <rdma/ib_umem_odp.h>
>   
>   #include "uverbs.h"
> +#include "umem_dmabuf.h"
>   
>   static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
>   {
> @@ -318,6 +320,8 @@ void ib_umem_release(struct ib_umem *umem)
>   {
>   	if (!umem)
>   		return;
> +	if (umem->is_dmabuf)
> +		return ib_umem_dmabuf_release(umem);
>   	if (umem->is_odp)
>   		return ib_umem_odp_release(to_ib_umem_odp(umem));
>   
> diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c
> new file mode 100644
> index 0000000..10ed646
> --- /dev/null
> +++ b/drivers/infiniband/core/umem_dmabuf.c
> @@ -0,0 +1,291 @@
> +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
> +/*
> + * Copyright (c) 2020 Intel Corporation. All rights reserved.
> + */
> +
> +#include <linux/dma-buf.h>
> +#include <linux/dma-resv.h>
> +#include <linux/dma-mapping.h>
> +#include <rdma/ib_umem_odp.h>
> +
> +#include "uverbs.h"
> +
> +struct ib_umem_dmabuf {
> +	struct ib_umem_odp umem_odp;
> +	struct dma_buf_attachment *attach;
> +	struct sg_table *sgt;
> +	atomic_t notifier_seq;
> +};
> +
> +static inline struct ib_umem_dmabuf *to_ib_umem_dmabuf(struct ib_umem *umem)
> +{
> +	struct ib_umem_odp *umem_odp = to_ib_umem_odp(umem);
> +	return container_of(umem_odp, struct ib_umem_dmabuf, umem_odp);
> +}
> +
> +static void ib_umem_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
> +{
> +	struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
> +	struct ib_umem_odp *umem_odp = &umem_dmabuf->umem_odp;
> +	struct mmu_notifier_range range = {};
> +	unsigned long current_seq;
> +
> +	/* no concurrent invalidation due to the dma_resv lock */
> +
> +	dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
> +
> +	if (!umem_dmabuf->sgt)
> +		return;
> +
> +	range.start = ib_umem_start(umem_odp);
> +	range.end = ib_umem_end(umem_odp);
> +	range.flags = MMU_NOTIFIER_RANGE_BLOCKABLE;
> +	current_seq = atomic_read(&umem_dmabuf->notifier_seq);
> +	umem_odp->notifier.ops->invalidate(&umem_odp->notifier, &range,
> +					   current_seq);
> +
> +	atomic_inc(&umem_dmabuf->notifier_seq);
> +}
> +
> +static struct dma_buf_attach_ops ib_umem_dmabuf_attach_ops = {
> +	.allow_peer2peer = 1,
> +	.move_notify = ib_umem_dmabuf_invalidate_cb,
> +};
> +
> +static inline int ib_umem_dmabuf_init_odp(struct ib_umem_odp *umem_odp)
> +{
> +	size_t page_size = 1UL << umem_odp->page_shift;
> +	unsigned long start;
> +	unsigned long end;
> +	size_t pages;
> +
> +	umem_odp->umem.is_odp = 1;
> +	mutex_init(&umem_odp->umem_mutex);
> +
> +	start = ALIGN_DOWN(umem_odp->umem.address, page_size);
> +	if (check_add_overflow(umem_odp->umem.address,
> +			       (unsigned long)umem_odp->umem.length,
> +			       &end))
> +		return -EOVERFLOW;
> +	end = ALIGN(end, page_size);
> +	if (unlikely(end < page_size))
> +		return -EOVERFLOW;
> +
> +	pages = (end - start) >> umem_odp->page_shift;
> +	if (!pages)
> +		return -EINVAL;
> +
> +	/* used for ib_umem_start() & ib_umem_end() */
> +	umem_odp->notifier.interval_tree.start = start;
> +	umem_odp->notifier.interval_tree.last = end - 1;
> +
> +	/* umem_odp->page_list is never used for dma-buf */
> +
> +	umem_odp->dma_list = kvcalloc(
> +		pages, sizeof(*umem_odp->dma_list), GFP_KERNEL);
> +	if (!umem_odp->dma_list)
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +
> +struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> +				   unsigned long addr, size_t size,
> +				   int dmabuf_fd, int access,
> +				   const struct mmu_interval_notifier_ops *ops)
> +{
> +	struct dma_buf *dmabuf;
> +	struct ib_umem_dmabuf *umem_dmabuf;
> +	struct ib_umem *umem;
> +	struct ib_umem_odp *umem_odp;
> +	unsigned long end;
> +	long ret;
> +
> +	if (check_add_overflow(addr, size, &end))
> +		return ERR_PTR(-EINVAL);
> +
> +	if (unlikely(PAGE_ALIGN(end) < PAGE_SIZE))
> +		return ERR_PTR(-EINVAL);
> +
> +	umem_dmabuf = kzalloc(sizeof(*umem_dmabuf), GFP_KERNEL);
> +	if (!umem_dmabuf)
> +		return ERR_PTR(-ENOMEM);
> +
> +	umem = &umem_dmabuf->umem_odp.umem;
> +	umem->ibdev = device;
> +	umem->length = size;
> +	umem->address = addr;
> +	umem->writable = ib_access_writable(access);
> +	umem->is_dmabuf = 1;
> +
> +	dmabuf = dma_buf_get(dmabuf_fd);
> +	if (IS_ERR(dmabuf)) {
> +		ret = PTR_ERR(dmabuf);
> +		goto out_free_umem;
> +	}
> +
> +	/* always attach dynamically to pass the allow_peer2peer flag */
> +	umem_dmabuf->attach = dma_buf_dynamic_attach(
> +					dmabuf,
> +					device->dma_device,
> +					&ib_umem_dmabuf_attach_ops,
> +					umem_dmabuf);
> +	if (IS_ERR(umem_dmabuf->attach)) {
> +		ret = PTR_ERR(umem_dmabuf->attach);
> +		goto out_release_dmabuf;
> +	}
> +
> +	umem_odp = &umem_dmabuf->umem_odp;
> +	umem_odp->page_shift = PAGE_SHIFT;
> +	if (access & IB_ACCESS_HUGETLB) {
> +		/* don't support huge_tlb at this point */
> +		ret = -EINVAL;
> +		goto out_detach_dmabuf;
> +	}
> +
> +	ret = ib_umem_dmabuf_init_odp(umem_odp);
> +	if (ret)
> +		goto out_detach_dmabuf;
> +
> +	umem_odp->notifier.ops = ops;
> +	return umem;
> +
> +out_detach_dmabuf:
> +	dma_buf_detach(dmabuf, umem_dmabuf->attach);
> +
> +out_release_dmabuf:
> +	dma_buf_put(dmabuf);
> +
> +out_free_umem:
> +	kfree(umem_dmabuf);
> +	return ERR_PTR(ret);
> +}
> +EXPORT_SYMBOL(ib_umem_dmabuf_get);
> +
> +unsigned long ib_umem_dmabuf_notifier_read_begin(struct ib_umem_odp *umem_odp)
> +{
> +	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(&umem_odp->umem);
> +
> +	return atomic_read(&umem_dmabuf->notifier_seq);
> +}
> +EXPORT_SYMBOL(ib_umem_dmabuf_notifier_read_begin);
> +
> +int ib_umem_dmabuf_notifier_read_retry(struct ib_umem_odp *umem_odp,
> +				       unsigned long current_seq)
> +{
> +	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(&umem_odp->umem);
> +
> +	return (atomic_read(&umem_dmabuf->notifier_seq) != current_seq);
> +}
> +EXPORT_SYMBOL(ib_umem_dmabuf_notifier_read_retry);
> +
> +int ib_umem_dmabuf_map_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
> +			     u64 access_mask, unsigned long current_seq)
> +{
> +	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(umem);
> +	struct ib_umem_odp *umem_odp = &umem_dmabuf->umem_odp;
> +	u64 start, end, addr;
> +	int j, k, ret = 0, user_pages, pages, total_pages;
> +	unsigned int page_shift;
> +	size_t page_size;
> +	struct scatterlist *sg;
> +	struct sg_table *sgt;
> +
> +	if (access_mask == 0)
> +		return -EINVAL;
> +
> +	if (user_virt < ib_umem_start(umem_odp) ||
> +	    user_virt + bcnt > ib_umem_end(umem_odp))
> +		return -EFAULT;
> +
> +	page_shift = umem_odp->page_shift;
> +	page_size = 1UL << page_shift;
> +	start = ALIGN_DOWN(user_virt, page_size);
> +	end = ALIGN(user_virt + bcnt, page_size);
> +	user_pages = (end - start) >> page_shift;
> +
> +	mutex_lock(&umem_odp->umem_mutex);
> +
> +	/* check for ongoing invalidations */
> +	if (ib_umem_dmabuf_notifier_read_retry(umem_odp, current_seq)) {
> +		ret = -EAGAIN;
> +		goto out;
> +	}
> +
> +	ret = user_pages;
> +	if (umem_dmabuf->sgt)
> +		goto out;
> +
> +	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
> +	sgt = dma_buf_map_attachment(umem_dmabuf->attach,
> +				     DMA_BIDIRECTIONAL);
> +	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
> +
> +	if (IS_ERR(sgt)) {
> +		ret = PTR_ERR(sgt);
> +		goto out;
> +	}
> +
> +	umem->sg_head = *sgt;
> +	umem->nmap = sgt->nents;
> +	umem_dmabuf->sgt = sgt;
> +
> +	k = 0;
> +	total_pages = ib_umem_odp_num_pages(umem_odp);
> +	for_each_sg(umem->sg_head.sgl, sg, umem->sg_head.nents, j) {
> +		addr = sg_dma_address(sg);
> +		pages = sg_dma_len(sg) >> page_shift;
> +		while (pages > 0 && k < total_pages) {
> +			umem_odp->dma_list[k++] = addr | access_mask;
> +			umem_odp->npages++;
> +			addr += page_size;
> +			pages--;
> +		}
> +	}
> +
> +	WARN_ON(k != total_pages);
> +
> +out:
> +	mutex_unlock(&umem_odp->umem_mutex);
> +	return ret;
> +}
> +
> +void ib_umem_dmabuf_unmap_pages(struct ib_umem *umem)
> +{
> +	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(umem);
> +	struct ib_umem_odp *umem_odp = &umem_dmabuf->umem_odp;
> +	int npages = ib_umem_odp_num_pages(umem_odp);
> +	int i;
> +
> +	lockdep_assert_held(&umem_odp->umem_mutex);
> +	dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
> +
> +	if (!umem_dmabuf->sgt)
> +		return;
> +
> +	dma_buf_unmap_attachment(umem_dmabuf->attach, umem_dmabuf->sgt,
> +				 DMA_BIDIRECTIONAL);
> +
> +	umem_dmabuf->sgt = NULL;
> +
> +	for (i = 0; i < npages; i++)
> +		umem_odp->dma_list[i] = 0;
> +	umem_odp->npages = 0;
> +}
> +
> +void ib_umem_dmabuf_release(struct ib_umem *umem)
> +{
> +	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(umem);
> +	struct ib_umem_odp *umem_odp = &umem_dmabuf->umem_odp;
> +	struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf;
> +
> +	mutex_lock(&umem_odp->umem_mutex);
> +	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
> +	ib_umem_dmabuf_unmap_pages(umem);
> +	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
> +	mutex_unlock(&umem_odp->umem_mutex);
> +	kvfree(umem_odp->dma_list);
> +	dma_buf_detach(dmabuf, umem_dmabuf->attach);
> +	dma_buf_put(dmabuf);
> +	kfree(umem_dmabuf);
> +}
> diff --git a/drivers/infiniband/core/umem_dmabuf.h b/drivers/infiniband/core/umem_dmabuf.h
> new file mode 100644
> index 0000000..b9378bd
> --- /dev/null
> +++ b/drivers/infiniband/core/umem_dmabuf.h
> @@ -0,0 +1,14 @@
> +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
> +/*
> + * Copyright (c) 2020 Intel Corporation. All rights reserved.
> + */
> +
> +#ifndef UMEM_DMABUF_H
> +#define UMEM_DMABUF_H
> +
> +int ib_umem_dmabuf_map_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
> +			     u64 access_mask, unsigned long current_seq);
> +void ib_umem_dmabuf_unmap_pages(struct ib_umem *umem);
> +void ib_umem_dmabuf_release(struct ib_umem *umem);
> +
> +#endif /* UMEM_DMABUF_H */
> diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
> index cc6b4be..7e11619 100644
> --- a/drivers/infiniband/core/umem_odp.c
> +++ b/drivers/infiniband/core/umem_odp.c
> @@ -1,5 +1,6 @@
>   /*
>    * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
> + * Copyright (c) 2020 Intel Corporation. All rights reserved.
>    *
>    * This software is available to you under a choice of one of two
>    * licenses.  You may choose to be licensed under the terms of the GNU
> @@ -47,6 +48,7 @@
>   #include <rdma/ib_umem_odp.h>
>   
>   #include "uverbs.h"
> +#include "umem_dmabuf.h"
>   
>   static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
>   				   const struct mmu_interval_notifier_ops *ops)
> @@ -263,6 +265,9 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device,
>   
>   void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
>   {
> +	if (umem_odp->umem.is_dmabuf)
> +		return ib_umem_dmabuf_release(&umem_odp->umem);
> +
>   	/*
>   	 * Ensure that no more pages are mapped in the umem.
>   	 *
> @@ -392,6 +397,10 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
>   	unsigned int flags = 0, page_shift;
>   	phys_addr_t p = 0;
>   
> +	if (umem_odp->umem.is_dmabuf)
> +		return ib_umem_dmabuf_map_pages(&umem_odp->umem, user_virt,
> +						bcnt, access_mask, current_seq);
> +
>   	if (access_mask == 0)
>   		return -EINVAL;
>   
> @@ -517,6 +526,9 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
>   	u64 addr;
>   	struct ib_device *dev = umem_odp->umem.ibdev;
>   
> +	if (umem_odp->umem.is_dmabuf)
> +		return ib_umem_dmabuf_unmap_pages(&umem_odp->umem);
> +
>   	lockdep_assert_held(&umem_odp->umem_mutex);
>   
>   	virt = max_t(u64, virt, ib_umem_start(umem_odp));
> diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
> index 71f573a..b8ea693 100644
> --- a/include/rdma/ib_umem.h
> +++ b/include/rdma/ib_umem.h
> @@ -1,6 +1,7 @@
>   /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
>   /*
>    * Copyright (c) 2007 Cisco Systems.  All rights reserved.
> + * Copyright (c) 2020 Intel Corporation.  All rights reserved.
>    */
>   
>   #ifndef IB_UMEM_H
> @@ -13,6 +14,7 @@
>   
>   struct ib_ucontext;
>   struct ib_umem_odp;
> +struct ib_umem_dmabuf;
>   
>   struct ib_umem {
>   	struct ib_device       *ibdev;
> @@ -21,6 +23,7 @@ struct ib_umem {
>   	unsigned long		address;
>   	u32 writable : 1;
>   	u32 is_odp : 1;
> +	u32 is_dmabuf : 1;
>   	struct work_struct	work;
>   	struct sg_table sg_head;
>   	int             nmap;
> @@ -51,6 +54,13 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
>   unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
>   				     unsigned long pgsz_bitmap,
>   				     unsigned long virt);
> +struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> +				   unsigned long addr, size_t size,
> +				   int dmabuf_fd, int access,
> +				   const struct mmu_interval_notifier_ops *ops);
> +unsigned long ib_umem_dmabuf_notifier_read_begin(struct ib_umem_odp *umem_odp);
> +int ib_umem_dmabuf_notifier_read_retry(struct ib_umem_odp *umem_odp,
> +				       unsigned long current_seq);
>   
>   #else /* CONFIG_INFINIBAND_USER_MEM */
>   
> @@ -73,7 +83,14 @@ static inline int ib_umem_find_best_pgsz(struct ib_umem *umem,
>   					 unsigned long virt) {
>   	return -EINVAL;
>   }
> +static inline struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
> +						 unsigned long addr,
> +						 size_t size, int dmabuf_fd,
> +						 int access,
> +						 struct mmu_interval_notifier_ops *ops)
> +{
> +	return ERR_PTR(-EINVAL);
> +}
>   
>   #endif /* CONFIG_INFINIBAND_USER_MEM */
> -
>   #endif /* IB_UMEM_H */
Jason Gunthorpe Oct. 5, 2020, 1:13 p.m. UTC | #2
On Sun, Oct 04, 2020 at 12:12:28PM -0700, Jianxin Xiong wrote:
> Dma-buf is a standard cross-driver buffer sharing mechanism that can be
> used to support peer-to-peer access from RDMA devices.
> 
> Device memory exported via dma-buf is associated with a file descriptor.
> This is passed to the user space as a property associated with the
> buffer allocation. When the buffer is registered as a memory region,
> the file descriptor is passed to the RDMA driver along with other
> parameters.
> 
> Implement the common code for importing dma-buf object and mapping
> dma-buf pages.
> 
> Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
> Reviewed-by: Sean Hefty <sean.hefty@intel.com>
> Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> ---
>  drivers/infiniband/core/Makefile      |   2 +-
>  drivers/infiniband/core/umem.c        |   4 +
>  drivers/infiniband/core/umem_dmabuf.c | 291 ++++++++++++++++++++++++++++++++++
>  drivers/infiniband/core/umem_dmabuf.h |  14 ++
>  drivers/infiniband/core/umem_odp.c    |  12 ++
>  include/rdma/ib_umem.h                |  19 ++-
>  6 files changed, 340 insertions(+), 2 deletions(-)
>  create mode 100644 drivers/infiniband/core/umem_dmabuf.c
>  create mode 100644 drivers/infiniband/core/umem_dmabuf.h

I think this is using ODP too literally; dmabuf isn't going to need
fine-grained page faults, and I'm not sure this locking scheme is OK -
ODP is horrifically complicated.

If this is the approach then I think we should make dmabuf its own
stand-alone API, reg_user_mr_dmabuf().

The implementation in mlx5 will be much more understandable; it would
just do dma_buf_dynamic_attach() and program the XLT exactly the same
as a normal umem.

The move_notify() simply zaps the XLT and triggers a work to reload
it after the move. Locking is provided by the dma_resv_lock. Only a
small disruption to the page fault handler is needed.
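
A minimal sketch of that move_notify shape, with hypothetical helper names
(my_mr_zap_translation and the reload_work member) standing in for the
driver-specific parts:

static void my_mr_move_notify(struct dma_buf_attachment *attach)
{
	struct my_dmabuf_mr *mr = attach->importer_priv;

	/* the exporter calls this with the dma_resv lock already held */
	dma_resv_assert_held(attach->dmabuf->resv);

	/* invalidate the HW translation (zap the XLT) ... */
	my_mr_zap_translation(mr);

	/* ... and queue a reload for after the move has finished */
	queue_work(system_unbound_wq, &mr->reload_work);
}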

> +	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
> +	sgt = dma_buf_map_attachment(umem_dmabuf->attach,
> +				     DMA_BIDIRECTIONAL);
> +	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);

This doesn't look right; this lock has to be held until the HW is
programmed.

The use of atomic looks probably wrong as well.

> +	k = 0;
> +	total_pages = ib_umem_odp_num_pages(umem_odp);
> +	for_each_sg(umem->sg_head.sgl, sg, umem->sg_head.nents, j) {
> +		addr = sg_dma_address(sg);
> +		pages = sg_dma_len(sg) >> page_shift;
> +		while (pages > 0 && k < total_pages) {
> +			umem_odp->dma_list[k++] = addr | access_mask;
> +			umem_odp->npages++;
> +			addr += page_size;
> +			pages--;

This isn't fragmenting the sg into a page list properly; it won't work
for unaligned things.
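
A sketch of segment fragmentation that does not assume the DMA address or
length of each segment is page aligned (purely illustrative, reusing the
variable names from the patch):

	k = 0;
	for_each_sg(umem->sg_head.sgl, sg, umem->sg_head.nents, j) {
		u64 seg_start = ALIGN_DOWN(sg_dma_address(sg), page_size);
		u64 seg_end = ALIGN(sg_dma_address(sg) + sg_dma_len(sg),
				    page_size);

		for (addr = seg_start; addr < seg_end && k < total_pages;
		     addr += page_size) {
			umem_odp->dma_list[k++] = addr | access_mask;
			umem_odp->npages++;
		}
	}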

And really we don't need the dma_list for this case; with a fixed
whole-mapping DMA SGL, a normal umem sgl is OK and the normal umem XLT
programming in mlx5 is fine.

Jason
Xiong, Jianxin Oct. 5, 2020, 4:18 p.m. UTC | #3
> -----Original Message-----
> From: Jason Gunthorpe <jgg@ziepe.ca>
> Sent: Monday, October 05, 2020 6:13 AM
> To: Xiong, Jianxin <jianxin.xiong@intel.com>
> Cc: linux-rdma@vger.kernel.org; dri-devel@lists.freedesktop.org; Doug Ledford <dledford@redhat.com>; Leon Romanovsky
> <leon@kernel.org>; Sumit Semwal <sumit.semwal@linaro.org>; Christian Koenig <christian.koenig@amd.com>; Vetter, Daniel
> <daniel.vetter@intel.com>
> Subject: Re: [RFC PATCH v3 1/4] RDMA/umem: Support importing dma-buf as user memory region
> 
> On Sun, Oct 04, 2020 at 12:12:28PM -0700, Jianxin Xiong wrote:
> > Dma-buf is a standard cross-driver buffer sharing mechanism that can
> > be used to support peer-to-peer access from RDMA devices.
> >
> > Device memory exported via dma-buf is associated with a file descriptor.
> > This is passed to the user space as a property associated with the
> > buffer allocation. When the buffer is registered as a memory region,
> > the file descriptor is passed to the RDMA driver along with other
> > parameters.
> >
> > Implement the common code for importing dma-buf object and mapping
> > dma-buf pages.
> >
> > Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
> > Reviewed-by: Sean Hefty <sean.hefty@intel.com>
> > Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> > ---
> >  drivers/infiniband/core/Makefile      |   2 +-
> >  drivers/infiniband/core/umem.c        |   4 +
> >  drivers/infiniband/core/umem_dmabuf.c | 291
> > ++++++++++++++++++++++++++++++++++
> >  drivers/infiniband/core/umem_dmabuf.h |  14 ++
> >  drivers/infiniband/core/umem_odp.c    |  12 ++
> >  include/rdma/ib_umem.h                |  19 ++-
> >  6 files changed, 340 insertions(+), 2 deletions(-)  create mode
> > 100644 drivers/infiniband/core/umem_dmabuf.c
> >  create mode 100644 drivers/infiniband/core/umem_dmabuf.h
> 
> I think this is using ODP too literally, dmabuf isn't going to need fine grained page faults, and I'm not sure this locking scheme is OK - ODP is
> horrifically complicated.
> 

> If this is the approach then I think we should make dmabuf its own stand alone API, reg_user_mr_dmabuf()

That's the original approach in the first version. We can go back there.

> 
> The implementation in mlx5 will be much more understandable, it would just do dma_buf_dynamic_attach() and program the XLT exactly
> the same as a normal umem.
> 
> The move_notify() simply zap's the XLT and triggers a work to reload it after the move. Locking is provided by the dma_resv_lock. Only a
> small disruption to the page fault handler is needed.
> 

We considered such a scheme but didn't go that way due to the lack of notification when the move is done, and thus the work wouldn't know when it could reload.

Now that I think about it again, we could probably signal the reload in the page fault handler.

> > +	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
> > +	sgt = dma_buf_map_attachment(umem_dmabuf->attach,
> > +				     DMA_BIDIRECTIONAL);
> > +	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
> 
> This doesn't look right, this lock has to be held up until the HW is programmed

The mapping remains valid until it is invalidated again. There is a sequence number check before programming the HW.

> 
> The use of atomic looks probably wrong as well.

Do you mean umem_dmabuf->notifier_seq? Could you elaborate on the concern?

> 
> > +	k = 0;
> > +	total_pages = ib_umem_odp_num_pages(umem_odp);
> > +	for_each_sg(umem->sg_head.sgl, sg, umem->sg_head.nents, j) {
> > +		addr = sg_dma_address(sg);
> > +		pages = sg_dma_len(sg) >> page_shift;
> > +		while (pages > 0 && k < total_pages) {
> > +			umem_odp->dma_list[k++] = addr | access_mask;
> > +			umem_odp->npages++;
> > +			addr += page_size;
> > +			pages--;
> 
> This isn't fragmenting the sg into a page list properly, won't work for unaligned things

I thought the addresses were aligned, but I will add explicit alignment handling here.

> 
> And really we don't need the dma_list for this case, with a fixed whole mapping DMA SGL a normal umem sgl is OK and the normal umem
> XLT programming in mlx5 is fine.

The dma_list is used by both "populate_mtt()" and "mlx5_ib_invalidate_range", which are used for XLT programming and invalidating (zapping), respectively.

> 
> Jason
Xiong, Jianxin Oct. 5, 2020, 4:19 p.m. UTC | #4
> -----Original Message-----
> From: Christian König <christian.koenig@amd.com>
> Sent: Monday, October 05, 2020 3:55 AM
> To: Xiong, Jianxin <jianxin.xiong@intel.com>; linux-rdma@vger.kernel.org; dri-devel@lists.freedesktop.org
> Cc: Doug Ledford <dledford@redhat.com>; Jason Gunthorpe <jgg@ziepe.ca>; Leon Romanovsky <leon@kernel.org>; Sumit Semwal
> <sumit.semwal@linaro.org>; Vetter, Daniel <daniel.vetter@intel.com>
> Subject: Re: [RFC PATCH v3 1/4] RDMA/umem: Support importing dma-buf as user memory region
> 
> Hi Jianxin,
> 
> Am 04.10.20 um 21:12 schrieb Jianxin Xiong:
> > Dma-buf is a standard cross-driver buffer sharing mechanism that can
> > be used to support peer-to-peer access from RDMA devices.
> >
> > Device memory exported via dma-buf is associated with a file descriptor.
> > This is passed to the user space as a property associated with the
> > buffer allocation. When the buffer is registered as a memory region,
> > the file descriptor is passed to the RDMA driver along with other
> > parameters.
> >
> > Implement the common code for importing dma-buf object and mapping
> > dma-buf pages.
> >
> > Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
> > Reviewed-by: Sean Hefty <sean.hefty@intel.com>
> > Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> 
> well first of all really nice work you have done here.
> 
> Since I'm not an expert on RDMA or its drivers I can't really review any of that part.
> 
> But at least from the DMA-buf side it looks like you are using the interface correctly as intended.
> 
> So feel free to add an Acked-by: Christian König <christian.koenig@amd.com> if it helps.
> 
> Thanks,
> Christian.

Thanks, will do.
Jason Gunthorpe Oct. 5, 2020, 4:33 p.m. UTC | #5
On Mon, Oct 05, 2020 at 04:18:11PM +0000, Xiong, Jianxin wrote:

> > The implementation in mlx5 will be much more understandable, it would just do dma_buf_dynamic_attach() and program the XLT exactly
> > the same as a normal umem.
> > 
> > The move_notify() simply zap's the XLT and triggers a work to reload it after the move. Locking is provided by the dma_resv_lock. Only a
> > small disruption to the page fault handler is needed.
> 
> We considered such scheme but didn't go that way due to the lack of
> notification when the move is done and thus the work wouldn't know
> when it can reload.

Well, the work would block on the reservation lock and that indicates
the move is done

It would be nicer if the dma_buf could provide an op indicating that
things are ready to go, though.

> Now I think it again, we could probably signal the reload in the page fault handler. 

This also works, with a performance cost

> > > +	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
> > > +	sgt = dma_buf_map_attachment(umem_dmabuf->attach,
> > > +				     DMA_BIDIRECTIONAL);
> > > +	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
> > 
> > This doesn't look right, this lock has to be held up until the HW is programmed
> 
> The mapping remains valid until being invalidated again. There is a
> sequence number checking before programming the HW.

It races: we could immediately trigger an invalidation and then
re-program the HW with this stale data.
 
> > The use of atomic looks probably wrong as well.
> 
> Do you mean umem_dmabuf->notifier_seq? Could you elaborate the concern?

It only increments once per invalidation; that is usually racy.

> > > +	total_pages = ib_umem_odp_num_pages(umem_odp);
> > > +	for_each_sg(umem->sg_head.sgl, sg, umem->sg_head.nents, j) {
> > > +		addr = sg_dma_address(sg);
> > > +		pages = sg_dma_len(sg) >> page_shift;
> > > +		while (pages > 0 && k < total_pages) {
> > > +			umem_odp->dma_list[k++] = addr | access_mask;
> > > +			umem_odp->npages++;
> > > +			addr += page_size;
> > > +			pages--;
> > 
> > This isn't fragmenting the sg into a page list properly, won't work for unaligned things
> 
> I thought the addresses are aligned, but will add explicit alignment here.

I have no idea what comes out of dma_buf; I wouldn't make too many
assumptions, since it does have to pass through the IOMMU layer too.

> > And really we don't need the dma_list for this case, with a fixed
> > whole mapping DMA SGL a normal umem sgl is OK and the normal umem
> > XLT programming in mlx5 is fine.
> 
> The dma_list is used by both "polulate_mtt()" and
> "mlx5_ib_invalidate_range", which are used for XLT programming and
> invalidating (zapping), respectively.

Don't use those functions for the dma_buf flow.

Jason
Xiong, Jianxin Oct. 5, 2020, 7:41 p.m. UTC | #6
> -----Original Message-----
> From: Jason Gunthorpe <jgg@ziepe.ca>
> Sent: Monday, October 05, 2020 9:33 AM
> To: Xiong, Jianxin <jianxin.xiong@intel.com>
> Cc: linux-rdma@vger.kernel.org; dri-devel@lists.freedesktop.org; Doug Ledford <dledford@redhat.com>; Leon Romanovsky
> <leon@kernel.org>; Sumit Semwal <sumit.semwal@linaro.org>; Christian Koenig <christian.koenig@amd.com>; Vetter, Daniel
> <daniel.vetter@intel.com>
> Subject: Re: [RFC PATCH v3 1/4] RDMA/umem: Support importing dma-buf as user memory region
> 
> On Mon, Oct 05, 2020 at 04:18:11PM +0000, Xiong, Jianxin wrote:
> 
> > > The implementation in mlx5 will be much more understandable, it
> > > would just do dma_buf_dynamic_attach() and program the XLT exactly the same as a normal umem.
> > >
> > > The move_notify() simply zap's the XLT and triggers a work to reload
> > > it after the move. Locking is provided by the dma_resv_lock. Only a small disruption to the page fault handler is needed.
> >
> > We considered such scheme but didn't go that way due to the lack of
> > notification when the move is done and thus the work wouldn't know
> > when it can reload.
> 
> Well, the work would block on the reservation lock and that indicates the move is done

Got it.  Will work on that.

> 
> It would be nicer if the dma_buf could provide an op that things are ready to go though
> 
> > Now I think it again, we could probably signal the reload in the page fault handler.
> 
> This also works, with a performance cost
> 
> > > > +	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
> > > > +	sgt = dma_buf_map_attachment(umem_dmabuf->attach,
> > > > +				     DMA_BIDIRECTIONAL);
> > > > +	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
> > >
> > > This doesn't look right, this lock has to be held up until the HW is
> > > programmed
> >
> > The mapping remains valid until being invalidated again. There is a
> > sequence number checking before programming the HW.
> 
> It races, we could immediately trigger invalidation and then re-program the HW with this stale data.
> 
> > > The use of atomic looks probably wrong as well.
> >
> > Do you mean umem_dmabuf->notifier_seq? Could you elaborate the concern?
> 
> It only increments once per invalidation, that usually is racy.

I will rework these parts.

> 
> > > > +	total_pages = ib_umem_odp_num_pages(umem_odp);
> > > > +	for_each_sg(umem->sg_head.sgl, sg, umem->sg_head.nents, j) {
> > > > +		addr = sg_dma_address(sg);
> > > > +		pages = sg_dma_len(sg) >> page_shift;
> > > > +		while (pages > 0 && k < total_pages) {
> > > > +			umem_odp->dma_list[k++] = addr | access_mask;
> > > > +			umem_odp->npages++;
> > > > +			addr += page_size;
> > > > +			pages--;
> > >
> > > This isn't fragmenting the sg into a page list properly, won't work
> > > for unaligned things
> >
> > I thought the addresses are aligned, but will add explicit alignment here.
> 
> I have no idea what comes out of dma_buf, I wouldn't make too many assumptions since it does have to pass through the IOMMU layer too
> 
> > > And really we don't need the dma_list for this case, with a fixed
> > > whole mapping DMA SGL a normal umem sgl is OK and the normal umem
> > > XLT programming in mlx5 is fine.
> >
> > The dma_list is used by both "polulate_mtt()" and
> > "mlx5_ib_invalidate_range", which are used for XLT programming and
> > invalidating (zapping), respectively.
> 
> Don't use those functions for the dma_buf flow.

Ok. I think we can use mlx5_ib_update_xlt() directly for the dma-buf case.

> 
> Jason
Daniel Vetter Oct. 6, 2020, 9:22 a.m. UTC | #7
On Mon, Oct 05, 2020 at 04:18:11PM +0000, Xiong, Jianxin wrote:
> > -----Original Message-----
> > From: Jason Gunthorpe <jgg@ziepe.ca>
> > Sent: Monday, October 05, 2020 6:13 AM
> > To: Xiong, Jianxin <jianxin.xiong@intel.com>
> > Cc: linux-rdma@vger.kernel.org; dri-devel@lists.freedesktop.org; Doug Ledford <dledford@redhat.com>; Leon Romanovsky
> > <leon@kernel.org>; Sumit Semwal <sumit.semwal@linaro.org>; Christian Koenig <christian.koenig@amd.com>; Vetter, Daniel
> > <daniel.vetter@intel.com>
> > Subject: Re: [RFC PATCH v3 1/4] RDMA/umem: Support importing dma-buf as user memory region
> > 
> > On Sun, Oct 04, 2020 at 12:12:28PM -0700, Jianxin Xiong wrote:
> > > Dma-buf is a standard cross-driver buffer sharing mechanism that can
> > > be used to support peer-to-peer access from RDMA devices.
> > >
> > > Device memory exported via dma-buf is associated with a file descriptor.
> > > This is passed to the user space as a property associated with the
> > > buffer allocation. When the buffer is registered as a memory region,
> > > the file descriptor is passed to the RDMA driver along with other
> > > parameters.
> > >
> > > Implement the common code for importing dma-buf object and mapping
> > > dma-buf pages.
> > >
> > > Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
> > > Reviewed-by: Sean Hefty <sean.hefty@intel.com>
> > > Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> > > ---
> > >  drivers/infiniband/core/Makefile      |   2 +-
> > >  drivers/infiniband/core/umem.c        |   4 +
> > >  drivers/infiniband/core/umem_dmabuf.c | 291
> > > ++++++++++++++++++++++++++++++++++
> > >  drivers/infiniband/core/umem_dmabuf.h |  14 ++
> > >  drivers/infiniband/core/umem_odp.c    |  12 ++
> > >  include/rdma/ib_umem.h                |  19 ++-
> > >  6 files changed, 340 insertions(+), 2 deletions(-)  create mode
> > > 100644 drivers/infiniband/core/umem_dmabuf.c
> > >  create mode 100644 drivers/infiniband/core/umem_dmabuf.h
> > 
> > I think this is using ODP too literally, dmabuf isn't going to need fine grained page faults, and I'm not sure this locking scheme is OK - ODP is
> > horrifically complicated.
> > 
> 
> > If this is the approach then I think we should make dmabuf its own stand alone API, reg_user_mr_dmabuf()
> 
> That's the original approach in the first version. We can go back there.
> 
> > 
> > The implementation in mlx5 will be much more understandable, it would just do dma_buf_dynamic_attach() and program the XLT exactly
> > the same as a normal umem.
> > 
> > The move_notify() simply zap's the XLT and triggers a work to reload it after the move. Locking is provided by the dma_resv_lock. Only a
> > small disruption to the page fault handler is needed.
> > 
> 
> We considered such scheme but didn't go that way due to the lack of
> notification when the move is done and thus the work wouldn't know when
> it can reload.
> 
> Now I think it again, we could probably signal the reload in the page fault handler. 

For reinstating the pages you need:

- dma_resv_lock, this prevents anyone else from issuing new moves or
  anything like that
- dma_resv_get_excl + dma_fence_wait to wait for any pending moves to
  finish. gpus generally don't wait on the cpu, but block the dependent
  dma operations from being scheduled until that fence fired. But for rdma
  odp I think you need the cpu wait in your worker here.
- get the new sg list, write it into your ptes
- dma_resv_unlock to make sure you're not racing with a concurrent
  move_notify

You can also grab multiple dma_resv_locks atomically, but I think the
odp rdma model doesn't require that (gpus need that).

Note that you're allowed to allocate memory with GFP_KERNEL while holding
dma_resv_lock, so this shouldn't pose any issues. You are otoh not
allowed to cause userspace faults (so no gup/pup or copy*user with
faulting enabled). So all in all this shouldn't be any worse than calling
pup for a normal umem.

Unlike with mmu notifiers, the caller already holds dma_resv_lock for you
around the move_notify callback, so you shouldn't need any additional
locking in there (aside from what you need to zap the ptes and flush hw
tlbs).

Cheers, Daniel

> 
> > > +	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
> > > +	sgt = dma_buf_map_attachment(umem_dmabuf->attach,
> > > +				     DMA_BIDIRECTIONAL);
> > > +	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
> > 
> > This doesn't look right, this lock has to be held up until the HW is programmed
> 
> The mapping remains valid until being invalidated again. There is a sequence number checking before programming the HW. 
> 
> > 
> > The use of atomic looks probably wrong as well.
> 
> Do you mean umem_dmabuf->notifier_seq? Could you elaborate the concern?
> 
> > 
> > > +	k = 0;
> > > +	total_pages = ib_umem_odp_num_pages(umem_odp);
> > > +	for_each_sg(umem->sg_head.sgl, sg, umem->sg_head.nents, j) {
> > > +		addr = sg_dma_address(sg);
> > > +		pages = sg_dma_len(sg) >> page_shift;
> > > +		while (pages > 0 && k < total_pages) {
> > > +			umem_odp->dma_list[k++] = addr | access_mask;
> > > +			umem_odp->npages++;
> > > +			addr += page_size;
> > > +			pages--;
> > 
> > This isn't fragmenting the sg into a page list properly, won't work for unaligned things
> 
> I thought the addresses are aligned, but will add explicit alignment here.
> 
> > 
> > And really we don't need the dma_list for this case, with a fixed whole mapping DMA SGL a normal umem sgl is OK and the normal umem
> > XLT programming in mlx5 is fine.
> 
> The dma_list is used by both "polulate_mtt()" and "mlx5_ib_invalidate_range", which are used for XLT programming and invalidating (zapping), respectively.
> 
> > 
> > Jason
Xiong, Jianxin Oct. 6, 2020, 3:26 p.m. UTC | #8
> -----Original Message-----
> From: Daniel Vetter <daniel@ffwll.ch>
> Sent: Tuesday, October 06, 2020 2:22 AM
> To: Xiong, Jianxin <jianxin.xiong@intel.com>
> Cc: Jason Gunthorpe <jgg@ziepe.ca>; Leon Romanovsky <leon@kernel.org>; linux-rdma@vger.kernel.org; dri-devel@lists.freedesktop.org;
> Doug Ledford <dledford@redhat.com>; Vetter, Daniel <daniel.vetter@intel.com>; Christian Koenig <christian.koenig@amd.com>
> Subject: Re: [RFC PATCH v3 1/4] RDMA/umem: Support importing dma-buf as user memory region
> 
> On Mon, Oct 05, 2020 at 04:18:11PM +0000, Xiong, Jianxin wrote:
> > > -----Original Message-----
> > > From: Jason Gunthorpe <jgg@ziepe.ca>
> > > Sent: Monday, October 05, 2020 6:13 AM
> > > To: Xiong, Jianxin <jianxin.xiong@intel.com>
> > > Cc: linux-rdma@vger.kernel.org; dri-devel@lists.freedesktop.org;
> > > Doug Ledford <dledford@redhat.com>; Leon Romanovsky
> > > <leon@kernel.org>; Sumit Semwal <sumit.semwal@linaro.org>; Christian
> > > Koenig <christian.koenig@amd.com>; Vetter, Daniel
> > > <daniel.vetter@intel.com>
> > > Subject: Re: [RFC PATCH v3 1/4] RDMA/umem: Support importing dma-buf
> > > as user memory region
> > >
> > > On Sun, Oct 04, 2020 at 12:12:28PM -0700, Jianxin Xiong wrote:
> > > > Dma-buf is a standard cross-driver buffer sharing mechanism that
> > > > can be used to support peer-to-peer access from RDMA devices.
> > > >
> > > > Device memory exported via dma-buf is associated with a file descriptor.
> > > > This is passed to the user space as a property associated with the
> > > > buffer allocation. When the buffer is registered as a memory
> > > > region, the file descriptor is passed to the RDMA driver along
> > > > with other parameters.
> > > >
> > > > Implement the common code for importing dma-buf object and mapping
> > > > dma-buf pages.
> > > >
> > > > Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
> > > > Reviewed-by: Sean Hefty <sean.hefty@intel.com>
> > > > Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> > > > ---
> > > >  drivers/infiniband/core/Makefile      |   2 +-
> > > >  drivers/infiniband/core/umem.c        |   4 +
> > > >  drivers/infiniband/core/umem_dmabuf.c | 291
> > > > ++++++++++++++++++++++++++++++++++
> > > >  drivers/infiniband/core/umem_dmabuf.h |  14 ++
> > > >  drivers/infiniband/core/umem_odp.c    |  12 ++
> > > >  include/rdma/ib_umem.h                |  19 ++-
> > > >  6 files changed, 340 insertions(+), 2 deletions(-)  create mode
> > > > 100644 drivers/infiniband/core/umem_dmabuf.c
> > > >  create mode 100644 drivers/infiniband/core/umem_dmabuf.h
> > >
> > > I think this is using ODP too literally, dmabuf isn't going to need
> > > fine grained page faults, and I'm not sure this locking scheme is OK - ODP is horrifically complicated.
> > >
> >
> > > If this is the approach then I think we should make dmabuf its own
> > > stand alone API, reg_user_mr_dmabuf()
> >
> > That's the original approach in the first version. We can go back there.
> >
> > >
> > > The implementation in mlx5 will be much more understandable, it
> > > would just do dma_buf_dynamic_attach() and program the XLT exactly the same as a normal umem.
> > >
> > > The move_notify() simply zap's the XLT and triggers a work to reload
> > > it after the move. Locking is provided by the dma_resv_lock. Only a small disruption to the page fault handler is needed.
> > >
> >
> > We considered such scheme but didn't go that way due to the lack of
> > notification when the move is done and thus the work wouldn't know
> > when it can reload.
> >
> > Now I think it again, we could probably signal the reload in the page fault handler.
> 
> For reinstanting the pages you need:
> 
> - dma_resv_lock, this prevents anyone else from issuing new moves or
>   anything like that
> - dma_resv_get_excl + dma_fence_wait to wait for any pending moves to
>   finish. gpus generally don't wait on the cpu, but block the dependent
>   dma operations from being scheduled until that fence fired. But for rdma
>   odp I think you need the cpu wait in your worker here.
> - get the new sg list, write it into your ptes
> - dma_resv_unlock to make sure you're not racing with a concurrent
>   move_notify
> 
> You can also grab multiple dma_resv_lock in atomically, but I think the odp rdma model doesn't require that (gpus need that).
> 
> Note that you're allowed to allocate memory with GFP_KERNEL while holding dma_resv_lock, so this shouldn't impose any issues. You are
> otoh not allowed to cause userspace faults (so no gup/pup or copy*user with faulting enabled). So all in all this shouldn't be any worse that
> calling pup for normal umem.
> 
> Unlike mmu notifier the caller holds dma_resv_lock already for you around the move_notify callback, so you shouldn't need any additional
> locking in there (aside from what you need to zap the ptes and flush hw tlbs).
> 
> Cheers, Daniel
> 

Hi Daniel, thanks for providing the details. I would have missed the dma_resv_get_excl + dma_fence_wait part otherwise. 

> >
> > > > +	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
> > > > +	sgt = dma_buf_map_attachment(umem_dmabuf->attach,
> > > > +				     DMA_BIDIRECTIONAL);
> > > > +	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
> > >
> > > This doesn't look right, this lock has to be held up until the HW is
> > > programmed
> >
> > The mapping remains valid until being invalidated again. There is a sequence number checking before programming the HW.
> >
> > >
> > > The use of atomic looks probably wrong as well.
> >
> > Do you mean umem_dmabuf->notifier_seq? Could you elaborate the concern?
> >
> > >
> > > > +	k = 0;
> > > > +	total_pages = ib_umem_odp_num_pages(umem_odp);
> > > > +	for_each_sg(umem->sg_head.sgl, sg, umem->sg_head.nents, j) {
> > > > +		addr = sg_dma_address(sg);
> > > > +		pages = sg_dma_len(sg) >> page_shift;
> > > > +		while (pages > 0 && k < total_pages) {
> > > > +			umem_odp->dma_list[k++] = addr | access_mask;
> > > > +			umem_odp->npages++;
> > > > +			addr += page_size;
> > > > +			pages--;
> > >
> > > This isn't fragmenting the sg into a page list properly, won't work
> > > for unaligned things
> >
> > I thought the addresses are aligned, but will add explicit alignment here.
> >
> > >
> > > And really we don't need the dma_list for this case, with a fixed
> > > whole mapping DMA SGL a normal umem sgl is OK and the normal umem XLT programming in mlx5 is fine.
> >
> > The dma_list is used by both "polulate_mtt()" and "mlx5_ib_invalidate_range", which are used for XLT programming and invalidating
> (zapping), respectively.
> >
> > >
> > > Jason
> 
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Jason Gunthorpe Oct. 6, 2020, 3:49 p.m. UTC | #9
On Tue, Oct 06, 2020 at 11:22:14AM +0200, Daniel Vetter wrote:
> 
> For reinstanting the pages you need:
> 
> - dma_resv_lock, this prevents anyone else from issuing new moves or
>   anything like that
> - dma_resv_get_excl + dma_fence_wait to wait for any pending moves to
>   finish. gpus generally don't wait on the cpu, but block the dependent
>   dma operations from being scheduled until that fence fired. But for rdma
>   odp I think you need the cpu wait in your worker here.

Reinstating is not really any different than the first insertion, so
then all this should be needed in every case?

Jason
Daniel Vetter Oct. 6, 2020, 4:34 p.m. UTC | #10
On Tue, Oct 06, 2020 at 12:49:56PM -0300, Jason Gunthorpe wrote:
> On Tue, Oct 06, 2020 at 11:22:14AM +0200, Daniel Vetter wrote:
> > 
> > For reinstanting the pages you need:
> > 
> > - dma_resv_lock, this prevents anyone else from issuing new moves or
> >   anything like that
> > - dma_resv_get_excl + dma_fence_wait to wait for any pending moves to
> >   finish. gpus generally don't wait on the cpu, but block the dependent
> >   dma operations from being scheduled until that fence fired. But for rdma
> >   odp I think you need the cpu wait in your worker here.
> 
> Reinstating is not really any different that the first insertion, so
> then all this should be needed in every case?

Yes. Without move_notify we pin the dma-buf into system memory, so it
can't move, and hence you also don't have to chase it. But with
move_notify this all becomes possible.
-Daniel
Daniel Vetter Oct. 6, 2020, 4:40 p.m. UTC | #11
On Mon, Oct 05, 2020 at 04:18:11PM +0000, Xiong, Jianxin wrote:
> > -----Original Message-----
> > From: Jason Gunthorpe <jgg@ziepe.ca>
> > Sent: Monday, October 05, 2020 6:13 AM
> > To: Xiong, Jianxin <jianxin.xiong@intel.com>
> > Cc: linux-rdma@vger.kernel.org; dri-devel@lists.freedesktop.org; Doug Ledford <dledford@redhat.com>; Leon Romanovsky
> > <leon@kernel.org>; Sumit Semwal <sumit.semwal@linaro.org>; Christian Koenig <christian.koenig@amd.com>; Vetter, Daniel
> > <daniel.vetter@intel.com>
> > Subject: Re: [RFC PATCH v3 1/4] RDMA/umem: Support importing dma-buf as user memory region
> > 
> > On Sun, Oct 04, 2020 at 12:12:28PM -0700, Jianxin Xiong wrote:
> > > Dma-buf is a standard cross-driver buffer sharing mechanism that can
> > > be used to support peer-to-peer access from RDMA devices.
> > >
> > > Device memory exported via dma-buf is associated with a file descriptor.
> > > This is passed to the user space as a property associated with the
> > > buffer allocation. When the buffer is registered as a memory region,
> > > the file descriptor is passed to the RDMA driver along with other
> > > parameters.
> > >
> > > Implement the common code for importing dma-buf object and mapping
> > > dma-buf pages.
> > >
> > > Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
> > > Reviewed-by: Sean Hefty <sean.hefty@intel.com>
> > > Acked-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> > > ---
> > >  drivers/infiniband/core/Makefile      |   2 +-
> > >  drivers/infiniband/core/umem.c        |   4 +
> > >  drivers/infiniband/core/umem_dmabuf.c | 291
> > > ++++++++++++++++++++++++++++++++++
> > >  drivers/infiniband/core/umem_dmabuf.h |  14 ++
> > >  drivers/infiniband/core/umem_odp.c    |  12 ++
> > >  include/rdma/ib_umem.h                |  19 ++-
> > >  6 files changed, 340 insertions(+), 2 deletions(-)  create mode
> > > 100644 drivers/infiniband/core/umem_dmabuf.c
> > >  create mode 100644 drivers/infiniband/core/umem_dmabuf.h
> > 
> > I think this is using ODP too literally, dmabuf isn't going to need fine grained page faults, and I'm not sure this locking scheme is OK - ODP is
> > horrifically complicated.
> > 
> 
> > If this is the approach then I think we should make dmabuf its own stand alone API, reg_user_mr_dmabuf()
> 
> That's the original approach in the first version. We can go back there.
> 
> > 
> > The implementation in mlx5 will be much more understandable, it would just do dma_buf_dynamic_attach() and program the XLT exactly
> > the same as a normal umem.
> > 
> > The move_notify() simply zap's the XLT and triggers a work to reload it after the move. Locking is provided by the dma_resv_lock. Only a
> > small disruption to the page fault handler is needed.
> > 
> 
> We considered such scheme but didn't go that way due to the lack of notification when the move is done and thus the work wouldn't know when it can reload.
> 
> Now I think it again, we could probably signal the reload in the page fault handler. 
> 
> > > +	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
> > > +	sgt = dma_buf_map_attachment(umem_dmabuf->attach,
> > > +				     DMA_BIDIRECTIONAL);
> > > +	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
> > 
> > This doesn't look right, this lock has to be held up until the HW is programmed
> 
> The mapping remains valid until being invalidated again. There is a sequence number checking before programming the HW. 
> 
> > 
> > The use of atomic looks probably wrong as well.
> 
> Do you mean umem_dmabuf->notifier_seq? Could you elaborate the concern?
> 
> > 
> > > +	k = 0;
> > > +	total_pages = ib_umem_odp_num_pages(umem_odp);
> > > +	for_each_sg(umem->sg_head.sgl, sg, umem->sg_head.nents, j) {
> > > +		addr = sg_dma_address(sg);
> > > +		pages = sg_dma_len(sg) >> page_shift;
> > > +		while (pages > 0 && k < total_pages) {
> > > +			umem_odp->dma_list[k++] = addr | access_mask;
> > > +			umem_odp->npages++;
> > > +			addr += page_size;
> > > +			pages--;
> > 
> > This isn't fragmenting the sg into a page list properly, won't work for unaligned things
> 
> I thought the addresses are aligned, but will add explicit alignment here.

Everyone's working under the assumption that dma_buf sg lists are fully
PAGE aligned. Lots of stuff can break otherwise all over the place. You
might get more than that, especially for p2p since many gpus work on 64kb
pages for the vram, but system pages are 4k aligned.

I think we need a patch to clarify this in the kerneldoc, both for
dma_buf_map_attachment for importers and the dma_buf_ops callback for
exporters.

Also I think it'd be good to verify that when dma api debugging is
enabled, i.e. when dma_buf_map_attachment succeeds, walk the entire sg
list and validate that all dma_addr_t segments are page aligned. And if
not, WARN loudly and fail.
-Daniel
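
A sketch of that debug-time check, assuming it sits in the importer right
after a successful dma_buf_map_attachment() (sgt and attach refer to that
importer context):

#ifdef CONFIG_DMA_API_DEBUG
	{
		struct scatterlist *sg;
		int i;

		/* every returned DMA segment must be PAGE_SIZE aligned */
		for_each_sgtable_dma_sg(sgt, sg, i) {
			if (WARN_ON_ONCE(!PAGE_ALIGNED(sg_dma_address(sg)) ||
					 !PAGE_ALIGNED(sg_dma_len(sg)))) {
				dma_buf_unmap_attachment(attach, sgt,
							 DMA_BIDIRECTIONAL);
				return ERR_PTR(-EINVAL);
			}
		}
	}
#endif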

 
> > 
> > And really we don't need the dma_list for this case, with a fixed whole mapping DMA SGL a normal umem sgl is OK and the normal umem
> > XLT programming in mlx5 is fine.
> 
> The dma_list is used by both "polulate_mtt()" and "mlx5_ib_invalidate_range", which are used for XLT programming and invalidating (zapping), respectively.
> 
> > 
> > Jason
Daniel Vetter Oct. 6, 2020, 5:24 p.m. UTC | #12
On Tue, Oct 6, 2020 at 6:34 PM Daniel Vetter <daniel@ffwll.ch> wrote:
>
> On Tue, Oct 06, 2020 at 12:49:56PM -0300, Jason Gunthorpe wrote:
> > On Tue, Oct 06, 2020 at 11:22:14AM +0200, Daniel Vetter wrote:
> > >
> > > For reinstanting the pages you need:
> > >
> > > - dma_resv_lock, this prevents anyone else from issuing new moves or
> > >   anything like that
> > > - dma_resv_get_excl + dma_fence_wait to wait for any pending moves to
> > >   finish. gpus generally don't wait on the cpu, but block the dependent
> > >   dma operations from being scheduled until that fence fired. But for rdma
> > >   odp I think you need the cpu wait in your worker here.
> >
> > Reinstating is not really any different that the first insertion, so
> > then all this should be needed in every case?
>
> Yes. Without move_notify we pin the dma-buf into system memory, so it
> can't move, and hence you also don't have to chase it. But with
> move_notify this all becomes possible.

I just realized I got it wrong compared to gpus. It needs to be:
1. dma_resv_lock
2. dma_buf_map_attachment, which might have to move the buffer around
again if you're unlucky
3. wait for the exclusive fence
4. put sgt into your rdma ptes
5 dma_resv_unlock

Maybe also something we should document somewhere for dynamic buffers.
Assuming I got it right this time around ... Christian?
-Daniel
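
A minimal sketch of that sequence as an RDMA-side reload worker; the
my_dmabuf_mr structure and my_program_hw_ptes() are hypothetical stand-ins
for the driver-specific pieces:

static void my_mr_reload_work(struct work_struct *work)
{
	struct my_dmabuf_mr *mr = container_of(work, struct my_dmabuf_mr,
					       reload_work);
	struct dma_buf *dmabuf = mr->attach->dmabuf;
	struct dma_fence *fence;
	struct sg_table *sgt;

	/* 1. block any further moves */
	dma_resv_lock(dmabuf->resv, NULL);

	/* 2. map the attachment; this may move the buffer around again */
	sgt = dma_buf_map_attachment(mr->attach, DMA_BIDIRECTIONAL);
	if (IS_ERR(sgt))
		goto out_unlock;

	/* 3. wait for any pending move to actually finish */
	fence = dma_resv_get_excl(dmabuf->resv);
	if (fence)
		dma_fence_wait(fence, false);

	/* 4. write the new SGL into the device page tables */
	my_program_hw_ptes(mr, sgt);

out_unlock:
	/* 5. a new move_notify can only run once the lock is dropped */
	dma_resv_unlock(dmabuf->resv);
}
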
Jason Gunthorpe Oct. 6, 2020, 6:02 p.m. UTC | #13
On Tue, Oct 06, 2020 at 07:24:30PM +0200, Daniel Vetter wrote:
> On Tue, Oct 6, 2020 at 6:34 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> >
> > On Tue, Oct 06, 2020 at 12:49:56PM -0300, Jason Gunthorpe wrote:
> > > On Tue, Oct 06, 2020 at 11:22:14AM +0200, Daniel Vetter wrote:
> > > >
> > > > For reinstanting the pages you need:
> > > >
> > > > - dma_resv_lock, this prevents anyone else from issuing new moves or
> > > >   anything like that
> > > > - dma_resv_get_excl + dma_fence_wait to wait for any pending moves to
> > > >   finish. gpus generally don't wait on the cpu, but block the dependent
> > > >   dma operations from being scheduled until that fence fired. But for rdma
> > > >   odp I think you need the cpu wait in your worker here.
> > >
> > > Reinstating is not really any different that the first insertion, so
> > > then all this should be needed in every case?
> >
> > Yes. Without move_notify we pin the dma-buf into system memory, so it
> > can't move, and hence you also don't have to chase it. But with
> > move_notify this all becomes possible.
> 
> I just realized I got it wrong compared to gpus. I needs to be:
> 1. dma_resv_lock
> 2. dma_buf_map_attachment, which might have to move the buffer around
> again if you're unlucky
> 3. wait for the exclusive fence
> 4. put sgt into your rdma ptes
> 5 dma_resv_unlock
> 
> Maybe also something we should document somewhere for dynamic buffers.
> Assuming I got it right this time around ... Christian?

#3 between 2 and 4 seems strange - I would expect that once
dma_buf_map_attachment() returns, the buffer can be placed in the
ptes. It certainly can't be changed after the SGL is returned.

Feels like #2 should serialize all this internally? An API that
sometimes returns invalid data is dangerous :)

Jason
Daniel Vetter Oct. 6, 2020, 6:17 p.m. UTC | #14
On Tue, Oct 6, 2020 at 8:02 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Tue, Oct 06, 2020 at 07:24:30PM +0200, Daniel Vetter wrote:
> > On Tue, Oct 6, 2020 at 6:34 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> > >
> > > On Tue, Oct 06, 2020 at 12:49:56PM -0300, Jason Gunthorpe wrote:
> > > > On Tue, Oct 06, 2020 at 11:22:14AM +0200, Daniel Vetter wrote:
> > > > >
> > > > > For reinstanting the pages you need:
> > > > >
> > > > > - dma_resv_lock, this prevents anyone else from issuing new moves or
> > > > >   anything like that
> > > > > - dma_resv_get_excl + dma_fence_wait to wait for any pending moves to
> > > > >   finish. gpus generally don't wait on the cpu, but block the dependent
> > > > >   dma operations from being scheduled until that fence fired. But for rdma
> > > > >   odp I think you need the cpu wait in your worker here.
> > > >
> > > > Reinstating is not really any different than the first insertion, so
> > > > then all this should be needed in every case?
> > >
> > > Yes. Without move_notify we pin the dma-buf into system memory, so it
> > > can't move, and hence you also don't have to chase it. But with
> > > move_notify this all becomes possible.
> >
> > I just realized I got it wrong compared to gpus. It needs to be:
> > 1. dma_resv_lock
> > 2. dma_buf_map_attachment, which might have to move the buffer around
> > again if you're unlucky
> > 3. wait for the exclusive fence
> > 4. put sgt into your rdma ptes
> > 5. dma_resv_unlock
> >
> > Maybe also something we should document somewhere for dynamic buffers.
> > Assuming I got it right this time around ... Christian?
>
> #3 between 2 and 4 seems strange - I would expect once
> dma_buf_map_attachment() returns that the buffer can be placed in the
> ptes. It certainly can't be changed after the SGL is returned..


So on the gpu we pipeline this all. So step 4 doesn't happen on the
cpu, but instead we queue up a bunch of command buffers so that the
gpu writes these pagetables (and then flushes tlbs and does the
actual stuff userspace wants it to do).

And all that is being blocked in the gpu scheduler on the fences we
acquire in step 3. Again we don't wait on the cpu for that fence, we
just queue it all up and let the gpu scheduler sort out the mess. End
result is that you get a sgt that points at stuff which very well
might have nothing even remotely resembling your buffer in there at
the moment. But all the copy operations are queued up, so real soon now the data
will also be there.

This is also why the precise semantics of move_notify for gpu<->gpu
sharing took forever to discuss and are still a bit wip, because you
have the inverse problem: The dma api mapping might still be there in
the iommu, but the data behind it is long gone and replaced. So we
need to be really careful about making sure that dma operations are
blocked in the gpu properly, with all the flushing and everything. I
think we've reached the conclusion that ->move_notify is allowed to
change the set of fences in the dma_resv so that these flushes and pte
writes can be queued up correctly (on many gpus you can't synchronously
flush tlbs, yay). The exporter then needs to make sure that the actual
buffer move is queued up behind all these operations too.

But rdma doesn't work like that, so it looks all a bit funny.

Anticipating your next question: Can this mean there's a bunch of
different dma/buffer mappings in flight for the same buffer?

Yes. We call them ghost objects, at least in the ttm helpers.

> Feels like #2 should serialize all this internally? An API that
> returns invalid data sometimes is dangerous :)

If you use the non-dynamic mode, where we pin the buffer into system
memory at dma_buf_attach time, it kinda works like that. Also it's not
flat out invalid data, it's the most up-to-date data reflecting all
committed changes. Plus dma_resv tells you when that will actually be
reality in the hardware, not just the software tracking of what's
going on.
-Daniel
Jason Gunthorpe Oct. 6, 2020, 6:38 p.m. UTC | #15
On Tue, Oct 06, 2020 at 08:17:05PM +0200, Daniel Vetter wrote:

> So on the gpu we pipeline this all. So step 4 doesn't happen on the
> cpu, but instead we queue up a bunch of command buffers so that the
> gpu writes these pagetables (and then flushes tlbs and does the
> actual stuff userspace wants it to do).

mlx5 HW does basically this as well.

We just apply scheduling for this work on the device, not in the CPU.
 
> just queue it all up and let the gpu scheduler sort out the mess. End
> result is that you get a sgt that points at stuff which very well
> might have nothing even remotely resembling your buffer in there at
> the moment. But all the copy operations are queued up, so real soon now the data
> will also be there.

The explanation makes sense, thanks

> But rdma doesn't work like that, so it looks all a bit funny.

Well, I guess it could, but how would it make anything better? I can
overlap building the SGL and the device PTEs with something else doing
'move', but is that a workload that needs such aggressive optimization?

> This is also why the precise semantics of move_notify for gpu<->gpu
> sharing took forever to discuss and are still a bit wip, because you
> have the inverse problem: The dma api mapping might still be there

Seems like this all makes a graph of operations, can't start the next
one until all deps are finished. Actually sounds a lot like futures.

Would be clearer if this attach API provided some indication that the
SGL is actually a future valid SGL..

Jason
Daniel Vetter Oct. 6, 2020, 7:12 p.m. UTC | #16
On Tue, Oct 6, 2020 at 8:38 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Tue, Oct 06, 2020 at 08:17:05PM +0200, Daniel Vetter wrote:
>
> > So on the gpu we pipeline this all. So step 4 doesn't happen on the
> > cpu, but instead we queue up a bunch of command buffers so that the
> > gpu writes these pagetables (and then flushes tlbs and does the
> > actual stuff userspace wants it to do).
>
> mlx5 HW does basically this as well.
>
> We just apply scheduling for this work on the device, not in the CPU.
>
> > just queue it all up and let the gpu scheduler sort out the mess. End
> > result is that you get a sgt that points at stuff which very well
> > might have nothing even remotely resembling your buffer in there at
> > the moment. But all the copy operations are queued up, so real soon now the data
> > will also be there.
>
> The explanation makes sense, thanks
>
> > But rdma doesn't work like that, so it looks all a bit funny.
>
> Well, I guess it could, but how would it make anything better? I can
> overlap building the SGL and the device PTEs with something else doing
> 'move', but is that a workload that needs such aggressive optimization?

The compounding issue with gpus is that we need entire lists of
buffers, atomically, for our dma operations. Which means that the
cliff you jump over with a working set that's slightly too big is very
steep, so that you have to pipeline your buffer moves interleaved with
dma operations to keep the hw busy. Having per-page-fault handling and
hw that can continue in other places while that fault is repaired
should smooth that cliff out enough that you don't need to bother.

I think at worst we might worry about unfairness. With the "entire
list of buffers" workload model gpus might starve out rdma badly by
constantly moving all the buffers around. Installing a dma_fence in
the rdma page fault handler, to keep the dma-buf busy for a small
amount of time to make sure at least the next rdma transfer goes
through without more faults should be able to fix that though. Such a
keepalive fence should be in the shared slots for dma_resv, to not
block other access. This wouldn't even need any other changes in
rdma (although delaying the pte zapping when we get a move_notify
would be better), since an active fence alone makes that buffer a much
less likely target for eviction.
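
Roughly, and purely hypothetical (nothing like this exists in the
patch; 'keepalive' stands for some driver-defined fence that signals
after a grace period), the fault handler would do something like this
while it already holds the dma_resv lock:

	struct dma_resv *resv = umem_dmabuf->attach->dmabuf->resv;
	int ret;

	/* park a keepalive fence in the shared slots */
	ret = dma_resv_reserve_shared(resv, 1);
	if (!ret)
		dma_resv_add_shared_fence(resv, keepalive);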

> > This is also why the precise semantics of move_notify for gpu<->gpu
> > sharing took forever to discuss and are still a bit wip, because you
> > have the inverse problem: The dma api mapping might still be there
>
> Seems like this all makes a graph of operations, can't start the next
> one until all deps are finished. Actually sounds a lot like futures.
>
> Would be clearer if this attach API provided some indication that the
> SGL is actually a future valid SGL..

Yeah I think one of the things we've discussed is whether dma_buf
should pass around the fences more explicitly, or whether we should
continue to smash on the more implicit dma_resv tracking. Inertia won
out, at least for now because gpu drivers do all the book-keeping
directly in the shared dma_resv structure anyway, so this wouldn't
have helped to get cleaner code.
-Daniel
Christian König Oct. 7, 2020, 7:13 a.m. UTC | #17
Hi guys,

maybe it becomes clearer when you see this as two 
different things:
1. The current location where the buffer is.
2. If the data inside the buffer can be accessed.

The current location is returned by dma_buf_map_attachment() and the 
result can be used to fill in your page tables.

But before you can access the data at this location you need to wait for 
the exclusive fence to signal.

As Daniel explained the reason for this is that GPUs are heavily 
pipelined pieces of hardware. To keep all blocks busy all the time you 
need to prepare things ahead of time.

This is not only to make buffers able to move around, but also for 
example for cache coherency. In other words the buffer could have been 
at the given location all the time, but you need to wait for the 
exclusive fence to guarantee that write-back caches are done with their job.
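
As a minimal sketch, assuming you still hold the reservation lock from 
the dma_buf_map_attachment() call:

	struct dma_fence *fence;

	fence = dma_resv_get_excl(attach->dmabuf->resv);
	if (fence)
		dma_fence_wait(fence, false);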

Regards,
Christian.

On 06.10.20 at 21:12, Daniel Vetter wrote:
> On Tue, Oct 6, 2020 at 8:38 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>> On Tue, Oct 06, 2020 at 08:17:05PM +0200, Daniel Vetter wrote:
>>
>>> So on the gpu we pipeline this all. So step 4 doesn't happen on the
>>> cpu, but instead we queue up a bunch of command buffers so that the
>>> gpu writes these pagetables (and then flushes tlbs and does the
>>> actual stuff userspace wants it to do).
>> mlx5 HW does basically this as well.
>>
>> We just apply scheduling for this work on the device, not in the CPU.
>>
>>> just queue it all up and let the gpu scheduler sort out the mess. End
>>> result is that you get a sgt that points at stuff which very well
>>> might have nothing even remotely resembling your buffer in there at
>>> the moment. But all the copy operations are queued up, so real soon now the data
>>> will also be there.
>> The explanation makes sense, thanks
>>
>>> But rdma doesn't work like that, so it looks all a bit funny.
>> Well, I guess it could, but how would it make anything better? I can
>> overlap building the SGL and the device PTEs with something else doing
>> 'move', but is that a workload that needs such aggressive optimization?
> The compounding issue with gpus is that we need entire lists of
> buffers, atomically, for our dma operations. Which means that the
> cliff you jump over with a working set that's slightly too big is very
> steep, so that you have to pipeline your buffer moves interleaved with
> dma operations to keep the hw busy. Having per-page-fault handling and
> hw that can continue in other places while that fault is repaired
> should smooth that cliff out enough that you don't need to bother.
>
> I think at worst we might worry about unfairness. With the "entire
> list of buffers" workload model gpus might starve out rdma badly by
> constantly moving all the buffers around. Installing a dma_fence in
> the rdma page fault handler, to keep the dma-buf busy for a small
> amount of time to make sure at least the next rdma transfer goes
> through without more faults should be able to fix that though. Such a
> keepalive fence should be in the shared slots for dma_resv, to not
> block other access. This wouldn't even need any other changes in
> rdma (although delaying the pte zapping when we get a move_notify
> would be better), since an active fence alone makes that buffer a much
> less likely target for eviction.
>
>>> This is also why the precise semantics of move_notify for gpu<->gpu
>>> sharing took forever to discuss and are still a bit wip, because you
>>> have the inverse problem: The dma api mapping might still be there
>> Seems like this all makes a graph of operations, can't start the next
>> one until all deps are finished. Actually sounds a lot like futures.
>>
>> Would be clearer if this attach API provided some indication that the
>> SGL is actually a future valid SGL..
> Yeah I think one of the things we've discussed is whether dma_buf
> should pass around the fences more explicitly, or whether we should
> continue to smash on the more implicit dma_resv tracking. Inertia won
> out, at least for now because gpu drivers do all the book-keeping
> directly in the shared dma_resv structure anyway, so this wouldn't
> have helped to get cleaner code.
> -Daniel

Patch

diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index 24cb71a..b8d51a7 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -40,5 +40,5 @@  ib_uverbs-y :=			uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
 				uverbs_std_types_srq.o \
 				uverbs_std_types_wq.o \
 				uverbs_std_types_qp.o
-ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
+ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o umem_dmabuf.o
 ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 831bff8..59ec36c 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -2,6 +2,7 @@ 
  * Copyright (c) 2005 Topspin Communications.  All rights reserved.
  * Copyright (c) 2005 Cisco Systems.  All rights reserved.
  * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2020 Intel Corporation. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -42,6 +43,7 @@ 
 #include <rdma/ib_umem_odp.h>
 
 #include "uverbs.h"
+#include "umem_dmabuf.h"
 
 static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
 {
@@ -318,6 +320,8 @@  void ib_umem_release(struct ib_umem *umem)
 {
 	if (!umem)
 		return;
+	if (umem->is_dmabuf)
+		return ib_umem_dmabuf_release(umem);
 	if (umem->is_odp)
 		return ib_umem_odp_release(to_ib_umem_odp(umem));
 
diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c
new file mode 100644
index 0000000..10ed646
--- /dev/null
+++ b/drivers/infiniband/core/umem_dmabuf.c
@@ -0,0 +1,291 @@ 
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * Copyright (c) 2020 Intel Corporation. All rights reserved.
+ */
+
+#include <linux/dma-buf.h>
+#include <linux/dma-resv.h>
+#include <linux/dma-mapping.h>
+#include <rdma/ib_umem_odp.h>
+
+#include "uverbs.h"
+
+struct ib_umem_dmabuf {
+	struct ib_umem_odp umem_odp;
+	struct dma_buf_attachment *attach;
+	struct sg_table *sgt;
+	atomic_t notifier_seq;
+};
+
+static inline struct ib_umem_dmabuf *to_ib_umem_dmabuf(struct ib_umem *umem)
+{
+	struct ib_umem_odp *umem_odp = to_ib_umem_odp(umem);
+	return container_of(umem_odp, struct ib_umem_dmabuf, umem_odp);
+}
+
+static void ib_umem_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
+{
+	struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
+	struct ib_umem_odp *umem_odp = &umem_dmabuf->umem_odp;
+	struct mmu_notifier_range range = {};
+	unsigned long current_seq;
+
+	/* no concurrent invalidation due to the dma_resv lock */
+
+	dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
+
+	if (!umem_dmabuf->sgt)
+		return;
+
+	range.start = ib_umem_start(umem_odp);
+	range.end = ib_umem_end(umem_odp);
+	range.flags = MMU_NOTIFIER_RANGE_BLOCKABLE;
+	current_seq = atomic_read(&umem_dmabuf->notifier_seq);
+	umem_odp->notifier.ops->invalidate(&umem_odp->notifier, &range,
+					   current_seq);
+
+	atomic_inc(&umem_dmabuf->notifier_seq);
+}
+
+static struct dma_buf_attach_ops ib_umem_dmabuf_attach_ops = {
+	.allow_peer2peer = 1,
+	.move_notify = ib_umem_dmabuf_invalidate_cb,
+};
+
+static inline int ib_umem_dmabuf_init_odp(struct ib_umem_odp *umem_odp)
+{
+	size_t page_size = 1UL << umem_odp->page_shift;
+	unsigned long start;
+	unsigned long end;
+	size_t pages;
+
+	umem_odp->umem.is_odp = 1;
+	mutex_init(&umem_odp->umem_mutex);
+
+	start = ALIGN_DOWN(umem_odp->umem.address, page_size);
+	if (check_add_overflow(umem_odp->umem.address,
+			       (unsigned long)umem_odp->umem.length,
+			       &end))
+		return -EOVERFLOW;
+	end = ALIGN(end, page_size);
+	if (unlikely(end < page_size))
+		return -EOVERFLOW;
+
+	pages = (end - start) >> umem_odp->page_shift;
+	if (!pages)
+		return -EINVAL;
+
+	/* used for ib_umem_start() & ib_umem_end() */
+	umem_odp->notifier.interval_tree.start = start;
+	umem_odp->notifier.interval_tree.last = end - 1;
+
+	/* umem_odp->page_list is never used for dma-buf */
+
+	umem_odp->dma_list = kvcalloc(
+		pages, sizeof(*umem_odp->dma_list), GFP_KERNEL);
+	if (!umem_odp->dma_list)
+		return -ENOMEM;
+
+	return 0;
+}
+
+struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
+				   unsigned long addr, size_t size,
+				   int dmabuf_fd, int access,
+				   const struct mmu_interval_notifier_ops *ops)
+{
+	struct dma_buf *dmabuf;
+	struct ib_umem_dmabuf *umem_dmabuf;
+	struct ib_umem *umem;
+	struct ib_umem_odp *umem_odp;
+	unsigned long end;
+	long ret;
+
+	if (check_add_overflow(addr, size, &end))
+		return ERR_PTR(-EINVAL);
+
+	if (unlikely(PAGE_ALIGN(end) < PAGE_SIZE))
+		return ERR_PTR(-EINVAL);
+
+	umem_dmabuf = kzalloc(sizeof(*umem_dmabuf), GFP_KERNEL);
+	if (!umem_dmabuf)
+		return ERR_PTR(-ENOMEM);
+
+	umem = &umem_dmabuf->umem_odp.umem;
+	umem->ibdev = device;
+	umem->length = size;
+	umem->address = addr;
+	umem->writable = ib_access_writable(access);
+	umem->is_dmabuf = 1;
+
+	dmabuf = dma_buf_get(dmabuf_fd);
+	if (IS_ERR(dmabuf)) {
+		ret = PTR_ERR(dmabuf);
+		goto out_free_umem;
+	}
+
+	/* always attach dynamically to pass the allow_peer2peer flag */
+	umem_dmabuf->attach = dma_buf_dynamic_attach(
+					dmabuf,
+					device->dma_device,
+					&ib_umem_dmabuf_attach_ops,
+					umem_dmabuf);
+	if (IS_ERR(umem_dmabuf->attach)) {
+		ret = PTR_ERR(umem_dmabuf->attach);
+		goto out_release_dmabuf;
+	}
+
+	umem_odp = &umem_dmabuf->umem_odp;
+	umem_odp->page_shift = PAGE_SHIFT;
+	if (access & IB_ACCESS_HUGETLB) {
+		/* don't support huge_tlb at this point */
+		ret = -EINVAL;
+		goto out_detach_dmabuf;
+	}
+
+	ret = ib_umem_dmabuf_init_odp(umem_odp);
+	if (ret)
+		goto out_detach_dmabuf;
+
+	umem_odp->notifier.ops = ops;
+	return umem;
+
+out_detach_dmabuf:
+	dma_buf_detach(dmabuf, umem_dmabuf->attach);
+
+out_release_dmabuf:
+	dma_buf_put(dmabuf);
+
+out_free_umem:
+	kfree(umem_dmabuf);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ib_umem_dmabuf_get);
+
+unsigned long ib_umem_dmabuf_notifier_read_begin(struct ib_umem_odp *umem_odp)
+{
+	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(&umem_odp->umem);
+
+	return atomic_read(&umem_dmabuf->notifier_seq);
+}
+EXPORT_SYMBOL(ib_umem_dmabuf_notifier_read_begin);
+
+int ib_umem_dmabuf_notifier_read_retry(struct ib_umem_odp *umem_odp,
+				       unsigned long current_seq)
+{
+	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(&umem_odp->umem);
+
+	return (atomic_read(&umem_dmabuf->notifier_seq) != current_seq);
+}
+EXPORT_SYMBOL(ib_umem_dmabuf_notifier_read_retry);
+
+int ib_umem_dmabuf_map_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
+			     u64 access_mask, unsigned long current_seq)
+{
+	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(umem);
+	struct ib_umem_odp *umem_odp = &umem_dmabuf->umem_odp;
+	u64 start, end, addr;
+	int j, k, ret = 0, user_pages, pages, total_pages;
+	unsigned int page_shift;
+	size_t page_size;
+	struct scatterlist *sg;
+	struct sg_table *sgt;
+
+	if (access_mask == 0)
+		return -EINVAL;
+
+	if (user_virt < ib_umem_start(umem_odp) ||
+	    user_virt + bcnt > ib_umem_end(umem_odp))
+		return -EFAULT;
+
+	page_shift = umem_odp->page_shift;
+	page_size = 1UL << page_shift;
+	start = ALIGN_DOWN(user_virt, page_size);
+	end = ALIGN(user_virt + bcnt, page_size);
+	user_pages = (end - start) >> page_shift;
+
+	mutex_lock(&umem_odp->umem_mutex);
+
+	/* check for on-ongoing invalidations */
+	if (ib_umem_dmabuf_notifier_read_retry(umem_odp, current_seq)) {
+		ret = -EAGAIN;
+		goto out;
+	}
+
+	ret = user_pages;
+	if (umem_dmabuf->sgt)
+		goto out;
+
+	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
+	sgt = dma_buf_map_attachment(umem_dmabuf->attach,
+				     DMA_BIDIRECTIONAL);
+	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+
+	if (IS_ERR(sgt)) {
+		ret = PTR_ERR(sgt);
+		goto out;
+	}
+
+	umem->sg_head = *sgt;
+	umem->nmap = sgt->nents;
+	umem_dmabuf->sgt = sgt;
+
+	k = 0;
+	total_pages = ib_umem_odp_num_pages(umem_odp);
+	for_each_sg(umem->sg_head.sgl, sg, umem->sg_head.nents, j) {
+		addr = sg_dma_address(sg);
+		pages = sg_dma_len(sg) >> page_shift;
+		while (pages > 0 && k < total_pages) {
+			umem_odp->dma_list[k++] = addr | access_mask;
+			umem_odp->npages++;
+			addr += page_size;
+			pages--;
+		}
+	}
+
+	WARN_ON(k != total_pages);
+
+out:
+	mutex_unlock(&umem_odp->umem_mutex);
+	return ret;
+}
+
+void ib_umem_dmabuf_unmap_pages(struct ib_umem *umem)
+{
+	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(umem);
+	struct ib_umem_odp *umem_odp = &umem_dmabuf->umem_odp;
+	int npages = ib_umem_odp_num_pages(umem_odp);
+	int i;
+
+	lockdep_assert_held(&umem_odp->umem_mutex);
+	dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
+
+	if (!umem_dmabuf->sgt)
+		return;
+
+	dma_buf_unmap_attachment(umem_dmabuf->attach, umem_dmabuf->sgt,
+				 DMA_BIDIRECTIONAL);
+
+	umem_dmabuf->sgt = NULL;
+
+	for (i = 0; i < npages; i++)
+		umem_odp->dma_list[i] = 0;
+	umem_odp->npages = 0;
+}
+
+void ib_umem_dmabuf_release(struct ib_umem *umem)
+{
+	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(umem);
+	struct ib_umem_odp *umem_odp = &umem_dmabuf->umem_odp;
+	struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf;
+
+	mutex_lock(&umem_odp->umem_mutex);
+	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
+	ib_umem_dmabuf_unmap_pages(umem);
+	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+	mutex_unlock(&umem_odp->umem_mutex);
+	kvfree(umem_odp->dma_list);
+	dma_buf_detach(dmabuf, umem_dmabuf->attach);
+	dma_buf_put(dmabuf);
+	kfree(umem_dmabuf);
+}
diff --git a/drivers/infiniband/core/umem_dmabuf.h b/drivers/infiniband/core/umem_dmabuf.h
new file mode 100644
index 0000000..b9378bd
--- /dev/null
+++ b/drivers/infiniband/core/umem_dmabuf.h
@@ -0,0 +1,14 @@ 
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright (c) 2020 Intel Corporation. All rights reserved.
+ */
+
+#ifndef UMEM_DMABUF_H
+#define UMEM_DMABUF_H
+
+int ib_umem_dmabuf_map_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
+			     u64 access_mask, unsigned long current_seq);
+void ib_umem_dmabuf_unmap_pages(struct ib_umem *umem);
+void ib_umem_dmabuf_release(struct ib_umem *umem);
+
+#endif /* UMEM_DMABUF_H */
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index cc6b4be..7e11619 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -1,5 +1,6 @@ 
 /*
  * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2020 Intel Corporation. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -47,6 +48,7 @@ 
 #include <rdma/ib_umem_odp.h>
 
 #include "uverbs.h"
+#include "umem_dmabuf.h"
 
 static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
 				   const struct mmu_interval_notifier_ops *ops)
@@ -263,6 +265,9 @@  struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device,
 
 void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
 {
+	if (umem_odp->umem.is_dmabuf)
+		return ib_umem_dmabuf_release(&umem_odp->umem);
+
 	/*
 	 * Ensure that no more pages are mapped in the umem.
 	 *
@@ -392,6 +397,10 @@  int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
 	unsigned int flags = 0, page_shift;
 	phys_addr_t p = 0;
 
+	if (umem_odp->umem.is_dmabuf)
+		return ib_umem_dmabuf_map_pages(&umem_odp->umem, user_virt,
+						bcnt, access_mask, current_seq);
+
 	if (access_mask == 0)
 		return -EINVAL;
 
@@ -517,6 +526,9 @@  void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
 	u64 addr;
 	struct ib_device *dev = umem_odp->umem.ibdev;
 
+	if (umem_odp->umem.is_dmabuf)
+		return ib_umem_dmabuf_unmap_pages(&umem_odp->umem);
+
 	lockdep_assert_held(&umem_odp->umem_mutex);
 
 	virt = max_t(u64, virt, ib_umem_start(umem_odp));
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 71f573a..b8ea693 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -1,6 +1,7 @@ 
 /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
  * Copyright (c) 2007 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2020 Intel Corporation.  All rights reserved.
  */
 
 #ifndef IB_UMEM_H
@@ -13,6 +14,7 @@ 
 
 struct ib_ucontext;
 struct ib_umem_odp;
+struct ib_umem_dmabuf;
 
 struct ib_umem {
 	struct ib_device       *ibdev;
@@ -21,6 +23,7 @@  struct ib_umem {
 	unsigned long		address;
 	u32 writable : 1;
 	u32 is_odp : 1;
+	u32 is_dmabuf : 1;
 	struct work_struct	work;
 	struct sg_table sg_head;
 	int             nmap;
@@ -51,6 +54,13 @@  int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
 unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
 				     unsigned long pgsz_bitmap,
 				     unsigned long virt);
+struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
+				   unsigned long addr, size_t size,
+				   int dmabuf_fd, int access,
+				   const struct mmu_interval_notifier_ops *ops);
+unsigned long ib_umem_dmabuf_notifier_read_begin(struct ib_umem_odp *umem_odp);
+int ib_umem_dmabuf_notifier_read_retry(struct ib_umem_odp *umem_odp,
+				       unsigned long current_seq);
 
 #else /* CONFIG_INFINIBAND_USER_MEM */
 
@@ -73,7 +83,14 @@  static inline int ib_umem_find_best_pgsz(struct ib_umem *umem,
 					 unsigned long virt) {
 	return -EINVAL;
 }
+static inline struct ib_umem *ib_umem_dmabuf_get(struct ib_device *device,
+						 unsigned long addr,
+						 size_t size, int dmabuf_fd,
+						 int access,
+						 struct mmu_interval_notifier_ops *ops)
+{
+	return ERR_PTR(-EINVAL);
+}
 
 #endif /* CONFIG_INFINIBAND_USER_MEM */
-
 #endif /* IB_UMEM_H */