diff mbox series

[for-next,v2] RDMA/siw: Use ib_umem_get() to pin user pages

Message ID 20231104075643.195186-1-bmt@zurich.ibm.com (mailing list archive)
State Accepted
Headers show
Series [for-next,v2] RDMA/siw: Use ib_umem_get() to pin user pages | expand

Commit Message

Bernard Metzler Nov. 4, 2023, 7:56 a.m. UTC
Abandon siw private code to pin user pages during user
memory registration, but use ib_umem_get() instead.
This will help maintaining the driver in case of changes
to the memory subsystem.

Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com>
---
v1 -> v2: remove RLIMIT memlock check logic, now done in ib_umem_get()
---
 drivers/infiniband/sw/siw/siw.h       |   3 +-
 drivers/infiniband/sw/siw/siw_mem.c   | 109 ++++++++++----------------
 drivers/infiniband/sw/siw/siw_mem.h   |   5 +-
 drivers/infiniband/sw/siw/siw_verbs.c |  19 +----
 4 files changed, 47 insertions(+), 89 deletions(-)

Comments

Guoqing Jiang Nov. 10, 2023, 6:37 a.m. UTC | #1
Hi,

On 11/4/23 15:56, Bernard Metzler wrote:
> Abandon siw private code to pin user pages during user
> memory registration, but use ib_umem_get() instead.
> This will help maintaining the driver in case of changes
> to the memory subsystem.
>
> Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com>
> ---
> v1 -> v2: remove RLIMIT memlock check logic, now done in ib_umem_get()
> ---

Looks good, Acked-by: Guoqing Jiang <guoqing.jiang@linux.dev>

...

> +	for (i = 0; num_pages > 0; i++) {
>   		int nents = min_t(int, num_pages, PAGES_PER_CHUNK);
>   		struct page **plist =
>   			kcalloc(nents, sizeof(struct page *), GFP_KERNEL);
>   
>   		if (!plist) {
>   			rv = -ENOMEM;
> -			goto out_sem_up;
> +			goto err_out;
>   		}
>   		umem->page_chunk[i].plist = plist;

One off topic question, why two dimensional list is needed for siw umem?
Thanks in advance.

Guoqing
Bernard Metzler Nov. 10, 2023, 3:09 p.m. UTC | #2
> -----Original Message-----
> From: Guoqing Jiang <guoqing.jiang@linux.dev>
> Sent: Friday, November 10, 2023 7:37 AM
> To: Bernard Metzler <BMT@zurich.ibm.com>; linux-rdma@vger.kernel.org
> Cc: jgg@ziepe.ca; leon@kernel.org; max7255@meta.com;
> dennis.dalessandro@cornelisnetworks.com; benve@cisco.com;
> vadim.fedorenko@linux.dev
> Subject: [EXTERNAL] Re: [PATCH for-next v2] RDMA/siw: Use ib_umem_get() to
> pin user pages
> 
> Hi,
> 
> On 11/4/23 15:56, Bernard Metzler wrote:
> > Abandon siw private code to pin user pages during user
> > memory registration, but use ib_umem_get() instead.
> > This will help maintaining the driver in case of changes
> > to the memory subsystem.
> >
> > Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com>
> > ---
> > v1 -> v2: remove RLIMIT memlock check logic, now done in ib_umem_get()
> > ---
> 
> Looks good, Acked-by: Guoqing Jiang <guoqing.jiang@linux.dev>
> 
> ...
> 
> > +	for (i = 0; num_pages > 0; i++) {
> >   		int nents = min_t(int, num_pages, PAGES_PER_CHUNK);
> >   		struct page **plist =
> >   			kcalloc(nents, sizeof(struct page *), GFP_KERNEL);
> >
> >   		if (!plist) {
> >   			rv = -ENOMEM;
> > -			goto out_sem_up;
> > +			goto err_out;
> >   		}
> >   		umem->page_chunk[i].plist = plist;
> 
> One off topic question, why two dimensional list is needed for siw umem?
> Thanks in advance.
> 

not needed. any list like this can be one dimensional. I wanted to keep
the page list compact in memory. I consider moving to xarray - like rxe
does - later, but wanted to move to ib_umem_get() first.
Leon Romanovsky Nov. 13, 2023, 8:16 a.m. UTC | #3
On Sat, 04 Nov 2023 08:56:43 +0100, Bernard Metzler wrote:
> Abandon siw private code to pin user pages during user
> memory registration, but use ib_umem_get() instead.
> This will help maintaining the driver in case of changes
> to the memory subsystem.
> 
> 

Applied, thanks!

[1/1] RDMA/siw: Use ib_umem_get() to pin user pages
      https://git.kernel.org/rdma/rdma/c/476b7c7e00ec3d

Best regards,
diff mbox series

Patch

diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h
index 58dddb143b9f..6930586109d4 100644
--- a/drivers/infiniband/sw/siw/siw.h
+++ b/drivers/infiniband/sw/siw/siw.h
@@ -121,11 +121,10 @@  struct siw_page_chunk {
 };
 
 struct siw_umem {
+	struct ib_umem *base_mem;
 	struct siw_page_chunk *page_chunk;
 	int num_pages;
-	bool writable;
 	u64 fp_addr; /* First page base address */
-	struct mm_struct *owning_mm;
 };
 
 struct siw_pble {
diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c
index e6e25f15567d..6cc44df2ece5 100644
--- a/drivers/infiniband/sw/siw/siw_mem.c
+++ b/drivers/infiniband/sw/siw/siw_mem.c
@@ -5,6 +5,7 @@ 
 
 #include <linux/gfp.h>
 #include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
 #include <linux/dma-mapping.h>
 #include <linux/slab.h>
 #include <linux/sched/mm.h>
@@ -60,28 +61,17 @@  struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index)
 	return NULL;
 }
 
-static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages,
-			   bool dirty)
+void siw_umem_release(struct siw_umem *umem)
 {
-	unpin_user_pages_dirty_lock(chunk->plist, num_pages, dirty);
-}
-
-void siw_umem_release(struct siw_umem *umem, bool dirty)
-{
-	struct mm_struct *mm_s = umem->owning_mm;
 	int i, num_pages = umem->num_pages;
 
-	for (i = 0; num_pages; i++) {
-		int to_free = min_t(int, PAGES_PER_CHUNK, num_pages);
+	if (umem->base_mem)
+		ib_umem_release(umem->base_mem);
 
-		siw_free_plist(&umem->page_chunk[i], to_free,
-			       umem->writable && dirty);
+	for (i = 0; num_pages > 0; i++) {
 		kfree(umem->page_chunk[i].plist);
-		num_pages -= to_free;
+		num_pages -= PAGES_PER_CHUNK;
 	}
-	atomic64_sub(umem->num_pages, &mm_s->pinned_vm);
-
-	mmdrop(mm_s);
 	kfree(umem->page_chunk);
 	kfree(umem);
 }
@@ -145,7 +135,7 @@  void siw_free_mem(struct kref *ref)
 
 	if (!mem->is_mw && mem->mem_obj) {
 		if (mem->is_pbl == 0)
-			siw_umem_release(mem->umem, true);
+			siw_umem_release(mem->umem);
 		else
 			kfree(mem->pbl);
 	}
@@ -362,18 +352,16 @@  struct siw_pbl *siw_pbl_alloc(u32 num_buf)
 	return pbl;
 }
 
-struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
+struct siw_umem *siw_umem_get(struct ib_device *base_dev, u64 start,
+			      u64 len, int rights)
 {
 	struct siw_umem *umem;
-	struct mm_struct *mm_s;
+	struct ib_umem *base_mem;
+	struct sg_page_iter sg_iter;
+	struct sg_table *sgt;
 	u64 first_page_va;
-	unsigned long mlock_limit;
-	unsigned int foll_flags = FOLL_LONGTERM;
 	int num_pages, num_chunks, i, rv = 0;
 
-	if (!can_do_mlock())
-		return ERR_PTR(-EPERM);
-
 	if (!len)
 		return ERR_PTR(-EINVAL);
 
@@ -385,65 +373,50 @@  struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
 	if (!umem)
 		return ERR_PTR(-ENOMEM);
 
-	mm_s = current->mm;
-	umem->owning_mm = mm_s;
-	umem->writable = writable;
-
-	mmgrab(mm_s);
-
-	if (writable)
-		foll_flags |= FOLL_WRITE;
-
-	mmap_read_lock(mm_s);
-
-	mlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-	if (atomic64_add_return(num_pages, &mm_s->pinned_vm) > mlock_limit) {
-		rv = -ENOMEM;
-		goto out_sem_up;
-	}
-	umem->fp_addr = first_page_va;
-
 	umem->page_chunk =
 		kcalloc(num_chunks, sizeof(struct siw_page_chunk), GFP_KERNEL);
 	if (!umem->page_chunk) {
 		rv = -ENOMEM;
-		goto out_sem_up;
+		goto err_out;
 	}
-	for (i = 0; num_pages; i++) {
+	base_mem = ib_umem_get(base_dev, start, len, rights);
+	if (IS_ERR(base_mem)) {
+		rv = PTR_ERR(base_mem);
+		siw_dbg(base_dev, "Cannot pin user memory: %d\n", rv);
+		goto err_out;
+	}
+	umem->fp_addr = first_page_va;
+	umem->base_mem = base_mem;
+
+	sgt = &base_mem->sgt_append.sgt;
+	__sg_page_iter_start(&sg_iter, sgt->sgl, sgt->orig_nents, 0);
+
+	if (!__sg_page_iter_next(&sg_iter)) {
+		rv = -EINVAL;
+		goto err_out;
+	}
+	for (i = 0; num_pages > 0; i++) {
 		int nents = min_t(int, num_pages, PAGES_PER_CHUNK);
 		struct page **plist =
 			kcalloc(nents, sizeof(struct page *), GFP_KERNEL);
 
 		if (!plist) {
 			rv = -ENOMEM;
-			goto out_sem_up;
+			goto err_out;
 		}
 		umem->page_chunk[i].plist = plist;
-		while (nents) {
-			rv = pin_user_pages(first_page_va, nents, foll_flags,
-					    plist);
-			if (rv < 0)
-				goto out_sem_up;
-
-			umem->num_pages += rv;
-			first_page_va += rv * PAGE_SIZE;
-			plist += rv;
-			nents -= rv;
-			num_pages -= rv;
+		while (nents--) {
+			*plist = sg_page_iter_page(&sg_iter);
+			umem->num_pages++;
+			num_pages--;
+			plist++;
+			if (!__sg_page_iter_next(&sg_iter))
+				break;
 		}
 	}
-out_sem_up:
-	mmap_read_unlock(mm_s);
-
-	if (rv > 0)
-		return umem;
-
-	/* Adjust accounting for pages not pinned */
-	if (num_pages)
-		atomic64_sub(num_pages, &mm_s->pinned_vm);
-
-	siw_umem_release(umem, false);
+	return umem;
+err_out:
+	siw_umem_release(umem);
 
 	return ERR_PTR(rv);
 }
diff --git a/drivers/infiniband/sw/siw/siw_mem.h b/drivers/infiniband/sw/siw/siw_mem.h
index f911287576d1..562a693f7662 100644
--- a/drivers/infiniband/sw/siw/siw_mem.h
+++ b/drivers/infiniband/sw/siw/siw_mem.h
@@ -6,8 +6,9 @@ 
 #ifndef _SIW_MEM_H
 #define _SIW_MEM_H
 
-struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable);
-void siw_umem_release(struct siw_umem *umem, bool dirty);
+struct siw_umem *siw_umem_get(struct ib_device *base_dave, u64 start,
+			      u64 len, int rights);
+void siw_umem_release(struct siw_umem *umem);
 struct siw_pbl *siw_pbl_alloc(u32 num_buf);
 dma_addr_t siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx);
 struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index);
diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
index fdbef3254e30..5910207f60b1 100644
--- a/drivers/infiniband/sw/siw/siw_verbs.c
+++ b/drivers/infiniband/sw/siw/siw_verbs.c
@@ -1321,8 +1321,6 @@  struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
 	struct siw_umem *umem = NULL;
 	struct siw_ureq_reg_mr ureq;
 	struct siw_device *sdev = to_siw_dev(pd->device);
-
-	unsigned long mem_limit = rlimit(RLIMIT_MEMLOCK);
 	int rv;
 
 	siw_dbg_pd(pd, "start: 0x%pK, va: 0x%pK, len: %llu\n",
@@ -1338,20 +1336,7 @@  struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
 		rv = -EINVAL;
 		goto err_out;
 	}
-	if (mem_limit != RLIM_INFINITY) {
-		unsigned long num_pages =
-			(PAGE_ALIGN(len + (start & ~PAGE_MASK))) >> PAGE_SHIFT;
-		mem_limit >>= PAGE_SHIFT;
-
-		if (num_pages > mem_limit - current->mm->locked_vm) {
-			siw_dbg_pd(pd, "pages req %lu, max %lu, lock %lu\n",
-				   num_pages, mem_limit,
-				   current->mm->locked_vm);
-			rv = -ENOMEM;
-			goto err_out;
-		}
-	}
-	umem = siw_umem_get(start, len, ib_access_writable(rights));
+	umem = siw_umem_get(pd->device, start, len, rights);
 	if (IS_ERR(umem)) {
 		rv = PTR_ERR(umem);
 		siw_dbg_pd(pd, "getting user memory failed: %d\n", rv);
@@ -1404,7 +1389,7 @@  struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
 		kfree_rcu(mr, rcu);
 	} else {
 		if (umem)
-			siw_umem_release(umem, false);
+			siw_umem_release(umem);
 	}
 	return ERR_PTR(rv);
 }