diff mbox series

[rdma-next,v4,4/4] RDMA/umem: Move to allocate SG table from pages

Message ID 20200927064647.3106737-5-leon@kernel.org
State New, archived
Headers show
Series Dynamicaly allocate SG table from the pages | expand

Commit Message

Leon Romanovsky Sept. 27, 2020, 6:46 a.m. UTC
From: Maor Gottlieb <maorg@nvidia.com>

Remove the implementation of ib_umem_add_sg_table and instead
call to __sg_alloc_table_from_pages which already has the logic to
merge contiguous pages.

Besides that it removes duplicated functionality, it reduces the
memory consumption of the SG table significantly. Prior to this
patch, the SG table was allocated in advance regardless consideration
of contiguous pages.

In huge pages system of 2MB page size, without this change, the SG table
would contain x512 SG entries. E.g. for 100GB memory registration:

	 Number of entries	Size
Before 	      26214400          600.0MB
After            51200		  1.2MB

Signed-off-by: Maor Gottlieb <maorg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/core/umem.c | 92 +++++-----------------------------
 1 file changed, 12 insertions(+), 80 deletions(-)

--
2.26.2

Comments

Jason Gunthorpe Sept. 29, 2020, 7:59 p.m. UTC | #1
On Sun, Sep 27, 2020 at 09:46:47AM +0300, Leon Romanovsky wrote:
> @@ -296,11 +223,17 @@ static struct ib_umem *__ib_umem_get(struct ib_device *device,
>  			goto umem_release;
> 
>  		cur_base += ret * PAGE_SIZE;
> -		npages   -= ret;
> -
> -		sg = ib_umem_add_sg_table(sg, page_list, ret,
> -			dma_get_max_seg_size(device->dma_device),
> -			&umem->sg_nents);
> +		npages -= ret;
> +		sg = __sg_alloc_table_from_pages(
> +			&umem->sg_head, page_list, ret, 0, ret << PAGE_SHIFT,
> +			dma_get_max_seg_size(device->dma_device), sg, npages,
> +			GFP_KERNEL);
> +		umem->sg_nents = umem->sg_head.nents;
> +		if (IS_ERR(sg)) {
> +			unpin_user_pages_dirty_lock(page_list, ret, 0);
> +			ret = PTR_ERR(sg);
> +			goto umem_release;
> +		}
>  	}
> 
>  	sg_mark_end(sg);

Does it still need the sg_mark_end?

Jason
Leon Romanovsky Sept. 30, 2020, 9:53 a.m. UTC | #2
On Tue, Sep 29, 2020 at 04:59:29PM -0300, Jason Gunthorpe wrote:
> On Sun, Sep 27, 2020 at 09:46:47AM +0300, Leon Romanovsky wrote:
> > @@ -296,11 +223,17 @@ static struct ib_umem *__ib_umem_get(struct ib_device *device,
> >  			goto umem_release;
> >
> >  		cur_base += ret * PAGE_SIZE;
> > -		npages   -= ret;
> > -
> > -		sg = ib_umem_add_sg_table(sg, page_list, ret,
> > -			dma_get_max_seg_size(device->dma_device),
> > -			&umem->sg_nents);
> > +		npages -= ret;
> > +		sg = __sg_alloc_table_from_pages(
> > +			&umem->sg_head, page_list, ret, 0, ret << PAGE_SHIFT,
> > +			dma_get_max_seg_size(device->dma_device), sg, npages,
> > +			GFP_KERNEL);
> > +		umem->sg_nents = umem->sg_head.nents;
> > +		if (IS_ERR(sg)) {
> > +			unpin_user_pages_dirty_lock(page_list, ret, 0);
> > +			ret = PTR_ERR(sg);
> > +			goto umem_release;
> > +		}
> >  	}
> >
> >  	sg_mark_end(sg);
>
> Does it still need the sg_mark_end?

It is preserved here for correctness, the release logic doesn't rely on
this marker, but it is better to leave it.

Thanks

>
> Jason
Jason Gunthorpe Sept. 30, 2020, 11:45 a.m. UTC | #3
On Wed, Sep 30, 2020 at 12:53:21PM +0300, Leon Romanovsky wrote:
> On Tue, Sep 29, 2020 at 04:59:29PM -0300, Jason Gunthorpe wrote:
> > On Sun, Sep 27, 2020 at 09:46:47AM +0300, Leon Romanovsky wrote:
> > > @@ -296,11 +223,17 @@ static struct ib_umem *__ib_umem_get(struct ib_device *device,
> > >  			goto umem_release;
> > >
> > >  		cur_base += ret * PAGE_SIZE;
> > > -		npages   -= ret;
> > > -
> > > -		sg = ib_umem_add_sg_table(sg, page_list, ret,
> > > -			dma_get_max_seg_size(device->dma_device),
> > > -			&umem->sg_nents);
> > > +		npages -= ret;
> > > +		sg = __sg_alloc_table_from_pages(
> > > +			&umem->sg_head, page_list, ret, 0, ret << PAGE_SHIFT,
> > > +			dma_get_max_seg_size(device->dma_device), sg, npages,
> > > +			GFP_KERNEL);
> > > +		umem->sg_nents = umem->sg_head.nents;
> > > +		if (IS_ERR(sg)) {
> > > +			unpin_user_pages_dirty_lock(page_list, ret, 0);
> > > +			ret = PTR_ERR(sg);
> > > +			goto umem_release;
> > > +		}
> > >  	}
> > >
> > >  	sg_mark_end(sg);
> >
> > Does it still need the sg_mark_end?
> 
> It is preserved here for correctness, the release logic doesn't rely on
> this marker, but it is better to leave it.

I mean, my read of __sg_alloc_table_from_pages() is that it already
placed it, the final __alloc_table() does it?

Jason
Maor Gottlieb Sept. 30, 2020, 11:53 a.m. UTC | #4
On 9/30/2020 2:45 PM, Jason Gunthorpe wrote:
> On Wed, Sep 30, 2020 at 12:53:21PM +0300, Leon Romanovsky wrote:
>> On Tue, Sep 29, 2020 at 04:59:29PM -0300, Jason Gunthorpe wrote:
>>> On Sun, Sep 27, 2020 at 09:46:47AM +0300, Leon Romanovsky wrote:
>>>> @@ -296,11 +223,17 @@ static struct ib_umem *__ib_umem_get(struct ib_device *device,
>>>>   			goto umem_release;
>>>>
>>>>   		cur_base += ret * PAGE_SIZE;
>>>> -		npages   -= ret;
>>>> -
>>>> -		sg = ib_umem_add_sg_table(sg, page_list, ret,
>>>> -			dma_get_max_seg_size(device->dma_device),
>>>> -			&umem->sg_nents);
>>>> +		npages -= ret;
>>>> +		sg = __sg_alloc_table_from_pages(
>>>> +			&umem->sg_head, page_list, ret, 0, ret << PAGE_SHIFT,
>>>> +			dma_get_max_seg_size(device->dma_device), sg, npages,
>>>> +			GFP_KERNEL);
>>>> +		umem->sg_nents = umem->sg_head.nents;
>>>> +		if (IS_ERR(sg)) {
>>>> +			unpin_user_pages_dirty_lock(page_list, ret, 0);
>>>> +			ret = PTR_ERR(sg);
>>>> +			goto umem_release;
>>>> +		}
>>>>   	}
>>>>
>>>>   	sg_mark_end(sg);
>>> Does it still need the sg_mark_end?
>> It is preserved here for correctness, the release logic doesn't rely on
>> this marker, but it is better to leave it.
> I mean, my read of __sg_alloc_table_from_pages() is that it already
> placed it, the final __alloc_table() does it?
>
> Jason


It marks the last allocated sge, but not the last populated sge (with page).
Jason Gunthorpe Sept. 30, 2020, 11:58 a.m. UTC | #5
On Wed, Sep 30, 2020 at 02:53:58PM +0300, Maor Gottlieb wrote:
> 
> On 9/30/2020 2:45 PM, Jason Gunthorpe wrote:
> > On Wed, Sep 30, 2020 at 12:53:21PM +0300, Leon Romanovsky wrote:
> > > On Tue, Sep 29, 2020 at 04:59:29PM -0300, Jason Gunthorpe wrote:
> > > > On Sun, Sep 27, 2020 at 09:46:47AM +0300, Leon Romanovsky wrote:
> > > > > @@ -296,11 +223,17 @@ static struct ib_umem *__ib_umem_get(struct ib_device *device,
> > > > >   			goto umem_release;
> > > > > 
> > > > >   		cur_base += ret * PAGE_SIZE;
> > > > > -		npages   -= ret;
> > > > > -
> > > > > -		sg = ib_umem_add_sg_table(sg, page_list, ret,
> > > > > -			dma_get_max_seg_size(device->dma_device),
> > > > > -			&umem->sg_nents);
> > > > > +		npages -= ret;
> > > > > +		sg = __sg_alloc_table_from_pages(
> > > > > +			&umem->sg_head, page_list, ret, 0, ret << PAGE_SHIFT,
> > > > > +			dma_get_max_seg_size(device->dma_device), sg, npages,
> > > > > +			GFP_KERNEL);
> > > > > +		umem->sg_nents = umem->sg_head.nents;
> > > > > +		if (IS_ERR(sg)) {
> > > > > +			unpin_user_pages_dirty_lock(page_list, ret, 0);
> > > > > +			ret = PTR_ERR(sg);
> > > > > +			goto umem_release;
> > > > > +		}
> > > > >   	}
> > > > > 
> > > > >   	sg_mark_end(sg);
> > > > Does it still need the sg_mark_end?
> > > It is preserved here for correctness, the release logic doesn't rely on
> > > this marker, but it is better to leave it.
> > I mean, my read of __sg_alloc_table_from_pages() is that it already
> > placed it, the final __alloc_table() does it?
> 
> It marks the last allocated sge, but not the last populated sge (with page).

Why are those different?

It looks like the last iteration calls __alloc_table() with an exact
number of sges

+	if (!prv) {
+		/* Only the last allocation could be less than the maximum */
+		table_size = left_pages ? SG_MAX_SINGLE_ALLOC : chunks;
+		ret = sg_alloc_table(sgt, table_size, gfp_mask);
+		if (unlikely(ret))
+			return ERR_PTR(ret);
+	}

Jason
Maor Gottlieb Sept. 30, 2020, 3:05 p.m. UTC | #6
On 9/30/2020 2:58 PM, Jason Gunthorpe wrote:
> On Wed, Sep 30, 2020 at 02:53:58PM +0300, Maor Gottlieb wrote:
>> On 9/30/2020 2:45 PM, Jason Gunthorpe wrote:
>>> On Wed, Sep 30, 2020 at 12:53:21PM +0300, Leon Romanovsky wrote:
>>>> On Tue, Sep 29, 2020 at 04:59:29PM -0300, Jason Gunthorpe wrote:
>>>>> On Sun, Sep 27, 2020 at 09:46:47AM +0300, Leon Romanovsky wrote:
>>>>>> @@ -296,11 +223,17 @@ static struct ib_umem *__ib_umem_get(struct ib_device *device,
>>>>>>    			goto umem_release;
>>>>>>
>>>>>>    		cur_base += ret * PAGE_SIZE;
>>>>>> -		npages   -= ret;
>>>>>> -
>>>>>> -		sg = ib_umem_add_sg_table(sg, page_list, ret,
>>>>>> -			dma_get_max_seg_size(device->dma_device),
>>>>>> -			&umem->sg_nents);
>>>>>> +		npages -= ret;
>>>>>> +		sg = __sg_alloc_table_from_pages(
>>>>>> +			&umem->sg_head, page_list, ret, 0, ret << PAGE_SHIFT,
>>>>>> +			dma_get_max_seg_size(device->dma_device), sg, npages,
>>>>>> +			GFP_KERNEL);
>>>>>> +		umem->sg_nents = umem->sg_head.nents;
>>>>>> +		if (IS_ERR(sg)) {
>>>>>> +			unpin_user_pages_dirty_lock(page_list, ret, 0);
>>>>>> +			ret = PTR_ERR(sg);
>>>>>> +			goto umem_release;
>>>>>> +		}
>>>>>>    	}
>>>>>>
>>>>>>    	sg_mark_end(sg);
>>>>> Does it still need the sg_mark_end?
>>>> It is preserved here for correctness, the release logic doesn't rely on
>>>> this marker, but it is better to leave it.
>>> I mean, my read of __sg_alloc_table_from_pages() is that it already
>>> placed it, the final __alloc_table() does it?
>> It marks the last allocated sge, but not the last populated sge (with page).
> Why are those different?
>
> It looks like the last iteration calls __alloc_table() with an exact
> number of sges
>
> +	if (!prv) {
> +		/* Only the last allocation could be less than the maximum */
> +		table_size = left_pages ? SG_MAX_SINGLE_ALLOC : chunks;
> +		ret = sg_alloc_table(sgt, table_size, gfp_mask);
> +		if (unlikely(ret))
> +			return ERR_PTR(ret);
> +	}
>
> Jason

This is right only for the last iteration. E.g. in the first iteration 
in case that there are more pages (left_pages), then we allocate 
SG_MAX_SINGLE_ALLOC.  We don't know how many pages from the second 
iteration will be squashed to the SGE from the first iteration.
Jason Gunthorpe Sept. 30, 2020, 3:14 p.m. UTC | #7
On Wed, Sep 30, 2020 at 06:05:15PM +0300, Maor Gottlieb wrote:
> This is right only for the last iteration. E.g. in the first iteration in
> case that there are more pages (left_pages), then we allocate
> SG_MAX_SINGLE_ALLOC.  We don't know how many pages from the second iteration
> will be squashed to the SGE from the first iteration.

Well, it is 0 or 1 SGE's. Check if the first page is mergable and
subtract one from the required length?

I dislike this sg_mark_end() it is something that should be internal,
IMHO.

Jason
Maor Gottlieb Sept. 30, 2020, 3:40 p.m. UTC | #8
On 9/30/2020 6:14 PM, Jason Gunthorpe wrote:
> On Wed, Sep 30, 2020 at 06:05:15PM +0300, Maor Gottlieb wrote:
>> This is right only for the last iteration. E.g. in the first iteration in
>> case that there are more pages (left_pages), then we allocate
>> SG_MAX_SINGLE_ALLOC.  We don't know how many pages from the second iteration
>> will be squashed to the SGE from the first iteration.
> Well, it is 0 or 1 SGE's. Check if the first page is mergable and
> subtract one from the required length?
>
> I dislike this sg_mark_end() it is something that should be internal,
> IMHO.

I can move it to __sg_alloc_table_from_pages:

         sgt->nents = tmp_nents;
+ if (!left_pages)
+         sg_mark_end(s);
  out:
         return s;

>
> Jason
Leon Romanovsky Sept. 30, 2020, 4:51 p.m. UTC | #9
On Wed, Sep 30, 2020 at 12:14:06PM -0300, Jason Gunthorpe wrote:
> On Wed, Sep 30, 2020 at 06:05:15PM +0300, Maor Gottlieb wrote:
> > This is right only for the last iteration. E.g. in the first iteration in
> > case that there are more pages (left_pages), then we allocate
> > SG_MAX_SINGLE_ALLOC.  We don't know how many pages from the second iteration
> > will be squashed to the SGE from the first iteration.
>
> Well, it is 0 or 1 SGE's. Check if the first page is mergable and
> subtract one from the required length?
>
> I dislike this sg_mark_end() it is something that should be internal,
> IMHO.

I don't think so, but Maor provided possible solution.
Can you take the patches?

Thanks

>
> Jason
diff mbox series

Patch

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 01b680b62846..0ef736970aba 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -63,73 +63,6 @@  static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
 	sg_free_table(&umem->sg_head);
 }

-/* ib_umem_add_sg_table - Add N contiguous pages to scatter table
- *
- * sg: current scatterlist entry
- * page_list: array of npage struct page pointers
- * npages: number of pages in page_list
- * max_seg_sz: maximum segment size in bytes
- * nents: [out] number of entries in the scatterlist
- *
- * Return new end of scatterlist
- */
-static struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg,
-						struct page **page_list,
-						unsigned long npages,
-						unsigned int max_seg_sz,
-						int *nents)
-{
-	unsigned long first_pfn;
-	unsigned long i = 0;
-	bool update_cur_sg = false;
-	bool first = !sg_page(sg);
-
-	/* Check if new page_list is contiguous with end of previous page_list.
-	 * sg->length here is a multiple of PAGE_SIZE and sg->offset is 0.
-	 */
-	if (!first && (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) ==
-		       page_to_pfn(page_list[0])))
-		update_cur_sg = true;
-
-	while (i != npages) {
-		unsigned long len;
-		struct page *first_page = page_list[i];
-
-		first_pfn = page_to_pfn(first_page);
-
-		/* Compute the number of contiguous pages we have starting
-		 * at i
-		 */
-		for (len = 0; i != npages &&
-			      first_pfn + len == page_to_pfn(page_list[i]) &&
-			      len < (max_seg_sz >> PAGE_SHIFT);
-		     len++)
-			i++;
-
-		/* Squash N contiguous pages from page_list into current sge */
-		if (update_cur_sg) {
-			if ((max_seg_sz - sg->length) >= (len << PAGE_SHIFT)) {
-				sg_set_page(sg, sg_page(sg),
-					    sg->length + (len << PAGE_SHIFT),
-					    0);
-				update_cur_sg = false;
-				continue;
-			}
-			update_cur_sg = false;
-		}
-
-		/* Squash N contiguous pages into next sge or first sge */
-		if (!first)
-			sg = sg_next(sg);
-
-		(*nents)++;
-		sg_set_page(sg, first_page, len << PAGE_SHIFT, 0);
-		first = false;
-	}
-
-	return sg;
-}
-
 /**
  * ib_umem_find_best_pgsz - Find best HW page size to use for this MR
  *
@@ -221,7 +154,7 @@  static struct ib_umem *__ib_umem_get(struct ib_device *device,
 	struct mm_struct *mm;
 	unsigned long npages;
 	int ret;
-	struct scatterlist *sg;
+	struct scatterlist *sg = NULL;
 	unsigned int gup_flags = FOLL_WRITE;

 	/*
@@ -276,15 +209,9 @@  static struct ib_umem *__ib_umem_get(struct ib_device *device,

 	cur_base = addr & PAGE_MASK;

-	ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL);
-	if (ret)
-		goto vma;
-
 	if (!umem->writable)
 		gup_flags |= FOLL_FORCE;

-	sg = umem->sg_head.sgl;
-
 	while (npages) {
 		cond_resched();
 		ret = pin_user_pages_fast(cur_base,
@@ -296,11 +223,17 @@  static struct ib_umem *__ib_umem_get(struct ib_device *device,
 			goto umem_release;

 		cur_base += ret * PAGE_SIZE;
-		npages   -= ret;
-
-		sg = ib_umem_add_sg_table(sg, page_list, ret,
-			dma_get_max_seg_size(device->dma_device),
-			&umem->sg_nents);
+		npages -= ret;
+		sg = __sg_alloc_table_from_pages(
+			&umem->sg_head, page_list, ret, 0, ret << PAGE_SHIFT,
+			dma_get_max_seg_size(device->dma_device), sg, npages,
+			GFP_KERNEL);
+		umem->sg_nents = umem->sg_head.nents;
+		if (IS_ERR(sg)) {
+			unpin_user_pages_dirty_lock(page_list, ret, 0);
+			ret = PTR_ERR(sg);
+			goto umem_release;
+		}
 	}

 	sg_mark_end(sg);
@@ -322,7 +255,6 @@  static struct ib_umem *__ib_umem_get(struct ib_device *device,

 umem_release:
 	__ib_umem_release(device, umem, 0);
-vma:
 	atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
 out:
 	free_page((unsigned long) page_list);