
[RFC,8/9] RDMA/umem: batch page unpin in __ib_umem_release()

Message ID 20201208172901.17384-10-joao.m.martins@oracle.com (mailing list archive)
State New, archived
Series mm, sparse-vmemmap: Introduce compound pagemaps

Commit Message

Joao Martins Dec. 8, 2020, 5:29 p.m. UTC
Take advantage of the newly added unpin_user_pages() batched
refcount update by building a page array from the SGL (the same
size as the one used in ib_umem_get()) and calling
unpin_user_pages() on it.

unpin_user_pages() detects consecutive pages belonging to the
same compound page and batches their refcount update into a
single write.

Running a test program which calls MR reg/unreg on a 1G region and
measures the cost of both operations together (in a guest using rxe),
with device-dax and hugetlbfs backing:

Before:
159 rounds in 5.027 sec: 31617.923 usec / round (device-dax)
466 rounds in 5.009 sec: 10748.456 usec / round (hugetlbfs)

After:
 305 rounds in 5.010 sec: 16426.047 usec / round (device-dax)
1073 rounds in 5.004 sec: 4663.622 usec / round (hugetlbfs)

We also see similar improvements on a setup with pmem and RDMA hardware.

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
 drivers/infiniband/core/umem.c | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

Comments

John Hubbard Dec. 9, 2020, 5:18 a.m. UTC | #1
On 12/8/20 9:29 AM, Joao Martins wrote:
> Take advantage of the newly added unpin_user_pages() batched
> refcount update by building a page array from the SGL (the same
> size as the one used in ib_umem_get()) and calling
> unpin_user_pages() on it.
> 
> unpin_user_pages() detects consecutive pages belonging to the
> same compound page and batches their refcount update into a
> single write.
> 
> Running a test program which calls MR reg/unreg on a 1G region and
> measures the cost of both operations together (in a guest using rxe),
> with device-dax and hugetlbfs backing:
> 
> Before:
> 159 rounds in 5.027 sec: 31617.923 usec / round (device-dax)
> 466 rounds in 5.009 sec: 10748.456 usec / round (hugetlbfs)
> 
> After:
>   305 rounds in 5.010 sec: 16426.047 usec / round (device-dax)
> 1073 rounds in 5.004 sec: 4663.622 usec / round (hugetlbfs)
> 
> We also see similar improvements on a setup with pmem and RDMA hardware.
> 
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> ---
>   drivers/infiniband/core/umem.c | 25 ++++++++++++++++++++++---
>   1 file changed, 22 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
> index e9fecbdf391b..493cfdcf7381 100644
> --- a/drivers/infiniband/core/umem.c
> +++ b/drivers/infiniband/core/umem.c
> @@ -44,20 +44,40 @@
>   
>   #include "uverbs.h"
>   
> +#define PAGES_PER_LIST (PAGE_SIZE / sizeof(struct page *))

I was going to maybe suggest that this item, and the "bool make_dirty" cleanup,
be a separate patch, because they are just cleanups. But the memory allocation issue
below might make that whole (minor) point obsolete.

> +
>   static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
>   {
> +	bool make_dirty = umem->writable && dirty;
> +	struct page **page_list = NULL;
>   	struct sg_page_iter sg_iter;
> +	unsigned long nr = 0;
>   	struct page *page;
>   
> +	page_list = (struct page **) __get_free_page(GFP_KERNEL);

Yeah, allocating memory in a free/release path is not good. By the way,
for future use, I see that kmalloc() is generally recommended these days
(that's a change) when you just want storage for pointers, as opposed to
wanting actual struct pages:

https://lore.kernel.org/lkml/CA+55aFwyxJ+TOpaJZnC5MPJ-25xbLAEu8iJP8zTYhmA3LXFF8Q@mail.gmail.com/
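
For illustration, the kmalloc() form of that allocation might look like
the snippet below (a sketch reusing PAGES_PER_LIST from the patch; note
that the follow-up further down drops the allocation in this path
entirely):

	struct page **page_list;

	/* kmalloc_array() instead of __get_free_page() for an array of
	 * page pointers; NULL on failure, like the original. */
	page_list = kmalloc_array(PAGES_PER_LIST, sizeof(struct page *),
				  GFP_KERNEL);

	/* ... use page_list for the batched unpin, then: */
	kfree(page_list);

With 4K pages and 8-byte pointers, PAGES_PER_LIST works out to 512
entries, so each batch covers up to 2MB of pinned user memory.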

> +
>   	if (umem->nmap > 0)
>   		ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents,
>   				DMA_BIDIRECTIONAL);
>   
>   	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
>   		page = sg_page_iter_page(&sg_iter);
> -		unpin_user_pages_dirty_lock(&page, 1, umem->writable && dirty);
> +		if (page_list)
> +			page_list[nr++] = page;
> +
> +		if (!page_list) {
> +			unpin_user_pages_dirty_lock(&page, 1, make_dirty);
> +		} else if (nr == PAGES_PER_LIST) {
> +			unpin_user_pages_dirty_lock(page_list, nr, make_dirty);
> +			nr = 0;
> +		}
>   	}
>   
> +	if (nr)
> +		unpin_user_pages_dirty_lock(page_list, nr, make_dirty);
> +
> +	if (page_list)
> +		free_page((unsigned long) page_list);
>   	sg_free_table(&umem->sg_head);
>   }
>   
> @@ -212,8 +232,7 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
>   		cond_resched();
>   		ret = pin_user_pages_fast(cur_base,
>   					  min_t(unsigned long, npages,
> -						PAGE_SIZE /
> -						sizeof(struct page *)),
> +						PAGES_PER_LIST),
>   					  gup_flags | FOLL_LONGTERM, page_list);
>   		if (ret < 0)
>   			goto umem_release;
> 

thanks,
Joao Martins Dec. 9, 2020, 10:59 a.m. UTC | #2
On 12/8/20 7:29 PM, Jason Gunthorpe wrote:
> On Tue, Dec 08, 2020 at 05:29:00PM +0000, Joao Martins wrote:
> 
>>  static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
>>  {
>> +	bool make_dirty = umem->writable && dirty;
>> +	struct page **page_list = NULL;
>>  	struct sg_page_iter sg_iter;
>> +	unsigned long nr = 0;
>>  	struct page *page;
>>  
>> +	page_list = (struct page **) __get_free_page(GFP_KERNEL);
> 
> Gah, no, don't do it like this!
> 
> Instead something like:
> 
> 	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
> 	      unpin_user_pages_range_dirty_lock(sg_page(sg), sg->length/PAGE_SIZE,
>                                                umem->writable && dirty);
> 
> And have the mm implementation split the contiguous range of pages into
> pairs of (compound head, ntails) with a bit of maths.
> 
Got it :)

I was trying to avoid another exported symbol.

Although, given your suggestion above, avoiding the export doesn't justify the efficiency/clarity lost.

	Joao
Joao Martins Dec. 19, 2020, 1:15 p.m. UTC | #3
On 12/9/20 10:59 AM, Joao Martins wrote:
> On 12/8/20 7:29 PM, Jason Gunthorpe wrote:
>> On Tue, Dec 08, 2020 at 05:29:00PM +0000, Joao Martins wrote:
>>
>>>  static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
>>>  {
>>> +	bool make_dirty = umem->writable && dirty;
>>> +	struct page **page_list = NULL;
>>>  	struct sg_page_iter sg_iter;
>>> +	unsigned long nr = 0;
>>>  	struct page *page;
>>>  
>>> +	page_list = (struct page **) __get_free_page(GFP_KERNEL);
>>
>> Gah, no, don't do it like this!
>>
>> Instead something like:
>>
>> 	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
>> 	      unpin_user_pages_range_dirty_lock(sg_page(sg), sg->length/PAGE_SIZE,
>>                                                umem->writable && dirty);
>>
>> And have the mm implementation split the contiguous range of pages into
>> pairs of (compound head, ntails) with a bit of maths.
>>
> Got it :)
> 
> I was trying to avoid another exported symbol.
> 
> Although, given your suggestion above, avoiding the export doesn't justify the efficiency/clarity lost.
> 
This more efficient suggestion of yours leads to a further speedup, from:

	1073 rounds in 5.004 sec: 4663.622 usec / round (hugetlbfs)

to

	1370 rounds in 5.003 sec: 3651.562 usec / round (hugetlbfs)

Right after I come back from holidays I will follow up on this separately from the rest of the series.

	Joao
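
For reference, the (compound head, ntails) split suggested above could
be sketched roughly as follows (hypothetical function and helper names;
the real mm implementation also has FOLL_PIN accounting to take care
of):

static void sketch_unpin_page_range(struct page *page, unsigned long npages,
				    bool make_dirty)
{
	unsigned long i = 0;

	while (i < npages) {
		struct page *head = compound_head(page + i);
		/* Pages left in this compound page, capped at the end of
		 * the requested range. */
		unsigned long ntails = min_t(unsigned long,
					     head + compound_nr(head) - (page + i),
					     npages - i);

		if (make_dirty && !PageDirty(head))
			set_page_dirty_lock(head);
		put_page_refs(head, ntails);	/* hypothetical batched put */
		i += ntails;
	}
}

It would be called once per scatterlist entry, as in the for_each_sg()
loop quoted above, so no scratch allocation is needed in the release
path.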

Patch

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index e9fecbdf391b..493cfdcf7381 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -44,20 +44,40 @@ 
 
 #include "uverbs.h"
 
+#define PAGES_PER_LIST (PAGE_SIZE / sizeof(struct page *))
+
 static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
 {
+	bool make_dirty = umem->writable && dirty;
+	struct page **page_list = NULL;
 	struct sg_page_iter sg_iter;
+	unsigned long nr = 0;
 	struct page *page;
 
+	page_list = (struct page **) __get_free_page(GFP_KERNEL);
+
 	if (umem->nmap > 0)
 		ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents,
 				DMA_BIDIRECTIONAL);
 
 	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
 		page = sg_page_iter_page(&sg_iter);
-		unpin_user_pages_dirty_lock(&page, 1, umem->writable && dirty);
+		if (page_list)
+			page_list[nr++] = page;
+
+		if (!page_list) {
+			unpin_user_pages_dirty_lock(&page, 1, make_dirty);
+		} else if (nr == PAGES_PER_LIST) {
+			unpin_user_pages_dirty_lock(page_list, nr, make_dirty);
+			nr = 0;
+		}
 	}
 
+	if (nr)
+		unpin_user_pages_dirty_lock(page_list, nr, make_dirty);
+
+	if (page_list)
+		free_page((unsigned long) page_list);
 	sg_free_table(&umem->sg_head);
 }
 
@@ -212,8 +232,7 @@  struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
 		cond_resched();
 		ret = pin_user_pages_fast(cur_base,
 					  min_t(unsigned long, npages,
-						PAGE_SIZE /
-						sizeof(struct page *)),
+						PAGES_PER_LIST),
 					  gup_flags | FOLL_LONGTERM, page_list);
 		if (ret < 0)
 			goto umem_release;