diff mbox series

[RFC] SUNRPC: Refresh rq_pages using a bulk page allocator

Message ID 161340498400.7780.962495219428962117.stgit@klimt.1015granger.net (mailing list archive)
State New, archived
Headers show
Series [RFC] SUNRPC: Refresh rq_pages using a bulk page allocator | expand

Commit Message

Chuck Lever Feb. 15, 2021, 4:06 p.m. UTC
Reduce the rate at which nfsd threads hammer on the page allocator.
This improves throughput scalability by enabling the nfsd threads to
run more independently of each other.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/svc_xprt.c |   44 ++++++++++++++++++++++++++++++++------------
 1 file changed, 32 insertions(+), 12 deletions(-)

Note: I haven't actually tried the "min_t(needed, 13)" bit yet.

Comments

Mel Gorman Feb. 22, 2021, 9:35 a.m. UTC | #1
On Mon, Feb 15, 2021 at 11:06:07AM -0500, Chuck Lever wrote:
> Reduce the rate at which nfsd threads hammer on the page allocator.
> This improves throughput scalability by enabling the nfsd threads to
> run more independently of each other.
> 

Sorry this is taking so long, there is a lot going on.

This patch has pre-requisites that are not in mainline which makes it
harder to evaluate what the semantics of the API should be.

> @@ -659,19 +659,33 @@ static int svc_alloc_arg(struct svc_rqst *rqstp)
>  		/* use as many pages as possible */
>  		pages = RPCSVC_MAXPAGES;
>  	}
> -	for (i = 0; i < pages ; i++)
> -		while (rqstp->rq_pages[i] == NULL) {
> -			struct page *p = alloc_page(GFP_KERNEL);
> -			if (!p) {
> -				set_current_state(TASK_INTERRUPTIBLE);
> -				if (signalled() || kthread_should_stop()) {
> -					set_current_state(TASK_RUNNING);
> -					return -EINTR;
> -				}
> -				schedule_timeout(msecs_to_jiffies(500));
> +
> +	for (needed = 0, i = 0; i < pages ; i++)
> +		if (!rqstp->rq_pages[i])
> +			needed++;
> +	if (needed) {
> +		LIST_HEAD(list);
> +
> +retry:
> +		alloc_pages_bulk(GFP_KERNEL, 0,
> +				 /* to test the retry logic: */
> +				 min_t(unsigned long, needed, 13),
> +				 &list);
> +		for (i = 0; i < pages; i++) {
> +			if (!rqstp->rq_pages[i]) {
> +				struct page *page;
> +
> +				page = list_first_entry_or_null(&list,
> +								struct page,
> +								lru);
> +				if (unlikely(!page))
> +					goto empty_list;
> +				list_del(&page->lru);
> +				rqstp->rq_pages[i] = page;
> +				needed--;
>  			}
> -			rqstp->rq_pages[i] = p;
>  		}
> +	}
>  	rqstp->rq_page_end = &rqstp->rq_pages[pages];
>  	rqstp->rq_pages[pages] = NULL; /* this might be seen in nfsd_splice_actor() */
>  

There is a conflict at the end where rq_page_end gets updated. The 5.11
code assumes that the loop around the allocator definitely gets all
the required pages. What tree is this patch based on and is it going in
during this merge window? While the conflict is "trivial" to resolve,
it would be buggy because on retry, "i" will be pointing to the wrong
index and pages potentially leak. Rather than guessing, I'd prefer to
base a series on code you've tested.

The slowpath for the bulk allocator also sucks a bit for the semantics
required by this caller. As the bulk allocator does not walk the zonelist,
it can return failures prematurely -- fine for an optimistic bulk allocator
that can return a subset of pages but not for this caller which really
wants those pages. The allocator may need NOFAIL-like semantics to walk
the zonelist if the caller really requires success or at least walk the
zonelist if the preferred zone is low on pages. This patch would also
need to preserve the schedule_timeout behaviour so it does not use a lot
of CPU time retrying allocations in the presense of memory pressure.
Chuck Lever Feb. 22, 2021, 2:58 p.m. UTC | #2
> On Feb 22, 2021, at 4:35 AM, Mel Gorman <mgorman@techsingularity.net> wrote:
> 
> On Mon, Feb 15, 2021 at 11:06:07AM -0500, Chuck Lever wrote:
>> Reduce the rate at which nfsd threads hammer on the page allocator.
>> This improves throughput scalability by enabling the nfsd threads to
>> run more independently of each other.
>> 
> 
> Sorry this is taking so long, there is a lot going on.
> 
> This patch has pre-requisites that are not in mainline which makes it
> harder to evaluate what the semantics of the API should be.
> 
>> @@ -659,19 +659,33 @@ static int svc_alloc_arg(struct svc_rqst *rqstp)
>> 		/* use as many pages as possible */
>> 		pages = RPCSVC_MAXPAGES;
>> 	}
>> -	for (i = 0; i < pages ; i++)
>> -		while (rqstp->rq_pages[i] == NULL) {
>> -			struct page *p = alloc_page(GFP_KERNEL);
>> -			if (!p) {
>> -				set_current_state(TASK_INTERRUPTIBLE);
>> -				if (signalled() || kthread_should_stop()) {
>> -					set_current_state(TASK_RUNNING);
>> -					return -EINTR;
>> -				}
>> -				schedule_timeout(msecs_to_jiffies(500));
>> +
>> +	for (needed = 0, i = 0; i < pages ; i++)
>> +		if (!rqstp->rq_pages[i])
>> +			needed++;
>> +	if (needed) {
>> +		LIST_HEAD(list);
>> +
>> +retry:
>> +		alloc_pages_bulk(GFP_KERNEL, 0,
>> +				 /* to test the retry logic: */
>> +				 min_t(unsigned long, needed, 13),
>> +				 &list);
>> +		for (i = 0; i < pages; i++) {
>> +			if (!rqstp->rq_pages[i]) {
>> +				struct page *page;
>> +
>> +				page = list_first_entry_or_null(&list,
>> +								struct page,
>> +								lru);
>> +				if (unlikely(!page))
>> +					goto empty_list;
>> +				list_del(&page->lru);
>> +				rqstp->rq_pages[i] = page;
>> +				needed--;
>> 			}
>> -			rqstp->rq_pages[i] = p;
>> 		}
>> +	}
>> 	rqstp->rq_page_end = &rqstp->rq_pages[pages];
>> 	rqstp->rq_pages[pages] = NULL; /* this might be seen in nfsd_splice_actor() */
>> 
> 
> There is a conflict at the end where rq_page_end gets updated. The 5.11
> code assumes that the loop around the allocator definitely gets all
> the required pages. What tree is this patch based on and is it going in
> during this merge window? While the conflict is "trivial" to resolve,
> it would be buggy because on retry, "i" will be pointing to the wrong
> index and pages potentially leak. Rather than guessing, I'd prefer to
> base a series on code you've tested.

I posted this patch as a proof of concept. There is a clean-up patch
that goes before it to deal properly with rq_page_end. I can post
both if you really want to apply this and play with it.


> The slowpath for the bulk allocator also sucks a bit for the semantics
> required by this caller. As the bulk allocator does not walk the zonelist,
> it can return failures prematurely -- fine for an optimistic bulk allocator
> that can return a subset of pages but not for this caller which really
> wants those pages. The allocator may need NOFAIL-like semantics to walk
> the zonelist if the caller really requires success or at least walk the
> zonelist if the preferred zone is low on pages. This patch would also
> need to preserve the schedule_timeout behaviour so it does not use a lot
> of CPU time retrying allocations in the presense of memory pressure.

Waiting half a second before trying again seems like overkill, though.


--
Chuck Lever
Mel Gorman Feb. 22, 2021, 5:43 p.m. UTC | #3
On Mon, Feb 22, 2021 at 02:58:04PM +0000, Chuck Lever wrote:
> > There is a conflict at the end where rq_page_end gets updated. The 5.11
> > code assumes that the loop around the allocator definitely gets all
> > the required pages. What tree is this patch based on and is it going in
> > during this merge window? While the conflict is "trivial" to resolve,
> > it would be buggy because on retry, "i" will be pointing to the wrong
> > index and pages potentially leak. Rather than guessing, I'd prefer to
> > base a series on code you've tested.
> 
> I posted this patch as a proof of concept. There is a clean-up patch
> that goes before it to deal properly with rq_page_end. I can post
> both if you really want to apply this and play with it.
> 

It's for the best. It doesn't belong in the series as such but it may
affect what the bulk allocator usage looks like.

> 
> > The slowpath for the bulk allocator also sucks a bit for the semantics
> > required by this caller. As the bulk allocator does not walk the zonelist,
> > it can return failures prematurely -- fine for an optimistic bulk allocator
> > that can return a subset of pages but not for this caller which really
> > wants those pages. The allocator may need NOFAIL-like semantics to walk
> > the zonelist if the caller really requires success or at least walk the
> > zonelist if the preferred zone is low on pages. This patch would also
> > need to preserve the schedule_timeout behaviour so it does not use a lot
> > of CPU time retrying allocations in the presense of memory pressure.
> 
> Waiting half a second before trying again seems like overkill, though.
> 

It is both overkill and time is not directly correlated with memory
pressure. However, I would also suggest removing the timeout as a separate
patch as it's not related to the bulk allocator in case someone does
encounter a high CPU usage problem and bisects it the patch using the
bulk allocator for the first time.
diff mbox series

Patch

diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 4730bac409b5..8f398179d818 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -647,11 +647,11 @@  static void svc_check_conn_limits(struct svc_serv *serv)
 static int svc_alloc_arg(struct svc_rqst *rqstp)
 {
 	struct svc_serv *serv = rqstp->rq_server;
+	unsigned long needed;
 	struct xdr_buf *arg;
 	int pages;
 	int i;
 
-	/* now allocate needed pages.  If we get a failure, sleep briefly */
 	pages = (serv->sv_max_mesg + 2 * PAGE_SIZE) >> PAGE_SHIFT;
 	if (pages > RPCSVC_MAXPAGES) {
 		pr_warn_once("svc: warning: pages=%u > RPCSVC_MAXPAGES=%lu\n",
@@ -659,19 +659,33 @@  static int svc_alloc_arg(struct svc_rqst *rqstp)
 		/* use as many pages as possible */
 		pages = RPCSVC_MAXPAGES;
 	}
-	for (i = 0; i < pages ; i++)
-		while (rqstp->rq_pages[i] == NULL) {
-			struct page *p = alloc_page(GFP_KERNEL);
-			if (!p) {
-				set_current_state(TASK_INTERRUPTIBLE);
-				if (signalled() || kthread_should_stop()) {
-					set_current_state(TASK_RUNNING);
-					return -EINTR;
-				}
-				schedule_timeout(msecs_to_jiffies(500));
+
+	for (needed = 0, i = 0; i < pages ; i++)
+		if (!rqstp->rq_pages[i])
+			needed++;
+	if (needed) {
+		LIST_HEAD(list);
+
+retry:
+		alloc_pages_bulk(GFP_KERNEL, 0,
+				 /* to test the retry logic: */
+				 min_t(unsigned long, needed, 13),
+				 &list);
+		for (i = 0; i < pages; i++) {
+			if (!rqstp->rq_pages[i]) {
+				struct page *page;
+
+				page = list_first_entry_or_null(&list,
+								struct page,
+								lru);
+				if (unlikely(!page))
+					goto empty_list;
+				list_del(&page->lru);
+				rqstp->rq_pages[i] = page;
+				needed--;
 			}
-			rqstp->rq_pages[i] = p;
 		}
+	}
 	rqstp->rq_page_end = &rqstp->rq_pages[pages];
 	rqstp->rq_pages[pages] = NULL; /* this might be seen in nfsd_splice_actor() */
 
@@ -686,6 +700,12 @@  static int svc_alloc_arg(struct svc_rqst *rqstp)
 	arg->len = (pages-1)*PAGE_SIZE;
 	arg->tail[0].iov_len = 0;
 	return 0;
+
+empty_list:
+	if (signalled() || kthread_should_stop())
+		return -EINTR;
+	cond_resched();
+	goto retry;
 }
 
 static bool