
[v3,5/9] SUNRPC: Count pool threads that were awoken but found no work to do

Message ID 168900734678.7514.887270657845753276.stgit@manet.1015granger.net (mailing list archive)
State New, archived
Series SUNRPC service thread scheduler optimizations

Commit Message

Chuck Lever July 10, 2023, 4:42 p.m. UTC
From: Chuck Lever <chuck.lever@oracle.com>

Measure a source of thread scheduling inefficiency -- count threads
that were awoken but found that the transport queue had already been
emptied.

An empty transport queue is possible when threads that run between
the wake_up_process() call and the woken thread returning from the
scheduler have pulled all remaining work off the transport queue
using the first svc_xprt_dequeue() in svc_get_next_xprt().

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h |    1 +
 net/sunrpc/svc.c           |    2 ++
 net/sunrpc/svc_xprt.c      |    7 ++++---
 3 files changed, 7 insertions(+), 3 deletions(-)
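
For readers following the discussion below, here is a condensed sketch of
the path through svc_get_next_xprt() that the new counter instruments. It
is a paraphrase for illustration only, not the verbatim kernel source: the
tail mirrors the hunk in this patch, the earlier steps summarize the
description above, and details such as page allocation, RQ_BUSY handling
and memory barriers are omitted.

/*
 * Condensed sketch of svc_get_next_xprt() -- paraphrased, not verbatim.
 */
static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout)
{
	struct svc_pool *pool = rqstp->rq_pool;
	long time_left;

	/* First dequeue attempt: a thread running here can drain the
	 * queue on behalf of a peer that wake_up_process() has already
	 * selected but that has not yet returned from the scheduler.
	 */
	rqstp->rq_xprt = svc_xprt_dequeue(pool);
	if (rqstp->rq_xprt)
		goto out_found;

	/* Sleep until svc_xprt_enqueue() wakes this thread or the
	 * timeout expires.
	 */
	time_left = schedule_timeout(timeout);

	/* Second dequeue attempt: if a peer emptied the queue during
	 * the window described above, this fails and we fall through.
	 */
	rqstp->rq_xprt = svc_xprt_dequeue(pool);
	if (rqstp->rq_xprt)
		goto out_found;

	if (!time_left)
		percpu_counter_inc(&pool->sp_threads_timedout);
	if (signalled() || kthread_should_stop())
		return ERR_PTR(-EINTR);

	/* Awoken before the timeout, not signalled, yet no work found:
	 * this is the event the new sp_threads_no_work counter records.
	 */
	percpu_counter_inc(&pool->sp_threads_no_work);
	return ERR_PTR(-EAGAIN);

out_found:
	/* ... prepare the transport and return it to the caller ... */
	return rqstp->rq_xprt;
}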

Comments

NeilBrown July 10, 2023, 10:29 p.m. UTC | #1
On Tue, 11 Jul 2023, Chuck Lever wrote:
> From: Chuck Lever <chuck.lever@oracle.com>
> 
> Measure a source of thread scheduling inefficiency -- count threads
> that were awoken but found that the transport queue had already been
> emptied.
> 
> An empty transport queue is possible when threads that run between
> the wake_up_process() call and the woken thread returning from the
> scheduler have pulled all remaining work off the transport queue
> using the first svc_xprt_dequeue() in svc_get_next_xprt().

I'm in two minds about this.  The data being gathered here is
potentially useful - but who is it useful to?
I think it is primarily useful for us - to understand the behaviour of
the implementation so we can know what needs to be optimised.
It isn't really of any use to a sysadmin who wants to understand how
their system is performing.

But then .. who are tracepoints for?  Developers or admins?
I guess the fact that we feel free to modify them whenever we need
means they are primarily for developers?  In which case this is a good
patch, and maybe we'll revert the functionality one day if it turns out
that we can change the implementation so that a thread is never woken
when there is no work to do ....

Thanks,
NeilBrown


> 
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
>  include/linux/sunrpc/svc.h |    1 +
>  net/sunrpc/svc.c           |    2 ++
>  net/sunrpc/svc_xprt.c      |    7 ++++---
>  3 files changed, 7 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
> index 74ea13270679..9dd3b16cc4c2 100644
> --- a/include/linux/sunrpc/svc.h
> +++ b/include/linux/sunrpc/svc.h
> @@ -43,6 +43,7 @@ struct svc_pool {
>  	struct percpu_counter	sp_threads_woken;
>  	struct percpu_counter	sp_threads_timedout;
>  	struct percpu_counter	sp_threads_starved;
> +	struct percpu_counter	sp_threads_no_work;
>  
>  #define	SP_TASK_PENDING		(0)		/* still work to do even if no
>  						 * xprt is queued. */
> diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
> index 88b7b5fb6d75..b7a02309ecb1 100644
> --- a/net/sunrpc/svc.c
> +++ b/net/sunrpc/svc.c
> @@ -518,6 +518,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
>  		percpu_counter_init(&pool->sp_threads_woken, 0, GFP_KERNEL);
>  		percpu_counter_init(&pool->sp_threads_timedout, 0, GFP_KERNEL);
>  		percpu_counter_init(&pool->sp_threads_starved, 0, GFP_KERNEL);
> +		percpu_counter_init(&pool->sp_threads_no_work, 0, GFP_KERNEL);
>  	}
>  
>  	return serv;
> @@ -595,6 +596,7 @@ svc_destroy(struct kref *ref)
>  		percpu_counter_destroy(&pool->sp_threads_woken);
>  		percpu_counter_destroy(&pool->sp_threads_timedout);
>  		percpu_counter_destroy(&pool->sp_threads_starved);
> +		percpu_counter_destroy(&pool->sp_threads_no_work);
>  	}
>  	kfree(serv->sv_pools);
>  	kfree(serv);
> diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
> index ecbccf0d89b9..6c2a702aa469 100644
> --- a/net/sunrpc/svc_xprt.c
> +++ b/net/sunrpc/svc_xprt.c
> @@ -776,9 +776,9 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout)
>  
>  	if (!time_left)
>  		percpu_counter_inc(&pool->sp_threads_timedout);
> -
>  	if (signalled() || kthread_should_stop())
>  		return ERR_PTR(-EINTR);
> +	percpu_counter_inc(&pool->sp_threads_no_work);
>  	return ERR_PTR(-EAGAIN);
>  out_found:
>  	/* Normally we will wait up to 5 seconds for any required
> @@ -1445,13 +1445,14 @@ static int svc_pool_stats_show(struct seq_file *m, void *p)
>  		return 0;
>  	}
>  
> -	seq_printf(m, "%u %llu %llu %llu %llu %llu\n",
> +	seq_printf(m, "%u %llu %llu %llu %llu %llu %llu\n",
>  		pool->sp_id,
>  		percpu_counter_sum_positive(&pool->sp_messages_arrived),
>  		percpu_counter_sum_positive(&pool->sp_sockets_queued),
>  		percpu_counter_sum_positive(&pool->sp_threads_woken),
>  		percpu_counter_sum_positive(&pool->sp_threads_timedout),
> -		percpu_counter_sum_positive(&pool->sp_threads_starved));
> +		percpu_counter_sum_positive(&pool->sp_threads_starved),
> +		percpu_counter_sum_positive(&pool->sp_threads_no_work));
>  
>  	return 0;
>  }
> 
> 
>
Chuck Lever July 10, 2023, 10:52 p.m. UTC | #2
> On Jul 10, 2023, at 6:29 PM, NeilBrown <neilb@suse.de> wrote:
> 
> On Tue, 11 Jul 2023, Chuck Lever wrote:
>> From: Chuck Lever <chuck.lever@oracle.com>
>> 
>> Measure a source of thread scheduling inefficiency -- count threads
>> that were awoken but found that the transport queue had already been
>> emptied.
>> 
>> An empty transport queue is possible when threads that run between
>> the wake_up_process() call and the woken thread returning from the
>> scheduler have pulled all remaining work off the transport queue
>> using the first svc_xprt_dequeue() in svc_get_next_xprt().
> 
> I'm in two minds about this.  The data being gathered here is
> potentially useful

It's actually pretty shocking: I've measured more than
15% of thread wake-ups find no work to do.


> - but who is it useful to?
> I think it is primarily useful for us - to understand the behaviour of
> the implementation so we can know what needs to be optimised.
> It isn't really of any use to a sysadmin who wants to understand how
> their system is performing.
> 
> But then .. who are tracepoints for?  Developers or admins?
> I guess the fact that we feel free to modify them whenever we need
> means they are primarily for developers?  In which case this is a good
> patch, and maybe we'll revert the functionality one day if it turns out
> that we can change the implementation so that a thread is never woken
> when there is no work to do ....

A reasonable question to ask. The new "starved" metric
is similar: possibly useful while we are developing the
code, but not particularly valuable for system
administrators.

How are the pool_stats used by administrators?

(And, why are they in /proc/fs/nfsd/ rather than under
something RPC-related?)


>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>> ---
>> include/linux/sunrpc/svc.h |    1 +
>> net/sunrpc/svc.c           |    2 ++
>> net/sunrpc/svc_xprt.c      |    7 ++++---
>> 3 files changed, 7 insertions(+), 3 deletions(-)
>> 
>> diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
>> index 74ea13270679..9dd3b16cc4c2 100644
>> --- a/include/linux/sunrpc/svc.h
>> +++ b/include/linux/sunrpc/svc.h
>> @@ -43,6 +43,7 @@ struct svc_pool {
>> struct percpu_counter sp_threads_woken;
>> struct percpu_counter sp_threads_timedout;
>> struct percpu_counter sp_threads_starved;
>> + struct percpu_counter sp_threads_no_work;
>> 
>> #define SP_TASK_PENDING (0) /* still work to do even if no
>> * xprt is queued. */
>> diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
>> index 88b7b5fb6d75..b7a02309ecb1 100644
>> --- a/net/sunrpc/svc.c
>> +++ b/net/sunrpc/svc.c
>> @@ -518,6 +518,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
>> percpu_counter_init(&pool->sp_threads_woken, 0, GFP_KERNEL);
>> percpu_counter_init(&pool->sp_threads_timedout, 0, GFP_KERNEL);
>> percpu_counter_init(&pool->sp_threads_starved, 0, GFP_KERNEL);
>> + percpu_counter_init(&pool->sp_threads_no_work, 0, GFP_KERNEL);
>> }
>> 
>> return serv;
>> @@ -595,6 +596,7 @@ svc_destroy(struct kref *ref)
>> percpu_counter_destroy(&pool->sp_threads_woken);
>> percpu_counter_destroy(&pool->sp_threads_timedout);
>> percpu_counter_destroy(&pool->sp_threads_starved);
>> + percpu_counter_destroy(&pool->sp_threads_no_work);
>> }
>> kfree(serv->sv_pools);
>> kfree(serv);
>> diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
>> index ecbccf0d89b9..6c2a702aa469 100644
>> --- a/net/sunrpc/svc_xprt.c
>> +++ b/net/sunrpc/svc_xprt.c
>> @@ -776,9 +776,9 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout)
>> 
>> if (!time_left)
>> percpu_counter_inc(&pool->sp_threads_timedout);
>> -
>> if (signalled() || kthread_should_stop())
>> return ERR_PTR(-EINTR);
>> + percpu_counter_inc(&pool->sp_threads_no_work);
>> return ERR_PTR(-EAGAIN);
>> out_found:
>> /* Normally we will wait up to 5 seconds for any required
>> @@ -1445,13 +1445,14 @@ static int svc_pool_stats_show(struct seq_file *m, void *p)
>> return 0;
>> }
>> 
>> - seq_printf(m, "%u %llu %llu %llu %llu %llu\n",
>> + seq_printf(m, "%u %llu %llu %llu %llu %llu %llu\n",
>> pool->sp_id,
>> percpu_counter_sum_positive(&pool->sp_messages_arrived),
>> percpu_counter_sum_positive(&pool->sp_sockets_queued),
>> percpu_counter_sum_positive(&pool->sp_threads_woken),
>> percpu_counter_sum_positive(&pool->sp_threads_timedout),
>> - percpu_counter_sum_positive(&pool->sp_threads_starved));
>> + percpu_counter_sum_positive(&pool->sp_threads_starved),
>> + percpu_counter_sum_positive(&pool->sp_threads_no_work));
>> 
>> return 0;
>> }
>> 
>> 
>> 
> 

--
Chuck Lever
NeilBrown July 11, 2023, 9:54 a.m. UTC | #3
On Tue, 11 Jul 2023, Chuck Lever III wrote:
> 
> > On Jul 10, 2023, at 6:29 PM, NeilBrown <neilb@suse.de> wrote:
> > 
> > On Tue, 11 Jul 2023, Chuck Lever wrote:
> >> From: Chuck Lever <chuck.lever@oracle.com>
> >> 
> >> Measure a source of thread scheduling inefficiency -- count threads
> >> that were awoken but found that the transport queue had already been
> >> emptied.
> >> 
> >> An empty transport queue is possible when threads that run between
> >> the wake_up_process() call and the woken thread returning from the
> >> scheduler have pulled all remaining work off the transport queue
> >> using the first svc_xprt_dequeue() in svc_get_next_xprt().
> > 
> > I'm in two minds about this.  The data being gathered here is
> > potentially useful
> 
> It's actually pretty shocking: I've measured more than
> 15% of thread wake-ups find no work to do.

That is a bigger number than I would have guessed!

> 
> 
> > - but who is it useful to?
> > I think it is primarily useful for us - to understand the behaviour of
> > the implementation so we can know what needs to be optimised.
> > It isn't really of any use to a sysadmin who wants to understand how
> > their system is performing.
> > 
> > But then .. who are tracepoints for?  Developers or admins?
> > I guess the fact that we feel free to modify them whenever we need
> > means they are primarily for developers?  In which case this is a good
> > patch, and maybe we'll revert the functionality one day if it turns out
> > that we can change the implementation so that a thread is never woken
> > when there is no work to do ....
> 
> A reasonable question to ask. The new "starved" metric
> is similar: possibly useful while we are developing the
> code, but not particularly valuable for system
> administrators.
> 
> How are the pool_stats used by administrators?

That is a fair question.  Probably not much.
Once upon a time we had stats which could show a histogram of thread
usage.  I used that to decide if the load justified more threads.
But we removed it because it was somewhat expensive and it was argued it
may not be all that useful...
I haven't really looked at any other stats in my work.  Almost always it
is a packet capture that helps me see what is happening when I have an
issue to address.

Maybe I should just accept that stats are primarily for developers and
they can be incredibly useful for that purpose, and not worry if admins
might ever need them.

> 
> (And, why are they in /proc/fs/nfsd/ rather than under
> something RPC-related?)

Maybe because we "owned" /proc/fs/nfsd/, but the RPC-related stuff is
under "net" and we didn't feel so comfortable sticking new stuff there.
Or maybe not.

Thanks,
NeilBrown
Jeff Layton July 11, 2023, 11:42 a.m. UTC | #4
On Tue, 2023-07-11 at 19:54 +1000, NeilBrown wrote:
> On Tue, 11 Jul 2023, Chuck Lever III wrote:
> > 
> > > On Jul 10, 2023, at 6:29 PM, NeilBrown <neilb@suse.de> wrote:
> > > 
> > > On Tue, 11 Jul 2023, Chuck Lever wrote:
> > > > From: Chuck Lever <chuck.lever@oracle.com>
> > > > 
> > > > Measure a source of thread scheduling inefficiency -- count threads
> > > > that were awoken but found that the transport queue had already been
> > > > emptied.
> > > > 
> > > > An empty transport queue is possible when threads that run between
> > > > the wake_up_process() call and the woken thread returning from the
> > > > scheduler have pulled all remaining work off the transport queue
> > > > using the first svc_xprt_dequeue() in svc_get_next_xprt().
> > > 
> > > I'm in two minds about this.  The data being gathered here is
> > > potentially useful
> > 
> > It's actually pretty shocking: I've measured more than
> > 15% of thread wake-ups find no work to do.
> 
> That is a bigger number than I would have guessed!
> 

I'm guessing the idea is that the receiver is waking a thread to do the
work, and that races with one that's already running? I'm sure there are
ways we can fix that, but it really does seem like we're trying to
reinvent workqueues here.

> > 
> > 
> > > - but who is it useful to?
> > > I think it is primarily useful for us - to understand the behaviour of
> > > the implementation so we can know what needs to be optimised.
> > > It isn't really of any use to a sysadmin who wants to understand how
> > > their system is performing.
> > > 
> > > But then .. who are tracepoints for?  Developers or admins?
> > > I guess the fact that we feel free to modify them whenever we need
> > > means they are primarily for developers?  In which case this is a good
> > > patch, and maybe we'll revert the functionality one day if it turns out
> > > that we can change the implementation so that a thread is never woken
> > > when there is no work to do ....
> > 
> > A reasonable question to ask. The new "starved" metric
> > is similar: possibly useful while we are developing the
> > code, but not particularly valuable for system
> > administrators.
> > 
> > How are the pool_stats used by administrators?
> 
> That is a fair question.  Probably not much.
> Once upon a time we had stats which could show a histogram of thread
> usage.  I used that to decide if the load justified more threads.
> But we removed it because it was somewhat expensive and it was argued it
> may not be all that useful...
> I haven't really looked at any other stats in my work.  Almost always it
> is a packet capture that helps me see what is happening when I have an
> issue to address.
> 
> Maybe I should just accept that stats are primarily for developers and
> they can be incredibly useful for that purpose, and not worry if admins
> might ever need them.
> 
> > 
> > (And, why are they in /proc/fs/nfsd/ rather than under
> > something RPC-related?)
> 
> Maybe because we "owned" /proc/fs/nfsd/, but the RPC-related stuff is
> under "net" and we didn't feel so comfortable sticking new stuff there.
> Or maybe not.
>
NeilBrown July 11, 2023, 12:17 p.m. UTC | #5
On Tue, 11 Jul 2023, Jeff Layton wrote:
> On Tue, 2023-07-11 at 19:54 +1000, NeilBrown wrote:
> > On Tue, 11 Jul 2023, Chuck Lever III wrote:
> > > 
> > > > On Jul 10, 2023, at 6:29 PM, NeilBrown <neilb@suse.de> wrote:
> > > > 
> > > > On Tue, 11 Jul 2023, Chuck Lever wrote:
> > > > > From: Chuck Lever <chuck.lever@oracle.com>
> > > > > 
> > > > > Measure a source of thread scheduling inefficiency -- count threads
> > > > > that were awoken but found that the transport queue had already been
> > > > > emptied.
> > > > > 
> > > > > An empty transport queue is possible when threads that run between
> > > > > the wake_up_process() call and the woken thread returning from the
> > > > > scheduler have pulled all remaining work off the transport queue
> > > > > using the first svc_xprt_dequeue() in svc_get_next_xprt().
> > > > 
> > > > I'm in two minds about this.  The data being gathered here is
> > > > potentially useful
> > > 
> > > It's actually pretty shocking: I've measured more than
> > > 15% of thread wake-ups find no work to do.
> > 
> > That is a bigger number than I would have guessed!
> > 
> 
> I'm guessing the idea is that the receiver is waking a thread to do the
> work, and that races with one that's already running? I'm sure there are
> ways we can fix that, but it really does seem like we're trying to
> reinvent workqueues here.

True.  But then workqueues seem to reinvent themselves every so often
too.  One gets the impression they are trying to meet an enormous
variety of needs.
I'm not against trying to see if nfsd could work well in a workqueue
environment, but I'm not certain it is a good idea.  Maintaining control
of our own thread pools might be safer.

I have a vague memory of looking into this in more detail once and
deciding that it wasn't a good fit, but I cannot recall or easily deduce
the reason.  Obviously we would have to give up SIGKILL, but we want to
do that anyway.

Would we want unbound work queues - so they can be scheduled across
different CPUs?  Are NFSD threads cpu-intensive or not?  I'm not sure.

I would be happy to explore a credible attempt at a conversion.

NeilBrown


> 
> > > 
> > > 
> > > > - but who is it useful to?
> > > > I think it is primarily useful for us - to understand the behaviour of
> > > > the implementation so we can know what needs to be optimised.
> > > > It isn't really of any use to a sysadmin who wants to understand how
> > > > their system is performing.
> > > > 
> > > > But then .. who are tracepoints for?  Developers or admins?
> > > > I guess the fact that we feel free to modify them whenever we need
> > > > means they are primarily for developers?  In which case this is a good
> > > > patch, and maybe we'll revert the functionality one day if it turns out
> > > > that we can change the implementation so that a thread is never woken
> > > > when there is no work to do ....
> > > 
> > > A reasonable question to ask. The new "starved" metric
> > > is similar: possibly useful while we are developing the
> > > code, but not particularly valuable for system
> > > administrators.
> > > 
> > > How are the pool_stats used by administrators?
> > 
> > That is a fair question.  Probably not much.
> > Once upon a time we had stats which could show a histogram of thread
> > usage.  I used that to decide if the load justified more threads.
> > But we removed it because it was somewhat expensive and it was argued it
> > may not be all that useful...
> > I haven't really looked at any other stats in my work.  Almost always it
> > is a packet capture that helps me see what is happening when I have an
> > issue to address.
> > 
> > Maybe I should just accept that stats are primarily for developers and
> > they can be incredibly useful for that purpose, and not worry if admins
> > might ever need them.
> > 
> > > 
> > > (And, why are they in /proc/fs/nfsd/ rather than under
> > > something RPC-related?)
> > 
> > Maybe because we "owned" /proc/fs/nfsd/, but the RPC-related stuff is
> > under "net" and we didn't feel so comfortable sticking new stuff there.
> > Or maybe not.
> > 
> 
> -- 
> Jeff Layton <jlayton@redhat.com>
> 
>
Chuck Lever July 11, 2023, 12:29 p.m. UTC | #6
> On Jul 11, 2023, at 8:17 AM, NeilBrown <neilb@suse.de> wrote:
> 
> On Tue, 11 Jul 2023, Jeff Layton wrote:
>> On Tue, 2023-07-11 at 19:54 +1000, NeilBrown wrote:
>>> On Tue, 11 Jul 2023, Chuck Lever III wrote:
>>>> 
>>>>> On Jul 10, 2023, at 6:29 PM, NeilBrown <neilb@suse.de> wrote:
>>>>> 
>>>>> On Tue, 11 Jul 2023, Chuck Lever wrote:
>>>>>> From: Chuck Lever <chuck.lever@oracle.com>
>>>>>> 
>>>>>> Measure a source of thread scheduling inefficiency -- count threads
>>>>>> that were awoken but found that the transport queue had already been
>>>>>> emptied.
>>>>>> 
>>>>>> An empty transport queue is possible when threads that run between
>>>>>> the wake_up_process() call and the woken thread returning from the
>>>>>> scheduler have pulled all remaining work off the transport queue
>>>>>> using the first svc_xprt_dequeue() in svc_get_next_xprt().
>>>>> 
>>>>> I'm in two minds about this.  The data being gathered here is
>>>>> potentially useful
>>>> 
>>>> It's actually pretty shocking: I've measured more than
>>>> 15% of thread wake-ups find no work to do.
>>> 
>>> That is a bigger number than I would have guessed!
>>> 
>> 
>> I'm guessing the idea is that the receiver is waking a thread to do the
>> work, and that races with one that's already running? I'm sure there are
>> ways we can fix that, but it really does seem like we're trying to
>> reinvent workqueues here.
> 
> True.  But then workqueues seem to reinvent themselves every so often
> too.  One gets the impression they are trying to meet an enormous
> variety of needs.
> I'm not against trying to see if nfsd could work well in a workqueue
> environment, but I'm not certain it is a good idea.  Maintaining control
> of our own thread pools might be safer.
> 
> I have a vague memory of looking into this in more detail once and
> deciding that it wasn't a good fit, but I cannot recall or easily deduce
> the reason.  Obviously we would have to give up SIGKILL, but we want to
> do that anyway.
> 
> Would we want unbound work queues - so they can be scheduled across
> different CPUs?  Are NFSD threads cpu-intensive or not?  I'm not sure.
> 
> I would be happy to explore a credible attempt at a conversion.

Jeff did one a few years back. It had some subtle performance
problems that we couldn't nail down.

My concern with workqueues is they don't seem well suited to
heavy workloads. There seems to be a lot of lock contention
unless the consumer is very very careful.


>>>>> - but who is it useful to?
>>>>> I think it is primarily useful for us - to understand the behaviour of
>>>>> the implementation so we can know what needs to be optimised.
>>>>> It isn't really of any use to a sysadmin who wants to understand how
>>>>> their system is performing.
>>>>> 
>>>>> But then .. who are tracepoints for?  Developers or admins?
>>>>> I guess the fact that we feel free to modify them whenever we need
>>>>> means they are primarily for developers?  In which case this is a good
>>>>> patch, and maybe we'll revert the functionality one day if it turns out
>>>>> that we can change the implementation so that a thread is never woken
>>>>> when there is no work to do ....
>>>> 
>>>> A reasonable question to ask. The new "starved" metric
>>>> is similar: possibly useful while we are developing the
>>>> code, but not particularly valuable for system
>>>> administrators.
>>>> 
>>>> How are the pool_stats used by administrators?
>>> 
>>> That is a fair question.  Probably not much.
>>> Once upon a time we had stats which could show a histogram of thread
>>> usage.  I used that to decide if the load justified more threads.
>>> But we removed it because it was somewhat expensive and it was argued it
>>> may not be all that useful...
>>> I haven't really looked at any other stats in my work.  Almost always it
>>> is a packet capture that helps me see what is happening when I have an
>>> issue to address.
>>> 
>>> Maybe I should just accept that stats are primarily for developers and
>>> they can be incredibly useful for that purpose, and not worry if admins
>>> might ever need them.
>>> 
>>>> 
>>>> (And, why are they in /proc/fs/nfsd/ rather than under
>>>> something RPC-related?)
>>> 
>>> Maybe because we "owned" /proc/fs/nfsd/, but the RPC-related stuff is
>>> under "net" and we didn't feel so comfortable sticking new stuff there.
>>> Or maybe not.
>>> 
>> 
>> -- 
>> Jeff Layton <jlayton@redhat.com>


--
Chuck Lever
Jeff Layton July 11, 2023, 2:07 p.m. UTC | #7
On Tue, 2023-07-11 at 22:17 +1000, NeilBrown wrote:
> On Tue, 11 Jul 2023, Jeff Layton wrote:
> > On Tue, 2023-07-11 at 19:54 +1000, NeilBrown wrote:
> > > On Tue, 11 Jul 2023, Chuck Lever III wrote:
> > > > 
> > > > > On Jul 10, 2023, at 6:29 PM, NeilBrown <neilb@suse.de> wrote:
> > > > > 
> > > > > On Tue, 11 Jul 2023, Chuck Lever wrote:
> > > > > > From: Chuck Lever <chuck.lever@oracle.com>
> > > > > > 
> > > > > > Measure a source of thread scheduling inefficiency -- count threads
> > > > > > that were awoken but found that the transport queue had already been
> > > > > > emptied.
> > > > > > 
> > > > > > An empty transport queue is possible when threads that run between
> > > > > > the wake_up_process() call and the woken thread returning from the
> > > > > > scheduler have pulled all remaining work off the transport queue
> > > > > > using the first svc_xprt_dequeue() in svc_get_next_xprt().
> > > > > 
> > > > > I'm in two minds about this.  The data being gathered here is
> > > > > potentially useful
> > > > 
> > > > It's actually pretty shocking: I've measured more than
> > > > 15% of thread wake-ups find no work to do.
> > > 
> > > That is a bigger number than I would have guessed!
> > > 
> > 
> > I'm guessing the idea is that the receiver is waking a thread to do the
> > work, and that races with one that's already running? I'm sure there are
> > ways we can fix that, but it really does seem like we're trying to
> > reinvent workqueues here.
> 
> True.  But then workqueues seem to reinvent themselves every so often
> too.  One gets the impression they are trying to meet an enormous
> variety of needs.
> I'm not against trying to see if nfsd could work well in a workqueue
> environment, but I'm not certain it is a good idea.  Maintaining control
> of our own thread pools might be safer.
> 
> I have a vague memory of looking into this in more detail once and
> deciding that it wasn't a good fit, but I cannot recall or easily deduce
> the reason.  Obviously we would have to give up SIGKILL, but we want to
> do that anyway.
> 
> Would we want unbound work queues - so they can be scheduled across
> different CPUs?  Are NFSD threads cpu-intensive or not?  I'm not sure.
> 
> I would be happy to explore a credible attempt at a conversion.
> 

I had some patches several years ago that did a conversion from nfsd
threads to workqueues. 

It worked reasonably well, but under heavy loads it didn't perform as
well as having a dedicated threadpool...which is not too surprising,
really. nfsd has been tuned for performance over years and it's fairly
"greedy" about squatting on system resources even when it's idle.

If we wanted to look again at doing this with workqueues, we'd need the
workqueue infrastructure to allow for long-running jobs that may block
for a long time. That means it might need to be more cavalier about
spinning up new workqueue threads and keeping them running when there
are a lot of concurrent, but sleeping workqueue jobs.

We probably would want unbound workqueues so we can have more jobs in
flight at any given time than cpus, given that a lot of them will end up
being blocked in some way or another.

CPU utilization is a good question. Mostly we just call down into the
filesystem to do things, and the encoding and decoding is not _that_ cpu
intensive. For most storage stacks, I suspect we don't use a lot of CPU
aside from normal copying of data.  There might be some exceptions
though, like when the underlying storage is using encryption, etc.

The main upside to switching to workqueues is that it would allow us to
get rid of a lot of fiddly, hand-tuned thread handling code. It might
also make it simpler to convert to a more asynchronous model.

For instance, it would be nice to be able to not have to put a thread to
sleep while waiting on writeback for a COMMIT. I think that'd be easier
to handle with a workqueue-like model.
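
As a purely illustrative sketch of that model (not a proposed patch --
nfsd_wq, struct nfsd_xprt_work and the helpers below are invented names),
an unbound-workqueue variant might look roughly like this:

#include <linux/printk.h>
#include <linux/workqueue.h>
#include <linux/sunrpc/svc_xprt.h>

/* Hypothetical per-transport work item; not part of the current code. */
struct nfsd_xprt_work {
	struct svc_xprt		*xprt;
	struct work_struct	work;
};

static struct workqueue_struct *nfsd_wq;

static void nfsd_xprt_workfn(struct work_struct *work)
{
	struct nfsd_xprt_work *xw = container_of(work, struct nfsd_xprt_work, work);

	/* Receive, decode, dispatch and send one request for xw->xprt.
	 * In this model the work item may sleep for a long time (for
	 * example, waiting on writeback for a COMMIT), which is why the
	 * workqueue must tolerate many long-sleeping items.
	 */
	pr_debug("nfsd: processing work for xprt %p\n", xw->xprt);
}

static void nfsd_xprt_work_init(struct nfsd_xprt_work *xw, struct svc_xprt *xprt)
{
	xw->xprt = xprt;
	INIT_WORK(&xw->work, nfsd_xprt_workfn);
}

static int nfsd_wq_setup(void)
{
	/* WQ_UNBOUND: items are not pinned to the submitting CPU, so the
	 * number of jobs in flight is not tied to the number of CPUs.
	 */
	nfsd_wq = alloc_workqueue("nfsd", WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
	return nfsd_wq ? 0 : -ENOMEM;
}

/* Called where svc_xprt_enqueue() would otherwise pick and wake a pool
 * thread: the work is bound to a transport before anything runs, so no
 * thread is ever woken without something to do.
 */
static void nfsd_wq_enqueue(struct nfsd_xprt_work *xw)
{
	queue_work(nfsd_wq, &xw->work);
}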
NeilBrown July 19, 2023, 12:39 a.m. UTC | #8
On Tue, 11 Jul 2023, Chuck Lever III wrote:
> 
> > On Jul 10, 2023, at 6:29 PM, NeilBrown <neilb@suse.de> wrote:
> > 
> > On Tue, 11 Jul 2023, Chuck Lever wrote:
> >> From: Chuck Lever <chuck.lever@oracle.com>
> >> 
> >> Measure a source of thread scheduling inefficiency -- count threads
> >> that were awoken but found that the transport queue had already been
> >> emptied.
> >> 
> >> An empty transport queue is possible when threads that run between
> >> the wake_up_process() call and the woken thread returning from the
> >> scheduler have pulled all remaining work off the transport queue
> >> using the first svc_xprt_dequeue() in svc_get_next_xprt().
> > 
> > I'm in two minds about this.  The data being gathered here is
> > potentially useful
> 
> It's actually pretty shocking: I've measured more than
> 15% of thread wake-ups find no work to do.

I'm now wondering if that is a reliable statistic.

This counter as implemented doesn't count "pool threads that were awoken
but found no work to do".  Rather, it counts "pool threads that found no
work to do, either after having been awoken, or having just completed
some other request".

And it doesn't even really count that, as it doesn't notice that lockd
"retry blocked" work (or the NFSv4.1 callback work, but we don't count
that at all I think).

Maybe we should only update the count if we had actually been woken up
recently.

NeilBrown
Chuck Lever July 19, 2023, 1:08 p.m. UTC | #9
> On Jul 18, 2023, at 8:39 PM, NeilBrown <neilb@suse.de> wrote:
> 
> On Tue, 11 Jul 2023, Chuck Lever III wrote:
>> 
>>> On Jul 10, 2023, at 6:29 PM, NeilBrown <neilb@suse.de> wrote:
>>> 
>>> On Tue, 11 Jul 2023, Chuck Lever wrote:
>>>> From: Chuck Lever <chuck.lever@oracle.com>
>>>> 
>>>> Measure a source of thread scheduling inefficiency -- count threads
>>>> that were awoken but found that the transport queue had already been
>>>> emptied.
>>>> 
>>>> An empty transport queue is possible when threads that run between
>>>> the wake_up_process() call and the woken thread returning from the
>>>> scheduler have pulled all remaining work off the transport queue
>>>> using the first svc_xprt_dequeue() in svc_get_next_xprt().
>>> 
>>> I'm in two minds about this.  The data being gathered here is
>>> potentially useful
>> 
>> It's actually pretty shocking: I've measured more than
>> 15% of thread wake-ups find no work to do.
> 
> I'm now wondering if that is a reliable statistic.
> 
> This counter as implemented doesn't count "pool threads that were awoken
> but found no work to do".  Rather, it counts "pool threads that found no
> work to do, either after having been awoken, or having just completed
> some other request".

In the current code, the only way to get to "return -EAGAIN;" is if the
thread calls schedule_timeout() (ie, it actually sleeps), then the
svc_rqst was specifically selected and awoken, and the schedule_timeout()
did not time out.

I don't see a problem.


> And it doesn't even really count that, as it doesn't notice that lockd
> "retry blocked" work (or the NFSv4.1 callback work, but we don't count
> that at all I think).
> 
> Maybe we should only update the count if we had actually been woken up
> recently.

So this one can be dropped for now since it's currently of value only
for working on the scheduler changes. If the thread scheduler were to
change such that a work item was actually assigned to a thread before
it is awoken (a la, a work queue model) then this counter would be
truly meaningless. I think we can wait for a bit.


--
Chuck Lever
NeilBrown July 19, 2023, 11:27 p.m. UTC | #10
On Wed, 19 Jul 2023, Chuck Lever III wrote:
> 
> > On Jul 18, 2023, at 8:39 PM, NeilBrown <neilb@suse.de> wrote:
> > 
> > On Tue, 11 Jul 2023, Chuck Lever III wrote:
> >> 
> >>> On Jul 10, 2023, at 6:29 PM, NeilBrown <neilb@suse.de> wrote:
> >>> 
> >>> On Tue, 11 Jul 2023, Chuck Lever wrote:
> >>>> From: Chuck Lever <chuck.lever@oracle.com>
> >>>> 
> >>>> Measure a source of thread scheduling inefficiency -- count threads
> >>>> that were awoken but found that the transport queue had already been
> >>>> emptied.
> >>>> 
> >>>> An empty transport queue is possible when threads that run between
> >>>> the wake_up_process() call and the woken thread returning from the
> >>>> scheduler have pulled all remaining work off the transport queue
> >>>> using the first svc_xprt_dequeue() in svc_get_next_xprt().
> >>> 
> >>> I'm in two minds about this.  The data being gathered here is
> >>> potentially useful
> >> 
> >> It's actually pretty shocking: I've measured more than
> >> 15% of thread wake-ups find no work to do.
> > 
> > I'm now wondering if that is a reliable statistic.
> > 
> > This counter as implemented doesn't count "pool threads that were awoken
> > but found no work to do".  Rather, it counts "pool threads that found no
> > work to do, either after having been awoken, or having just completed
> > some other request".
> 
> In the current code, the only way to get to "return -EAGAIN;" is if the
> thread calls schedule_timeout() (ie, it actually sleeps), then the
> svc_rqst was specifically selected and awoken, and the schedule_timeout()
> did not time out.
> 
> I don't see a problem.
> 

Yeah - I don't either any more.  Sorry for the noise.


> 
> > And it doesn't even really count that, as it doesn't notice that lockd
> > "retry blocked" work (or the NFSv4.1 callback work, but we don't count
> > that at all I think).
> > 
> > Maybe we should only update the count if we had actually been woken up
> > recently.
> 
> So this one can be dropped for now since it's currently of value only
> for working on the scheduler changes. If the thread scheduler were to
> change such that a work item was actually assigned to a thread before
> it is awoken (a la, a work queue model) then this counter would be
> truly meaningless. I think we can wait for a bit.
> 

We used to assign a workitem to a thread before it was woken.  I find
that model to be aesthetically pleasing.
Trond changed that in
  Commit: 22700f3c6df5 ("SUNRPC: Improve ordering of transport processing")

claiming that the wake-up time for a sleeping thread could result in
poorer throughput.  No data given but I find the reasoning quite
credible.

Thanks,
NeilBrown

> 
> --
> Chuck Lever
> 
> 
>

Patch

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 74ea13270679..9dd3b16cc4c2 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -43,6 +43,7 @@  struct svc_pool {
 	struct percpu_counter	sp_threads_woken;
 	struct percpu_counter	sp_threads_timedout;
 	struct percpu_counter	sp_threads_starved;
+	struct percpu_counter	sp_threads_no_work;
 
 #define	SP_TASK_PENDING		(0)		/* still work to do even if no
 						 * xprt is queued. */
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 88b7b5fb6d75..b7a02309ecb1 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -518,6 +518,7 @@  __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
 		percpu_counter_init(&pool->sp_threads_woken, 0, GFP_KERNEL);
 		percpu_counter_init(&pool->sp_threads_timedout, 0, GFP_KERNEL);
 		percpu_counter_init(&pool->sp_threads_starved, 0, GFP_KERNEL);
+		percpu_counter_init(&pool->sp_threads_no_work, 0, GFP_KERNEL);
 	}
 
 	return serv;
@@ -595,6 +596,7 @@  svc_destroy(struct kref *ref)
 		percpu_counter_destroy(&pool->sp_threads_woken);
 		percpu_counter_destroy(&pool->sp_threads_timedout);
 		percpu_counter_destroy(&pool->sp_threads_starved);
+		percpu_counter_destroy(&pool->sp_threads_no_work);
 	}
 	kfree(serv->sv_pools);
 	kfree(serv);
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index ecbccf0d89b9..6c2a702aa469 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -776,9 +776,9 @@  static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout)
 
 	if (!time_left)
 		percpu_counter_inc(&pool->sp_threads_timedout);
-
 	if (signalled() || kthread_should_stop())
 		return ERR_PTR(-EINTR);
+	percpu_counter_inc(&pool->sp_threads_no_work);
 	return ERR_PTR(-EAGAIN);
 out_found:
 	/* Normally we will wait up to 5 seconds for any required
@@ -1445,13 +1445,14 @@  static int svc_pool_stats_show(struct seq_file *m, void *p)
 		return 0;
 	}
 
-	seq_printf(m, "%u %llu %llu %llu %llu %llu\n",
+	seq_printf(m, "%u %llu %llu %llu %llu %llu %llu\n",
 		pool->sp_id,
 		percpu_counter_sum_positive(&pool->sp_messages_arrived),
 		percpu_counter_sum_positive(&pool->sp_sockets_queued),
 		percpu_counter_sum_positive(&pool->sp_threads_woken),
 		percpu_counter_sum_positive(&pool->sp_threads_timedout),
-		percpu_counter_sum_positive(&pool->sp_threads_starved));
+		percpu_counter_sum_positive(&pool->sp_threads_starved),
+		percpu_counter_sum_positive(&pool->sp_threads_no_work));
 
 	return 0;
 }
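
For illustration only (the numbers below are invented), each per-pool line
emitted by svc_pool_stats_show() now carries seven fields, in the order
given by the seq_printf() above:

  sp_id  sp_messages_arrived  sp_sockets_queued  sp_threads_woken  sp_threads_timedout  sp_threads_starved  sp_threads_no_work
  0      30045                5302               25743             10                  0                   3921

From such a line, the fraction of wake-ups that found nothing to do can be
estimated as sp_threads_no_work / sp_threads_woken; for this invented
sample that is 3921 / 25743, or roughly 15%, the same ballpark reported
earlier in the thread.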