Message ID | 168900734678.7514.887270657845753276.stgit@manet.1015granger.net
---|---
State | New, archived
Series | SUNRPC service thread scheduler optimizations
On Tue, 11 Jul 2023, Chuck Lever wrote: > From: Chuck Lever <chuck.lever@oracle.com> > > Measure a source of thread scheduling inefficiency -- count threads > that were awoken but found that the transport queue had already been > emptied. > > An empty transport queue is possible when threads that run between > the wake_up_process() call and the woken thread returning from the > scheduler have pulled all remaining work off the transport queue > using the first svc_xprt_dequeue() in svc_get_next_xprt(). I'm in two minds about this. The data being gathered here is potentially useful - but who it is useful to? I think it is primarily useful for us - to understand the behaviour of the implementation so we can know what needs to be optimised. It isn't really of any use to a sysadmin who wants to understand how their system is performing. But then .. who are tracepoints for? Developers or admins? I guess that fact that we feel free to modify them whenever we need means they are primarily for developers? In which case this is a good patch, and maybe we'll revert the functionality one day if it turns out that we can change the implementation so that a thread is never woken when there is no work to do .... Thanks, NeilBrown > > Signed-off-by: Chuck Lever <chuck.lever@oracle.com> > --- > include/linux/sunrpc/svc.h | 1 + > net/sunrpc/svc.c | 2 ++ > net/sunrpc/svc_xprt.c | 7 ++++--- > 3 files changed, 7 insertions(+), 3 deletions(-) > > diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h > index 74ea13270679..9dd3b16cc4c2 100644 > --- a/include/linux/sunrpc/svc.h > +++ b/include/linux/sunrpc/svc.h > @@ -43,6 +43,7 @@ struct svc_pool { > struct percpu_counter sp_threads_woken; > struct percpu_counter sp_threads_timedout; > struct percpu_counter sp_threads_starved; > + struct percpu_counter sp_threads_no_work; > > #define SP_TASK_PENDING (0) /* still work to do even if no > * xprt is queued. 
*/ > diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c > index 88b7b5fb6d75..b7a02309ecb1 100644 > --- a/net/sunrpc/svc.c > +++ b/net/sunrpc/svc.c > @@ -518,6 +518,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, > percpu_counter_init(&pool->sp_threads_woken, 0, GFP_KERNEL); > percpu_counter_init(&pool->sp_threads_timedout, 0, GFP_KERNEL); > percpu_counter_init(&pool->sp_threads_starved, 0, GFP_KERNEL); > + percpu_counter_init(&pool->sp_threads_no_work, 0, GFP_KERNEL); > } > > return serv; > @@ -595,6 +596,7 @@ svc_destroy(struct kref *ref) > percpu_counter_destroy(&pool->sp_threads_woken); > percpu_counter_destroy(&pool->sp_threads_timedout); > percpu_counter_destroy(&pool->sp_threads_starved); > + percpu_counter_destroy(&pool->sp_threads_no_work); > } > kfree(serv->sv_pools); > kfree(serv); > diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c > index ecbccf0d89b9..6c2a702aa469 100644 > --- a/net/sunrpc/svc_xprt.c > +++ b/net/sunrpc/svc_xprt.c > @@ -776,9 +776,9 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) > > if (!time_left) > percpu_counter_inc(&pool->sp_threads_timedout); > - > if (signalled() || kthread_should_stop()) > return ERR_PTR(-EINTR); > + percpu_counter_inc(&pool->sp_threads_no_work); > return ERR_PTR(-EAGAIN); > out_found: > /* Normally we will wait up to 5 seconds for any required > @@ -1445,13 +1445,14 @@ static int svc_pool_stats_show(struct seq_file *m, void *p) > return 0; > } > > - seq_printf(m, "%u %llu %llu %llu %llu %llu\n", > + seq_printf(m, "%u %llu %llu %llu %llu %llu %llu\n", > pool->sp_id, > percpu_counter_sum_positive(&pool->sp_messages_arrived), > percpu_counter_sum_positive(&pool->sp_sockets_queued), > percpu_counter_sum_positive(&pool->sp_threads_woken), > percpu_counter_sum_positive(&pool->sp_threads_timedout), > - percpu_counter_sum_positive(&pool->sp_threads_starved)); > + percpu_counter_sum_positive(&pool->sp_threads_starved), > + percpu_counter_sum_positive(&pool->sp_threads_no_work)); > > return 0; > } > > >
> On Jul 10, 2023, at 6:29 PM, NeilBrown <neilb@suse.de> wrote: > > On Tue, 11 Jul 2023, Chuck Lever wrote: >> From: Chuck Lever <chuck.lever@oracle.com> >> >> Measure a source of thread scheduling inefficiency -- count threads >> that were awoken but found that the transport queue had already been >> emptied. >> >> An empty transport queue is possible when threads that run between >> the wake_up_process() call and the woken thread returning from the >> scheduler have pulled all remaining work off the transport queue >> using the first svc_xprt_dequeue() in svc_get_next_xprt(). > > I'm in two minds about this. The data being gathered here is > potentially useful It's actually pretty shocking: I've measured more than 15% of thread wake-ups find no work to do. > - but who it is useful to? > I think it is primarily useful for us - to understand the behaviour of > the implementation so we can know what needs to be optimised. > It isn't really of any use to a sysadmin who wants to understand how > their system is performing. > > But then .. who are tracepoints for? Developers or admins? > I guess that fact that we feel free to modify them whenever we need > means they are primarily for developers? In which case this is a good > patch, and maybe we'll revert the functionality one day if it turns out > that we can change the implementation so that a thread is never woken > when there is no work to do .... A reasonable question to ask. The new "starved" metric is similar: possibly useful while we are developing the code, but not particularly valuable for system administrators. How are the pool_stats used by administrators? (And, why are they in /proc/fs/nfsd/ rather than under something RPC-related?) >> Signed-off-by: Chuck Lever <chuck.lever@oracle.com> >> --- >> include/linux/sunrpc/svc.h | 1 + >> net/sunrpc/svc.c | 2 ++ >> net/sunrpc/svc_xprt.c | 7 ++++--- >> 3 files changed, 7 insertions(+), 3 deletions(-) >> >> diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h >> index 74ea13270679..9dd3b16cc4c2 100644 >> --- a/include/linux/sunrpc/svc.h >> +++ b/include/linux/sunrpc/svc.h >> @@ -43,6 +43,7 @@ struct svc_pool { >> struct percpu_counter sp_threads_woken; >> struct percpu_counter sp_threads_timedout; >> struct percpu_counter sp_threads_starved; >> + struct percpu_counter sp_threads_no_work; >> >> #define SP_TASK_PENDING (0) /* still work to do even if no >> * xprt is queued. 
*/ >> diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c >> index 88b7b5fb6d75..b7a02309ecb1 100644 >> --- a/net/sunrpc/svc.c >> +++ b/net/sunrpc/svc.c >> @@ -518,6 +518,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, >> percpu_counter_init(&pool->sp_threads_woken, 0, GFP_KERNEL); >> percpu_counter_init(&pool->sp_threads_timedout, 0, GFP_KERNEL); >> percpu_counter_init(&pool->sp_threads_starved, 0, GFP_KERNEL); >> + percpu_counter_init(&pool->sp_threads_no_work, 0, GFP_KERNEL); >> } >> >> return serv; >> @@ -595,6 +596,7 @@ svc_destroy(struct kref *ref) >> percpu_counter_destroy(&pool->sp_threads_woken); >> percpu_counter_destroy(&pool->sp_threads_timedout); >> percpu_counter_destroy(&pool->sp_threads_starved); >> + percpu_counter_destroy(&pool->sp_threads_no_work); >> } >> kfree(serv->sv_pools); >> kfree(serv); >> diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c >> index ecbccf0d89b9..6c2a702aa469 100644 >> --- a/net/sunrpc/svc_xprt.c >> +++ b/net/sunrpc/svc_xprt.c >> @@ -776,9 +776,9 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) >> >> if (!time_left) >> percpu_counter_inc(&pool->sp_threads_timedout); >> - >> if (signalled() || kthread_should_stop()) >> return ERR_PTR(-EINTR); >> + percpu_counter_inc(&pool->sp_threads_no_work); >> return ERR_PTR(-EAGAIN); >> out_found: >> /* Normally we will wait up to 5 seconds for any required >> @@ -1445,13 +1445,14 @@ static int svc_pool_stats_show(struct seq_file *m, void *p) >> return 0; >> } >> >> - seq_printf(m, "%u %llu %llu %llu %llu %llu\n", >> + seq_printf(m, "%u %llu %llu %llu %llu %llu %llu\n", >> pool->sp_id, >> percpu_counter_sum_positive(&pool->sp_messages_arrived), >> percpu_counter_sum_positive(&pool->sp_sockets_queued), >> percpu_counter_sum_positive(&pool->sp_threads_woken), >> percpu_counter_sum_positive(&pool->sp_threads_timedout), >> - percpu_counter_sum_positive(&pool->sp_threads_starved)); >> + percpu_counter_sum_positive(&pool->sp_threads_starved), >> + percpu_counter_sum_positive(&pool->sp_threads_no_work)); >> >> return 0; >> } >> >> >> > -- Chuck Lever
On Tue, 11 Jul 2023, Chuck Lever III wrote: > > > On Jul 10, 2023, at 6:29 PM, NeilBrown <neilb@suse.de> wrote: > > > > On Tue, 11 Jul 2023, Chuck Lever wrote: > >> From: Chuck Lever <chuck.lever@oracle.com> > >> > >> Measure a source of thread scheduling inefficiency -- count threads > >> that were awoken but found that the transport queue had already been > >> emptied. > >> > >> An empty transport queue is possible when threads that run between > >> the wake_up_process() call and the woken thread returning from the > >> scheduler have pulled all remaining work off the transport queue > >> using the first svc_xprt_dequeue() in svc_get_next_xprt(). > > > > I'm in two minds about this. The data being gathered here is > > potentially useful > > It's actually pretty shocking: I've measured more than > 15% of thread wake-ups find no work to do. That is a bigger number than I would have guessed! > > > > - but who it is useful to? > > I think it is primarily useful for us - to understand the behaviour of > > the implementation so we can know what needs to be optimised. > > It isn't really of any use to a sysadmin who wants to understand how > > their system is performing. > > > > But then .. who are tracepoints for? Developers or admins? > > I guess that fact that we feel free to modify them whenever we need > > means they are primarily for developers? In which case this is a good > > patch, and maybe we'll revert the functionality one day if it turns out > > that we can change the implementation so that a thread is never woken > > when there is no work to do .... > > A reasonable question to ask. The new "starved" metric > is similar: possibly useful while we are developing the > code, but not particularly valuable for system > administrators. > > How are the pool_stats used by administrators? That is a fair question. Probably not much. Once upon a time we had stats which could show a histogram how thread usage. I used that to decided if the load justified more threads. But we removed it because it was somewhat expensive and it was argued it may not be all that useful... I haven't really looked at any other stats in my work. Almost always it is a packet capture that helps me see what is happening when I have an issue to address. Maybe I should just accept that stats are primarily for developers and they can be incredible useful for that purpose, and not worry if admins might ever need them. > > (And, why are they in /proc/fs/nfsd/ rather than under > something RPC-related?) Maybe because we "owned" /proc/fs/nfsd/, but the RPC-related stuff is under "net" and we didn't feel so comfortable sticking new stuff there. Or maybe not. Thanks, NeilBrown
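For reference, pool_stats is a plain-text table with one row per pool, and this patch appends the new counter as a seventh column. Below is a minimal sketch of how that file could be turned into a "wasted wake-up" percentage. It is not an existing tool: it assumes the column order produced by svc_pool_stats_show() in this series (pool id, messages arrived, sockets queued, threads woken, threads timed out, threads starved, threads that found no work) and simply ignores any line that is not a seven-column row.

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/fs/nfsd/pool_stats", "r");
	char line[256];

	if (!f) {
		perror("pool_stats");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		unsigned int pool;
		unsigned long long arrived, queued, woken, timedout,
				   starved, no_work;

		/* Skip the header and anything that isn't a 7-column row. */
		if (sscanf(line, "%u %llu %llu %llu %llu %llu %llu",
			   &pool, &arrived, &queued, &woken, &timedout,
			   &starved, &no_work) != 7)
			continue;
		printf("pool %u: %llu of %llu wake-ups found no work (%.1f%%)\n",
		       pool, no_work, woken,
		       woken ? 100.0 * no_work / woken : 0.0);
	}
	fclose(f);
	return 0;
}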
On Tue, 2023-07-11 at 19:54 +1000, NeilBrown wrote: > On Tue, 11 Jul 2023, Chuck Lever III wrote: > > > > > On Jul 10, 2023, at 6:29 PM, NeilBrown <neilb@suse.de> wrote: > > > > > > On Tue, 11 Jul 2023, Chuck Lever wrote: > > > > From: Chuck Lever <chuck.lever@oracle.com> > > > > > > > > Measure a source of thread scheduling inefficiency -- count threads > > > > that were awoken but found that the transport queue had already been > > > > emptied. > > > > > > > > An empty transport queue is possible when threads that run between > > > > the wake_up_process() call and the woken thread returning from the > > > > scheduler have pulled all remaining work off the transport queue > > > > using the first svc_xprt_dequeue() in svc_get_next_xprt(). > > > > > > I'm in two minds about this. The data being gathered here is > > > potentially useful > > > > It's actually pretty shocking: I've measured more than > > 15% of thread wake-ups find no work to do. > > That is a bigger number than I would have guessed! > I'm guessing the idea is that the receiver is waking a thread to do the work, and that races with one that's already running? I'm sure there are ways we can fix that, but it really does seem like we're trying to reinvent workqueues here. > > > > > > > - but who it is useful to? > > > I think it is primarily useful for us - to understand the behaviour of > > > the implementation so we can know what needs to be optimised. > > > It isn't really of any use to a sysadmin who wants to understand how > > > their system is performing. > > > > > > But then .. who are tracepoints for? Developers or admins? > > > I guess that fact that we feel free to modify them whenever we need > > > means they are primarily for developers? In which case this is a good > > > patch, and maybe we'll revert the functionality one day if it turns out > > > that we can change the implementation so that a thread is never woken > > > when there is no work to do .... > > > > A reasonable question to ask. The new "starved" metric > > is similar: possibly useful while we are developing the > > code, but not particularly valuable for system > > administrators. > > > > How are the pool_stats used by administrators? > > That is a fair question. Probably not much. > Once upon a time we had stats which could show a histogram how thread > usage. I used that to decided if the load justified more threads. > But we removed it because it was somewhat expensive and it was argued it > may not be all that useful... > I haven't really looked at any other stats in my work. Almost always it > is a packet capture that helps me see what is happening when I have an > issue to address. > > Maybe I should just accept that stats are primarily for developers and > they can be incredible useful for that purpose, and not worry if admins > might ever need them. > > > > > (And, why are they in /proc/fs/nfsd/ rather than under > > something RPC-related?) > > Maybe because we "owned" /proc/fs/nfsd/, but the RPC-related stuff is > under "net" and we didn't feel so comfortable sticking new stuff there. > Or maybe not. >
On Tue, 11 Jul 2023, Jeff Layton wrote: > On Tue, 2023-07-11 at 19:54 +1000, NeilBrown wrote: > > On Tue, 11 Jul 2023, Chuck Lever III wrote: > > > > > > > On Jul 10, 2023, at 6:29 PM, NeilBrown <neilb@suse.de> wrote: > > > > > > > > On Tue, 11 Jul 2023, Chuck Lever wrote: > > > > > From: Chuck Lever <chuck.lever@oracle.com> > > > > > > > > > > Measure a source of thread scheduling inefficiency -- count threads > > > > > that were awoken but found that the transport queue had already been > > > > > emptied. > > > > > > > > > > An empty transport queue is possible when threads that run between > > > > > the wake_up_process() call and the woken thread returning from the > > > > > scheduler have pulled all remaining work off the transport queue > > > > > using the first svc_xprt_dequeue() in svc_get_next_xprt(). > > > > > > > > I'm in two minds about this. The data being gathered here is > > > > potentially useful > > > > > > It's actually pretty shocking: I've measured more than > > > 15% of thread wake-ups find no work to do. > > > > That is a bigger number than I would have guessed! > > > > I'm guessing the idea is that the receiver is waking a thread to do the > work, and that races with one that's already running? I'm sure there are > ways we can fix that, but it really does seem like we're trying to > reinvent workqueues here. True. But then workqueues seem to reinvent themselves every so often too. Once gets the impression they are trying to meet an enormous variety of needs. I'm not against trying to see if nfsd could work well in a workqueue environment, but I'm not certain it is a good idea. Maintaining control of our own thread pools might be safer. I have a vague memory of looking into this in more detail once and deciding that it wasn't a good fit, but I cannot recall or easily deduce the reason. Obviously we would have to give up SIGKILL, but we want to do that anyway. Would we want unbound work queues - so they can be scheduled across different CPUs? Are NFSD threads cpu-intensive or not? I'm not sure. I would be happy to explore a credible attempt at a conversion. NeilBrown > > > > > > > > > > > - but who it is useful to? > > > > I think it is primarily useful for us - to understand the behaviour of > > > > the implementation so we can know what needs to be optimised. > > > > It isn't really of any use to a sysadmin who wants to understand how > > > > their system is performing. > > > > > > > > But then .. who are tracepoints for? Developers or admins? > > > > I guess that fact that we feel free to modify them whenever we need > > > > means they are primarily for developers? In which case this is a good > > > > patch, and maybe we'll revert the functionality one day if it turns out > > > > that we can change the implementation so that a thread is never woken > > > > when there is no work to do .... > > > > > > A reasonable question to ask. The new "starved" metric > > > is similar: possibly useful while we are developing the > > > code, but not particularly valuable for system > > > administrators. > > > > > > How are the pool_stats used by administrators? > > > > That is a fair question. Probably not much. > > Once upon a time we had stats which could show a histogram how thread > > usage. I used that to decided if the load justified more threads. > > But we removed it because it was somewhat expensive and it was argued it > > may not be all that useful... > > I haven't really looked at any other stats in my work. 
Almost always it > > is a packet capture that helps me see what is happening when I have an > > issue to address. > > > > Maybe I should just accept that stats are primarily for developers and > > they can be incredible useful for that purpose, and not worry if admins > > might ever need them. > > > > > > > > (And, why are they in /proc/fs/nfsd/ rather than under > > > something RPC-related?) > > > > Maybe because we "owned" /proc/fs/nfsd/, but the RPC-related stuff is > > under "net" and we didn't feel so comfortable sticking new stuff there. > > Or maybe not. > > > > -- > Jeff Layton <jlayton@redhat.com> > >
> On Jul 11, 2023, at 8:17 AM, NeilBrown <neilb@suse.de> wrote: > > On Tue, 11 Jul 2023, Jeff Layton wrote: >> On Tue, 2023-07-11 at 19:54 +1000, NeilBrown wrote: >>> On Tue, 11 Jul 2023, Chuck Lever III wrote: >>>> >>>>> On Jul 10, 2023, at 6:29 PM, NeilBrown <neilb@suse.de> wrote: >>>>> >>>>> On Tue, 11 Jul 2023, Chuck Lever wrote: >>>>>> From: Chuck Lever <chuck.lever@oracle.com> >>>>>> >>>>>> Measure a source of thread scheduling inefficiency -- count threads >>>>>> that were awoken but found that the transport queue had already been >>>>>> emptied. >>>>>> >>>>>> An empty transport queue is possible when threads that run between >>>>>> the wake_up_process() call and the woken thread returning from the >>>>>> scheduler have pulled all remaining work off the transport queue >>>>>> using the first svc_xprt_dequeue() in svc_get_next_xprt(). >>>>> >>>>> I'm in two minds about this. The data being gathered here is >>>>> potentially useful >>>> >>>> It's actually pretty shocking: I've measured more than >>>> 15% of thread wake-ups find no work to do. >>> >>> That is a bigger number than I would have guessed! >>> >> >> I'm guessing the idea is that the receiver is waking a thread to do the >> work, and that races with one that's already running? I'm sure there are >> ways we can fix that, but it really does seem like we're trying to >> reinvent workqueues here. > > True. But then workqueues seem to reinvent themselves every so often > too. Once gets the impression they are trying to meet an enormous > variety of needs. > I'm not against trying to see if nfsd could work well in a workqueue > environment, but I'm not certain it is a good idea. Maintaining control > of our own thread pools might be safer. > > I have a vague memory of looking into this in more detail once and > deciding that it wasn't a good fit, but I cannot recall or easily deduce > the reason. Obviously we would have to give up SIGKILL, but we want to > do that anyway. > > Would we want unbound work queues - so they can be scheduled across > different CPUs? Are NFSD threads cpu-intensive or not? I'm not sure. > > I would be happy to explore a credible attempt at a conversion. Jeff did one a few years back. It had some subtle performance problems that we couldn't nail down. My concern with workqueues is they don't seem well suited to heavy workloads. There seems to be a lot of lock contention unless the consumer is very very careful. >>>>> - but who it is useful to? >>>>> I think it is primarily useful for us - to understand the behaviour of >>>>> the implementation so we can know what needs to be optimised. >>>>> It isn't really of any use to a sysadmin who wants to understand how >>>>> their system is performing. >>>>> >>>>> But then .. who are tracepoints for? Developers or admins? >>>>> I guess that fact that we feel free to modify them whenever we need >>>>> means they are primarily for developers? In which case this is a good >>>>> patch, and maybe we'll revert the functionality one day if it turns out >>>>> that we can change the implementation so that a thread is never woken >>>>> when there is no work to do .... >>>> >>>> A reasonable question to ask. The new "starved" metric >>>> is similar: possibly useful while we are developing the >>>> code, but not particularly valuable for system >>>> administrators. >>>> >>>> How are the pool_stats used by administrators? >>> >>> That is a fair question. Probably not much. >>> Once upon a time we had stats which could show a histogram how thread >>> usage. 
I used that to decided if the load justified more threads. >>> But we removed it because it was somewhat expensive and it was argued it >>> may not be all that useful... >>> I haven't really looked at any other stats in my work. Almost always it >>> is a packet capture that helps me see what is happening when I have an >>> issue to address. >>> >>> Maybe I should just accept that stats are primarily for developers and >>> they can be incredible useful for that purpose, and not worry if admins >>> might ever need them. >>> >>>> >>>> (And, why are they in /proc/fs/nfsd/ rather than under >>>> something RPC-related?) >>> >>> Maybe because we "owned" /proc/fs/nfsd/, but the RPC-related stuff is >>> under "net" and we didn't feel so comfortable sticking new stuff there. >>> Or maybe not. >>> >> >> -- >> Jeff Layton <jlayton@redhat.com> -- Chuck Lever
On Tue, 2023-07-11 at 22:17 +1000, NeilBrown wrote: > On Tue, 11 Jul 2023, Jeff Layton wrote: > > On Tue, 2023-07-11 at 19:54 +1000, NeilBrown wrote: > > > On Tue, 11 Jul 2023, Chuck Lever III wrote: > > > > > > > > > On Jul 10, 2023, at 6:29 PM, NeilBrown <neilb@suse.de> wrote: > > > > > > > > > > On Tue, 11 Jul 2023, Chuck Lever wrote: > > > > > > From: Chuck Lever <chuck.lever@oracle.com> > > > > > > > > > > > > Measure a source of thread scheduling inefficiency -- count threads > > > > > > that were awoken but found that the transport queue had already been > > > > > > emptied. > > > > > > > > > > > > An empty transport queue is possible when threads that run between > > > > > > the wake_up_process() call and the woken thread returning from the > > > > > > scheduler have pulled all remaining work off the transport queue > > > > > > using the first svc_xprt_dequeue() in svc_get_next_xprt(). > > > > > > > > > > I'm in two minds about this. The data being gathered here is > > > > > potentially useful > > > > > > > > It's actually pretty shocking: I've measured more than > > > > 15% of thread wake-ups find no work to do. > > > > > > That is a bigger number than I would have guessed! > > > > > > > I'm guessing the idea is that the receiver is waking a thread to do the > > work, and that races with one that's already running? I'm sure there are > > ways we can fix that, but it really does seem like we're trying to > > reinvent workqueues here. > > True. But then workqueues seem to reinvent themselves every so often > too. Once gets the impression they are trying to meet an enormous > variety of needs. > I'm not against trying to see if nfsd could work well in a workqueue > environment, but I'm not certain it is a good idea. Maintaining control > of our own thread pools might be safer. > > I have a vague memory of looking into this in more detail once and > deciding that it wasn't a good fit, but I cannot recall or easily deduce > the reason. Obviously we would have to give up SIGKILL, but we want to > do that anyway. > > Would we want unbound work queues - so they can be scheduled across > different CPUs? Are NFSD threads cpu-intensive or not? I'm not sure. > > I would be happy to explore a credible attempt at a conversion. > I had some patches several years ago that did a conversion from nfsd threads to workqueues. It worked reasonably well, but under heavy loads it didn't perform as well as having a dedicated threadpool...which is not too surprising, really. nfsd has been tuned for performance over years and it's fairly "greedy" about squatting on system resources even when it's idle. If we wanted to look again at doing this with workqueues, we'd need the workqueue infrastructure to allow for long-running jobs that may block for a long time. That means it might need to be more cavalier about spinning up new workqueue threads and keeping them running when there are a lot of concurrent, but sleeping workqueue jobs. We probably would want unbound workqueues so we can have more jobs in flight at any given time than cpus, given that a lot of them will end up being blocked in some way or another. CPU utilization is a good question. Mostly we just call down into the filesystem to do things, and the encoding and decoding is not _that_ cpu intensive. For most storage stacks, I suspect we don't use a lot of CPU aside from normal copying of data. There might be some exceptions though, like when the underlying storage is using encryption, etc. 
The main upside to switching to workqueues is that it would allow us to get rid of a lot of fiddly, hand-tuned thread handling code. It might also make it simpler to convert to a more asynchronous model. For instance, it would be nice to be able to not have to put a thread to sleep while waiting on writeback for a COMMIT. I think that'd be easier to handle with a workqueue-like model.
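To make the model being discussed concrete, here is a minimal sketch of what an unbound-workqueue dispatch path might look like. Every name in it (nfsd_wq, struct nfsd_work, nfsd_process_work(), nfsd_enqueue()) is hypothetical rather than an existing kernel symbol; the point is only that WQ_UNBOUND workers are not pinned to a CPU and are allowed to block for long periods, which is what long-running COMMIT-style work would need.

#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/sunrpc/svc_xprt.h>

static struct workqueue_struct *nfsd_wq;	/* hypothetical */

struct nfsd_work {
	struct work_struct	work;
	struct svc_xprt		*xprt;	/* transport with data ready */
};

static void nfsd_process_work(struct work_struct *work)
{
	struct nfsd_work *nw = container_of(work, struct nfsd_work, work);

	/* Receive, decode, dispatch and send would happen here; the
	 * worker may sleep for a long time on I/O or COMMIT. */
	kfree(nw);
}

static int nfsd_wq_init(void)
{
	/* WQ_UNBOUND: workers may run on any CPU and are expected to
	 * block, unlike per-CPU kworkers. */
	nfsd_wq = alloc_workqueue("nfsd", WQ_UNBOUND, 0);
	return nfsd_wq ? 0 : -ENOMEM;
}

static void nfsd_enqueue(struct svc_xprt *xprt)
{
	struct nfsd_work *nw = kzalloc(sizeof(*nw), GFP_ATOMIC);

	if (!nw)
		return;		/* a real implementation needs a fallback */
	nw->xprt = xprt;
	INIT_WORK(&nw->work, nfsd_process_work);
	queue_work(nfsd_wq, &nw->work);
}

Whether something along these lines performs as well as a dedicated thread pool under heavy load is, as noted above, exactly the open question.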
On Tue, 11 Jul 2023, Chuck Lever III wrote: > > > On Jul 10, 2023, at 6:29 PM, NeilBrown <neilb@suse.de> wrote: > > > > On Tue, 11 Jul 2023, Chuck Lever wrote: > >> From: Chuck Lever <chuck.lever@oracle.com> > >> > >> Measure a source of thread scheduling inefficiency -- count threads > >> that were awoken but found that the transport queue had already been > >> emptied. > >> > >> An empty transport queue is possible when threads that run between > >> the wake_up_process() call and the woken thread returning from the > >> scheduler have pulled all remaining work off the transport queue > >> using the first svc_xprt_dequeue() in svc_get_next_xprt(). > > > > I'm in two minds about this. The data being gathered here is > > potentially useful > > It's actually pretty shocking: I've measured more than > 15% of thread wake-ups find no work to do. I'm now wondering if that is a reliable statistic. This counter as implemented doesn't count "pool threads that were awoken but found no work to do". Rather, it counts "pool threads that found no work to do, either after having been awoken, or having just completed some other request". And it doesn't even really count that, as it doesn't notice that lockd "retry blocked" work (or the NFSv4.1 callback work, but we don't count that at all I think). Maybe we should only update the count if we had actually been woken up recently. NeilBrown
> On Jul 18, 2023, at 8:39 PM, NeilBrown <neilb@suse.de> wrote: > > On Tue, 11 Jul 2023, Chuck Lever III wrote: >> >>> On Jul 10, 2023, at 6:29 PM, NeilBrown <neilb@suse.de> wrote: >>> >>> On Tue, 11 Jul 2023, Chuck Lever wrote: >>>> From: Chuck Lever <chuck.lever@oracle.com> >>>> >>>> Measure a source of thread scheduling inefficiency -- count threads >>>> that were awoken but found that the transport queue had already been >>>> emptied. >>>> >>>> An empty transport queue is possible when threads that run between >>>> the wake_up_process() call and the woken thread returning from the >>>> scheduler have pulled all remaining work off the transport queue >>>> using the first svc_xprt_dequeue() in svc_get_next_xprt(). >>> >>> I'm in two minds about this. The data being gathered here is >>> potentially useful >> >> It's actually pretty shocking: I've measured more than >> 15% of thread wake-ups find no work to do. > > I'm now wondering if that is a reliable statistic. > > This counter as implemented doesn't count "pool threads that were awoken > but found no work to do". Rather, it counts "pool threads that found no > work to do, either after having been awoken, or having just completed > some other request". In the current code, the only way to get to "return -EAGAIN;" is if the thread calls schedule_timeout() (ie, it actually sleeps), then the svc_rqst was specifically selected and awoken, and the schedule_timeout() did not time out. I don't see a problem. > And it doesn't even really count that is it doesn't notice that lockd > "retry blocked" work (or the NFSv4.1 callback work, but we don't count > that at all I think). > > Maybe we should only update the count if we had actually been woken up > recently. So this one can be dropped for now since it's currently of value only for working on the scheduler changes. If the thread scheduler were to change such that a work item was actually assigned to a thread before it is awoken (a la, a work queue model) then this counter would be truly meaningless. I think we can wait for a bit. -- Chuck Lever
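A simplified outline of the tail of svc_get_next_xprt(), based on the hunk quoted in this thread, illustrates the path being described: the early dequeue returns before any sleep, so -EAGAIN is only reached after the thread actually slept and then found the queue empty on the second dequeue. This is an illustrative sketch, not the verbatim kernel function; the RQ_BUSY/SP_TASK_PENDING flag handling, freezer and congestion checks are omitted.

#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/svc_xprt.h>
#include <linux/sched.h>
#include <linux/kthread.h>
#include <linux/err.h>

static struct svc_xprt *svc_get_next_xprt_outline(struct svc_rqst *rqstp,
						  long timeout)
{
	struct svc_pool *pool = rqstp->rq_pool;
	long time_left;

	/* Fast path: work already queued, no sleep involved. */
	rqstp->rq_xprt = svc_xprt_dequeue(pool);
	if (rqstp->rq_xprt)
		return rqstp->rq_xprt;

	/* Sleep until svc_xprt_enqueue() wakes this thread or the
	 * timeout expires. */
	set_current_state(TASK_INTERRUPTIBLE);
	time_left = schedule_timeout(timeout);

	/* Second chance: another thread may already have drained the
	 * queue between wake_up_process() and running here. */
	rqstp->rq_xprt = svc_xprt_dequeue(pool);
	if (rqstp->rq_xprt)
		return rqstp->rq_xprt;

	if (!time_left)
		percpu_counter_inc(&pool->sp_threads_timedout);
	if (signalled() || kthread_should_stop())
		return ERR_PTR(-EINTR);
	percpu_counter_inc(&pool->sp_threads_no_work);
	return ERR_PTR(-EAGAIN);
}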
On Wed, 19 Jul 2023, Chuck Lever III wrote: > > > On Jul 18, 2023, at 8:39 PM, NeilBrown <neilb@suse.de> wrote: > > > > On Tue, 11 Jul 2023, Chuck Lever III wrote: > >> > >>> On Jul 10, 2023, at 6:29 PM, NeilBrown <neilb@suse.de> wrote: > >>> > >>> On Tue, 11 Jul 2023, Chuck Lever wrote: > >>>> From: Chuck Lever <chuck.lever@oracle.com> > >>>> > >>>> Measure a source of thread scheduling inefficiency -- count threads > >>>> that were awoken but found that the transport queue had already been > >>>> emptied. > >>>> > >>>> An empty transport queue is possible when threads that run between > >>>> the wake_up_process() call and the woken thread returning from the > >>>> scheduler have pulled all remaining work off the transport queue > >>>> using the first svc_xprt_dequeue() in svc_get_next_xprt(). > >>> > >>> I'm in two minds about this. The data being gathered here is > >>> potentially useful > >> > >> It's actually pretty shocking: I've measured more than > >> 15% of thread wake-ups find no work to do. > > > > I'm now wondering if that is a reliable statistic. > > > > This counter as implemented doesn't count "pool threads that were awoken > > but found no work to do". Rather, it counts "pool threads that found no > > work to do, either after having been awoken, or having just completed > > some other request". > > In the current code, the only way to get to "return -EAGAIN;" is if the > thread calls schedule_timeout() (ie, it actually sleeps), then the > svc_rqst was specifically selected and awoken, and the schedule_timeout() > did not time out. > > I don't see a problem. > Yeah - I don't either any more. Sorry for the noise. > > > And it doesn't even really count that is it doesn't notice that lockd > > "retry blocked" work (or the NFSv4.1 callback work, but we don't count > > that at all I think). > > > > Maybe we should only update the count if we had actually been woken up > > recently. > > So this one can be dropped for now since it's currently of value only > for working on the scheduler changes. If the thread scheduler were to > change such that a work item was actually assigned to a thread before > it is awoken (a la, a work queue model) then this counter would be > truly meaningless. I think we can wait for a bit. > We used to assign a workitem to a thread before it was woken. I find that model to be aesthetically pleasing. Trond changed that in Commit: 22700f3c6df5 ("SUNRPC: Improve ordering of transport processing") claiming that the wake-up time for a sleeping thread could result in poorer throughput. No data given but I find the reasoning quite credible. Thanks, NeilBrown > > -- > Chuck Lever > > >
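The two enqueue models being contrasted look roughly like this. Both functions are illustrative sketches rather than the code from either era: pick_idle_thread() stands in for the real pool search, and all locking is omitted.

#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/svc_xprt.h>
#include <linux/sched.h>

/* Hypothetical helper: find an idle svc_rqst in the pool, or NULL. */
static struct svc_rqst *pick_idle_thread(struct svc_pool *pool);

/* Model A (roughly the pre-22700f3c6df5 scheme described above): hand
 * the transport to a specific thread before waking it, so a woken
 * thread always has work attached. */
static void enqueue_assign_then_wake(struct svc_pool *pool,
				     struct svc_xprt *xprt)
{
	struct svc_rqst *rqstp = pick_idle_thread(pool);

	if (rqstp) {
		rqstp->rq_xprt = xprt;		/* work assigned first */
		wake_up_process(rqstp->rq_task);
	}
}

/* Model B (roughly the current scheme): queue the transport on the pool
 * and wake some thread; it must dequeue work itself and may find that
 * another, already-running thread has drained the queue -- the case the
 * new sp_threads_no_work counter measures. */
static void enqueue_queue_then_wake(struct svc_pool *pool,
				    struct svc_xprt *xprt)
{
	struct svc_rqst *rqstp;

	list_add_tail(&xprt->xpt_ready, &pool->sp_sockets);
	rqstp = pick_idle_thread(pool);
	if (rqstp)
		wake_up_process(rqstp->rq_task);
}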
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 74ea13270679..9dd3b16cc4c2 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -43,6 +43,7 @@ struct svc_pool {
 	struct percpu_counter	sp_threads_woken;
 	struct percpu_counter	sp_threads_timedout;
 	struct percpu_counter	sp_threads_starved;
+	struct percpu_counter	sp_threads_no_work;
 
 #define	SP_TASK_PENDING		(0)		/* still work to do even if no
 						 * xprt is queued. */
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 88b7b5fb6d75..b7a02309ecb1 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -518,6 +518,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
 		percpu_counter_init(&pool->sp_threads_woken, 0, GFP_KERNEL);
 		percpu_counter_init(&pool->sp_threads_timedout, 0, GFP_KERNEL);
 		percpu_counter_init(&pool->sp_threads_starved, 0, GFP_KERNEL);
+		percpu_counter_init(&pool->sp_threads_no_work, 0, GFP_KERNEL);
 	}
 
 	return serv;
@@ -595,6 +596,7 @@ svc_destroy(struct kref *ref)
 		percpu_counter_destroy(&pool->sp_threads_woken);
 		percpu_counter_destroy(&pool->sp_threads_timedout);
 		percpu_counter_destroy(&pool->sp_threads_starved);
+		percpu_counter_destroy(&pool->sp_threads_no_work);
 	}
 	kfree(serv->sv_pools);
 	kfree(serv);
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index ecbccf0d89b9..6c2a702aa469 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -776,9 +776,9 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout)
 
 	if (!time_left)
 		percpu_counter_inc(&pool->sp_threads_timedout);
-
 	if (signalled() || kthread_should_stop())
 		return ERR_PTR(-EINTR);
+	percpu_counter_inc(&pool->sp_threads_no_work);
 	return ERR_PTR(-EAGAIN);
 out_found:
 	/* Normally we will wait up to 5 seconds for any required
@@ -1445,13 +1445,14 @@ static int svc_pool_stats_show(struct seq_file *m, void *p)
 		return 0;
 	}
 
-	seq_printf(m, "%u %llu %llu %llu %llu %llu\n",
+	seq_printf(m, "%u %llu %llu %llu %llu %llu %llu\n",
 		pool->sp_id,
 		percpu_counter_sum_positive(&pool->sp_messages_arrived),
 		percpu_counter_sum_positive(&pool->sp_sockets_queued),
 		percpu_counter_sum_positive(&pool->sp_threads_woken),
 		percpu_counter_sum_positive(&pool->sp_threads_timedout),
-		percpu_counter_sum_positive(&pool->sp_threads_starved));
+		percpu_counter_sum_positive(&pool->sp_threads_starved),
+		percpu_counter_sum_positive(&pool->sp_threads_no_work));
 
 	return 0;
 }