Message ID | 20220512030442.2530552-15-joel@joelfernandes.org (mailing list archive) |
---|---|
State | Superseded |
Series | Implement call_rcu_lazy() and miscellaneous fixes |
On Thu, May 12, 2022 at 03:04:42AM +0000, Joel Fernandes (Google) wrote:
> Add sysctl knobs just for easier debugging/testing, to tune the maximum
> batch size, maximum time to wait before flush, and turning off the
> feature entirely.
>
> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>

This is good, and might also be needed longer term.

One thought below.

							Thanx, Paul

> ---
>  include/linux/sched/sysctl.h |  4 ++++
>  kernel/rcu/lazy.c            | 12 ++++++++++--
>  kernel/sysctl.c              | 23 +++++++++++++++++++++++
>  3 files changed, 37 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
> index c19dd5a2c05c..55ffc61beed1 100644
> --- a/include/linux/sched/sysctl.h
> +++ b/include/linux/sched/sysctl.h
> @@ -16,6 +16,10 @@ enum { sysctl_hung_task_timeout_secs = 0 };
>
>  extern unsigned int sysctl_sched_child_runs_first;
>
> +extern unsigned int sysctl_rcu_lazy;
> +extern unsigned int sysctl_rcu_lazy_batch;
> +extern unsigned int sysctl_rcu_lazy_jiffies;
> +
>  enum sched_tunable_scaling {
>  	SCHED_TUNABLESCALING_NONE,
>  	SCHED_TUNABLESCALING_LOG,
> diff --git a/kernel/rcu/lazy.c b/kernel/rcu/lazy.c
> index 55e406cfc528..0af9fb67c92b 100644
> --- a/kernel/rcu/lazy.c
> +++ b/kernel/rcu/lazy.c
> @@ -12,6 +12,10 @@
>  // How much to wait before flushing?
>  #define MAX_LAZY_JIFFIES	10000
>
> +unsigned int sysctl_rcu_lazy_batch = MAX_LAZY_BATCH;
> +unsigned int sysctl_rcu_lazy_jiffies = MAX_LAZY_JIFFIES;
> +unsigned int sysctl_rcu_lazy = 1;
> +
>  // We cast lazy_rcu_head to rcu_head and back. This keeps the API simple while
>  // allowing us to use lockless list node in the head. Also, we use BUILD_BUG_ON
>  // later to ensure that rcu_head and lazy_rcu_head are of the same size.
> @@ -49,6 +53,10 @@ void call_rcu_lazy(struct rcu_head *head_rcu, rcu_callback_t func)
>  	struct lazy_rcu_head *head = (struct lazy_rcu_head *)head_rcu;
>  	struct rcu_lazy_pcp *rlp;
>
> +	if (!sysctl_rcu_lazy) {

This is the place to check for early boot use.  Or, alternatively,
initialize sysctl_rcu_lazy to zero and set it to one once boot is far
enough along to allow all the pieces to work reasonably.

> +		return call_rcu(head_rcu, func);
> +	}
> +
>  	preempt_disable();
>  	rlp = this_cpu_ptr(&rcu_lazy_pcp_ins);
>  	preempt_enable();
> @@ -67,11 +75,11 @@ void call_rcu_lazy(struct rcu_head *head_rcu, rcu_callback_t func)
>  	llist_add(&head->llist_node, &rlp->head);
>
>  	// Flush queue if too big
> -	if (atomic_inc_return(&rlp->count) >= MAX_LAZY_BATCH) {
> +	if (atomic_inc_return(&rlp->count) >= sysctl_rcu_lazy_batch) {
>  		lazy_rcu_flush_cpu(rlp);
>  	} else {
>  		if (!delayed_work_pending(&rlp->work)) {
> -			schedule_delayed_work(&rlp->work, MAX_LAZY_JIFFIES);
> +			schedule_delayed_work(&rlp->work, sysctl_rcu_lazy_jiffies);
>  		}
>  	}
>  }
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 5ae443b2882e..2ba830ca71ec 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -1659,6 +1659,29 @@ static struct ctl_table kern_table[] = {
>  		.mode		= 0644,
>  		.proc_handler	= proc_dointvec,
>  	},
> +#ifdef CONFIG_RCU_LAZY
> +	{
> +		.procname	= "rcu_lazy",
> +		.data		= &sysctl_rcu_lazy,
> +		.maxlen		= sizeof(unsigned int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec,
> +	},
> +	{
> +		.procname	= "rcu_lazy_batch",
> +		.data		= &sysctl_rcu_lazy_batch,
> +		.maxlen		= sizeof(unsigned int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec,
> +	},
> +	{
> +		.procname	= "rcu_lazy_jiffies",
> +		.data		= &sysctl_rcu_lazy_jiffies,
> +		.maxlen		= sizeof(unsigned int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec,
> +	},
> +#endif
>  #ifdef CONFIG_SCHEDSTATS
>  	{
>  		.procname	= "sched_schedstats",
> --
> 2.36.0.550.gb090851708-goog
>
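A minimal sketch of the boot-time alternative suggested above (not part of
the posted series): start with sysctl_rcu_lazy at zero and flip it once boot
is far enough along. The assumption that late_initcall() time is "far enough
along", and the function name, are illustrative only.

#include <linux/init.h>

unsigned int sysctl_rcu_lazy;	/* default off until boot is far enough along */

static int __init rcu_lazy_enable_late(void)
{
	/* Workqueues and the rest of RCU are fully up by late_initcall() time. */
	WRITE_ONCE(sysctl_rcu_lazy, 1);
	return 0;
}
late_initcall(rcu_lazy_enable_late);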
On Thu, May 12, 2022 at 05:16:22PM -0700, Paul E. McKenney wrote:
> On Thu, May 12, 2022 at 03:04:42AM +0000, Joel Fernandes (Google) wrote:
> > Add sysctl knobs just for easier debugging/testing, to tune the maximum
> > batch size, maximum time to wait before flush, and turning off the
> > feature entirely.
> >
> > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
>
> This is good, and might also be needed longer term.
>
> One thought below.
>
> 							Thanx, Paul
>
[ . . . ]
>
> > +	if (!sysctl_rcu_lazy) {
>
> This is the place to check for early boot use.  Or, alternatively,
> initialize sysctl_rcu_lazy to zero and set it to one once boot is far
> enough along to allow all the pieces to work reasonably.

Sure, I was also thinking perhaps to set this to a static branch. That way we
don't have to pay the cost of the branch after it is set up on boot.

But I think as you mentioned on IRC, if we were to promote this to a
non-DEBUG patch, we would need to handle conditions similar to Frederic's
work on toggling offloading.

thanks,

- Joel
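For reference, a rough sketch (not from the series) of what the static-branch
variant mentioned above could look like. DEFINE_STATIC_KEY_FALSE() and
static_branch_enable() are the standard jump-label API; the key name, the
initcall, and the placement of the remaining queueing logic are assumptions.

#include <linux/jump_label.h>

static DEFINE_STATIC_KEY_FALSE(rcu_lazy_key);

void call_rcu_lazy(struct rcu_head *head_rcu, rcu_callback_t func)
{
	/* Fall back to plain call_rcu() until the key is flipped at boot. */
	if (!static_branch_likely(&rcu_lazy_key))
		return call_rcu(head_rcu, func);

	/* ... per-CPU lazy queueing as in the patch ... */
}

static int __init rcu_lazy_enable_key(void)
{
	static_branch_enable(&rcu_lazy_key);	/* one-time flip once boot is done */
	return 0;
}
late_initcall(rcu_lazy_enable_key);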
On Sat, May 14, 2022 at 02:38:53PM +0000, Joel Fernandes wrote:
> On Thu, May 12, 2022 at 05:16:22PM -0700, Paul E. McKenney wrote:
> > On Thu, May 12, 2022 at 03:04:42AM +0000, Joel Fernandes (Google) wrote:
> > > Add sysctl knobs just for easier debugging/testing, to tune the maximum
> > > batch size, maximum time to wait before flush, and turning off the
> > > feature entirely.
> > >
> > > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> >
> > This is good, and might also be needed longer term.
> >
> > One thought below.
> >
> > 							Thanx, Paul
> >
[ . . . ]
> > > +	if (!sysctl_rcu_lazy) {
> >
> > This is the place to check for early boot use.  Or, alternatively,
> > initialize sysctl_rcu_lazy to zero and set it to one once boot is far
> > enough along to allow all the pieces to work reasonably.
>
> Sure, I was also thinking perhaps to set this to a static branch. That way we
> don't have to pay the cost of the branch after it is set up on boot.

A static branch would be fine, though it would be good to demonstrate
whether or not it actually provided visible benefits.

Also, please see below.

> But I think as you mentioned on IRC, if we were to promote this to a
> non-DEBUG patch, we would need to handle conditions similar to Frederic's
> work on toggling offloading.

There are several ways to approach this:

o	Make call_rcu_lazy() also work for non-offloaded CPUs.  It would
	clearly need to be default-disabled for this case, and careful
	thought would be required for distros, some of which build with
	CONFIG_NO_HZ_FULL=y but whose users almost all refrain from
	providing a nohz_full boot parameter.  In short, adding
	significant risk to the common case for the benefit of an
	uncommon case is not usually a particularly good idea.  ;-)

	And yes, proper segmentation of the cases is important.  As in
	the proper question here is "What happens to users not on
	ChromeOS or Android?"  So the fact that the common case probably
	is Android is not relevant to this line of thought.

o	Make call_rcu_lazy() work on a per-CPU basis, so that a lazy CPU
	cannot be de-offloaded.

o	Make call_rcu_lazy() work on a per-CPU basis, but make flushing
	the lazy queue part of the de-offloading process.  Careful with
	the static branch in this case, since different CPUs might have
	different reactions to call_rcu_lazy().

o	Others' ideas here!

							Thanx, Paul
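A hypothetical sketch of the third option above: flush a CPU's lazy queue
before that CPU is de-offloaded. Only rcu_lazy_pcp_ins and
lazy_rcu_flush_cpu() come from the series; the hook itself is invented for
illustration and would need to be wired into the actual de-offloading path
from Frederic's toggling work.

/* Hypothetical hook, invoked before CPU 'cpu' is de-offloaded. */
static void rcu_lazy_flush_before_deoffload(int cpu)
{
	struct rcu_lazy_pcp *rlp = per_cpu_ptr(&rcu_lazy_pcp_ins, cpu);

	/*
	 * Hand any pending lazy callbacks to the normal callback path so
	 * that none are stranded once this CPU is no longer offloaded.
	 */
	lazy_rcu_flush_cpu(rlp);
}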
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index c19dd5a2c05c..55ffc61beed1 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -16,6 +16,10 @@ enum { sysctl_hung_task_timeout_secs = 0 };
 
 extern unsigned int sysctl_sched_child_runs_first;
 
+extern unsigned int sysctl_rcu_lazy;
+extern unsigned int sysctl_rcu_lazy_batch;
+extern unsigned int sysctl_rcu_lazy_jiffies;
+
 enum sched_tunable_scaling {
 	SCHED_TUNABLESCALING_NONE,
 	SCHED_TUNABLESCALING_LOG,
diff --git a/kernel/rcu/lazy.c b/kernel/rcu/lazy.c
index 55e406cfc528..0af9fb67c92b 100644
--- a/kernel/rcu/lazy.c
+++ b/kernel/rcu/lazy.c
@@ -12,6 +12,10 @@
 // How much to wait before flushing?
 #define MAX_LAZY_JIFFIES	10000
 
+unsigned int sysctl_rcu_lazy_batch = MAX_LAZY_BATCH;
+unsigned int sysctl_rcu_lazy_jiffies = MAX_LAZY_JIFFIES;
+unsigned int sysctl_rcu_lazy = 1;
+
 // We cast lazy_rcu_head to rcu_head and back. This keeps the API simple while
 // allowing us to use lockless list node in the head. Also, we use BUILD_BUG_ON
 // later to ensure that rcu_head and lazy_rcu_head are of the same size.
@@ -49,6 +53,10 @@ void call_rcu_lazy(struct rcu_head *head_rcu, rcu_callback_t func)
 	struct lazy_rcu_head *head = (struct lazy_rcu_head *)head_rcu;
 	struct rcu_lazy_pcp *rlp;
 
+	if (!sysctl_rcu_lazy) {
+		return call_rcu(head_rcu, func);
+	}
+
 	preempt_disable();
 	rlp = this_cpu_ptr(&rcu_lazy_pcp_ins);
 	preempt_enable();
@@ -67,11 +75,11 @@ void call_rcu_lazy(struct rcu_head *head_rcu, rcu_callback_t func)
 	llist_add(&head->llist_node, &rlp->head);
 
 	// Flush queue if too big
-	if (atomic_inc_return(&rlp->count) >= MAX_LAZY_BATCH) {
+	if (atomic_inc_return(&rlp->count) >= sysctl_rcu_lazy_batch) {
 		lazy_rcu_flush_cpu(rlp);
 	} else {
 		if (!delayed_work_pending(&rlp->work)) {
-			schedule_delayed_work(&rlp->work, MAX_LAZY_JIFFIES);
+			schedule_delayed_work(&rlp->work, sysctl_rcu_lazy_jiffies);
 		}
 	}
 }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5ae443b2882e..2ba830ca71ec 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1659,6 +1659,29 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_RCU_LAZY
+	{
+		.procname	= "rcu_lazy",
+		.data		= &sysctl_rcu_lazy,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "rcu_lazy_batch",
+		.data		= &sysctl_rcu_lazy_batch,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "rcu_lazy_jiffies",
+		.data		= &sysctl_rcu_lazy_jiffies,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif
 #ifdef CONFIG_SCHEDSTATS
 	{
 		.procname	= "sched_schedstats",
Add sysctl knobs, just for easier debugging/testing, to tune the maximum
batch size and the maximum time to wait before a flush, and to turn the
feature off entirely.

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
 include/linux/sched/sysctl.h |  4 ++++
 kernel/rcu/lazy.c            | 12 ++++++++++--
 kernel/sysctl.c              | 23 +++++++++++++++++++++++
 3 files changed, 37 insertions(+), 2 deletions(-)