diff mbox series

[RFC,v1,14/14] DEBUG: Toggle rcu_lazy and tune at runtime

Message ID 20220512030442.2530552-15-joel@joelfernandes.org (mailing list archive)
State Superseded
Headers show
Series Implement call_rcu_lazy() and miscellaneous fixes | expand

Commit Message

Joel Fernandes May 12, 2022, 3:04 a.m. UTC
Add sysctl knobs just for easier debugging/testing, to tune the maximum
batch size, maximum time to wait before flush, and turning off the
feature entirely.

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
 include/linux/sched/sysctl.h |  4 ++++
 kernel/rcu/lazy.c            | 12 ++++++++++--
 kernel/sysctl.c              | 23 +++++++++++++++++++++++
 3 files changed, 37 insertions(+), 2 deletions(-)

Comments

Paul E. McKenney May 13, 2022, 12:16 a.m. UTC | #1
On Thu, May 12, 2022 at 03:04:42AM +0000, Joel Fernandes (Google) wrote:
> Add sysctl knobs just for easier debugging/testing, to tune the maximum
> batch size, maximum time to wait before flush, and turning off the
> feature entirely.
> 
> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>

This is good, and might also be needed longer term.

One thought below.

							Thanx, Paul

> ---
>  include/linux/sched/sysctl.h |  4 ++++
>  kernel/rcu/lazy.c            | 12 ++++++++++--
>  kernel/sysctl.c              | 23 +++++++++++++++++++++++
>  3 files changed, 37 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
> index c19dd5a2c05c..55ffc61beed1 100644
> --- a/include/linux/sched/sysctl.h
> +++ b/include/linux/sched/sysctl.h
> @@ -16,6 +16,10 @@ enum { sysctl_hung_task_timeout_secs = 0 };
>  
>  extern unsigned int sysctl_sched_child_runs_first;
>  
> +extern unsigned int sysctl_rcu_lazy;
> +extern unsigned int sysctl_rcu_lazy_batch;
> +extern unsigned int sysctl_rcu_lazy_jiffies;
> +
>  enum sched_tunable_scaling {
>  	SCHED_TUNABLESCALING_NONE,
>  	SCHED_TUNABLESCALING_LOG,
> diff --git a/kernel/rcu/lazy.c b/kernel/rcu/lazy.c
> index 55e406cfc528..0af9fb67c92b 100644
> --- a/kernel/rcu/lazy.c
> +++ b/kernel/rcu/lazy.c
> @@ -12,6 +12,10 @@
>  // How much to wait before flushing?
>  #define MAX_LAZY_JIFFIES	10000
>  
> +unsigned int sysctl_rcu_lazy_batch = MAX_LAZY_BATCH;
> +unsigned int sysctl_rcu_lazy_jiffies = MAX_LAZY_JIFFIES;
> +unsigned int sysctl_rcu_lazy = 1;
> +
>  // We cast lazy_rcu_head to rcu_head and back. This keeps the API simple while
>  // allowing us to use lockless list node in the head. Also, we use BUILD_BUG_ON
>  // later to ensure that rcu_head and lazy_rcu_head are of the same size.
> @@ -49,6 +53,10 @@ void call_rcu_lazy(struct rcu_head *head_rcu, rcu_callback_t func)
>  	struct lazy_rcu_head *head = (struct lazy_rcu_head *)head_rcu;
>  	struct rcu_lazy_pcp *rlp;
>  
> +	if (!sysctl_rcu_lazy) {

This is the place to check for early boot use.  Or, alternatively,
initialize sysctl_rcu_lazy to zero and set it to one once boot is far
enough along to allow all the pieces to work reasonably.

> +		return call_rcu(head_rcu, func);
> +	}
> +
>  	preempt_disable();
>          rlp = this_cpu_ptr(&rcu_lazy_pcp_ins);
>  	preempt_enable();
> @@ -67,11 +75,11 @@ void call_rcu_lazy(struct rcu_head *head_rcu, rcu_callback_t func)
>  	llist_add(&head->llist_node, &rlp->head);
>  
>  	// Flush queue if too big
> -	if (atomic_inc_return(&rlp->count) >= MAX_LAZY_BATCH) {
> +	if (atomic_inc_return(&rlp->count) >= sysctl_rcu_lazy_batch) {
>  		lazy_rcu_flush_cpu(rlp);
>  	} else {
>  		if (!delayed_work_pending(&rlp->work)) {
> -			schedule_delayed_work(&rlp->work, MAX_LAZY_JIFFIES);
> +			schedule_delayed_work(&rlp->work, sysctl_rcu_lazy_jiffies);
>  		}
>  	}
>  }
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 5ae443b2882e..2ba830ca71ec 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -1659,6 +1659,29 @@ static struct ctl_table kern_table[] = {
>  		.mode		= 0644,
>  		.proc_handler	= proc_dointvec,
>  	},
> +#ifdef CONFIG_RCU_LAZY
> +	{
> +		.procname	= "rcu_lazy",
> +		.data		= &sysctl_rcu_lazy,
> +		.maxlen		= sizeof(unsigned int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec,
> +	},
> +	{
> +		.procname	= "rcu_lazy_batch",
> +		.data		= &sysctl_rcu_lazy_batch,
> +		.maxlen		= sizeof(unsigned int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec,
> +	},
> +	{
> +		.procname	= "rcu_lazy_jiffies",
> +		.data		= &sysctl_rcu_lazy_jiffies,
> +		.maxlen		= sizeof(unsigned int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec,
> +	},
> +#endif
>  #ifdef CONFIG_SCHEDSTATS
>  	{
>  		.procname	= "sched_schedstats",
> -- 
> 2.36.0.550.gb090851708-goog
>
Joel Fernandes May 14, 2022, 2:38 p.m. UTC | #2
On Thu, May 12, 2022 at 05:16:22PM -0700, Paul E. McKenney wrote:
> On Thu, May 12, 2022 at 03:04:42AM +0000, Joel Fernandes (Google) wrote:
> > Add sysctl knobs just for easier debugging/testing, to tune the maximum
> > batch size, maximum time to wait before flush, and turning off the
> > feature entirely.
> > 
> > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> 
> This is good, and might also be needed longer term.
> 
> One thought below.
> 
> 							Thanx, Paul
> 
> > ---
> >  include/linux/sched/sysctl.h |  4 ++++
> >  kernel/rcu/lazy.c            | 12 ++++++++++--
> >  kernel/sysctl.c              | 23 +++++++++++++++++++++++
> >  3 files changed, 37 insertions(+), 2 deletions(-)
> > 
> > diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
> > index c19dd5a2c05c..55ffc61beed1 100644
> > --- a/include/linux/sched/sysctl.h
> > +++ b/include/linux/sched/sysctl.h
> > @@ -16,6 +16,10 @@ enum { sysctl_hung_task_timeout_secs = 0 };
> >  
> >  extern unsigned int sysctl_sched_child_runs_first;
> >  
> > +extern unsigned int sysctl_rcu_lazy;
> > +extern unsigned int sysctl_rcu_lazy_batch;
> > +extern unsigned int sysctl_rcu_lazy_jiffies;
> > +
> >  enum sched_tunable_scaling {
> >  	SCHED_TUNABLESCALING_NONE,
> >  	SCHED_TUNABLESCALING_LOG,
> > diff --git a/kernel/rcu/lazy.c b/kernel/rcu/lazy.c
> > index 55e406cfc528..0af9fb67c92b 100644
> > --- a/kernel/rcu/lazy.c
> > +++ b/kernel/rcu/lazy.c
> > @@ -12,6 +12,10 @@
> >  // How much to wait before flushing?
> >  #define MAX_LAZY_JIFFIES	10000
> >  
> > +unsigned int sysctl_rcu_lazy_batch = MAX_LAZY_BATCH;
> > +unsigned int sysctl_rcu_lazy_jiffies = MAX_LAZY_JIFFIES;
> > +unsigned int sysctl_rcu_lazy = 1;
> > +
> >  // We cast lazy_rcu_head to rcu_head and back. This keeps the API simple while
> >  // allowing us to use lockless list node in the head. Also, we use BUILD_BUG_ON
> >  // later to ensure that rcu_head and lazy_rcu_head are of the same size.
> > @@ -49,6 +53,10 @@ void call_rcu_lazy(struct rcu_head *head_rcu, rcu_callback_t func)
> >  	struct lazy_rcu_head *head = (struct lazy_rcu_head *)head_rcu;
> >  	struct rcu_lazy_pcp *rlp;
> >  
> > +	if (!sysctl_rcu_lazy) {
> 
> This is the place to check for early boot use.  Or, alternatively,
> initialize sysctl_rcu_lazy to zero and set it to one once boot is far
> enough along to allow all the pieces to work reasonably.

Sure, I was also thinking perhaps to set this to a static branch. That way we
don't have to pay the cost of the branch after it is setup on boot.

But I think as you mentioned on IRC, if we were to promote this to a
non-DEBUG patch, we would need to handle conditions similar to Frederic's
work on toggling offloading.

thanks,

 - Joel
Paul E. McKenney May 14, 2022, 4:21 p.m. UTC | #3
On Sat, May 14, 2022 at 02:38:53PM +0000, Joel Fernandes wrote:
> On Thu, May 12, 2022 at 05:16:22PM -0700, Paul E. McKenney wrote:
> > On Thu, May 12, 2022 at 03:04:42AM +0000, Joel Fernandes (Google) wrote:
> > > Add sysctl knobs just for easier debugging/testing, to tune the maximum
> > > batch size, maximum time to wait before flush, and turning off the
> > > feature entirely.
> > > 
> > > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > 
> > This is good, and might also be needed longer term.
> > 
> > One thought below.
> > 
> > 							Thanx, Paul
> > 
> > > ---
> > >  include/linux/sched/sysctl.h |  4 ++++
> > >  kernel/rcu/lazy.c            | 12 ++++++++++--
> > >  kernel/sysctl.c              | 23 +++++++++++++++++++++++
> > >  3 files changed, 37 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
> > > index c19dd5a2c05c..55ffc61beed1 100644
> > > --- a/include/linux/sched/sysctl.h
> > > +++ b/include/linux/sched/sysctl.h
> > > @@ -16,6 +16,10 @@ enum { sysctl_hung_task_timeout_secs = 0 };
> > >  
> > >  extern unsigned int sysctl_sched_child_runs_first;
> > >  
> > > +extern unsigned int sysctl_rcu_lazy;
> > > +extern unsigned int sysctl_rcu_lazy_batch;
> > > +extern unsigned int sysctl_rcu_lazy_jiffies;
> > > +
> > >  enum sched_tunable_scaling {
> > >  	SCHED_TUNABLESCALING_NONE,
> > >  	SCHED_TUNABLESCALING_LOG,
> > > diff --git a/kernel/rcu/lazy.c b/kernel/rcu/lazy.c
> > > index 55e406cfc528..0af9fb67c92b 100644
> > > --- a/kernel/rcu/lazy.c
> > > +++ b/kernel/rcu/lazy.c
> > > @@ -12,6 +12,10 @@
> > >  // How much to wait before flushing?
> > >  #define MAX_LAZY_JIFFIES	10000
> > >  
> > > +unsigned int sysctl_rcu_lazy_batch = MAX_LAZY_BATCH;
> > > +unsigned int sysctl_rcu_lazy_jiffies = MAX_LAZY_JIFFIES;
> > > +unsigned int sysctl_rcu_lazy = 1;
> > > +
> > >  // We cast lazy_rcu_head to rcu_head and back. This keeps the API simple while
> > >  // allowing us to use lockless list node in the head. Also, we use BUILD_BUG_ON
> > >  // later to ensure that rcu_head and lazy_rcu_head are of the same size.
> > > @@ -49,6 +53,10 @@ void call_rcu_lazy(struct rcu_head *head_rcu, rcu_callback_t func)
> > >  	struct lazy_rcu_head *head = (struct lazy_rcu_head *)head_rcu;
> > >  	struct rcu_lazy_pcp *rlp;
> > >  
> > > +	if (!sysctl_rcu_lazy) {
> > 
> > This is the place to check for early boot use.  Or, alternatively,
> > initialize sysctl_rcu_lazy to zero and set it to one once boot is far
> > enough along to allow all the pieces to work reasonably.
> 
> Sure, I was also thinking perhaps to set this to a static branch. That way we
> don't have to pay the cost of the branch after it is setup on boot.

A static branch would be fine, though it would be good to demonstrate
whether or not it actually provided visible benefits.  Also, please see
below.

> But I think as you mentioned on IRC, if we were to promote this to a
> non-DEBUG patch, we would need to handle conditions similar to Frederic's
> work on toggling offloading.

There are several ways to approach this:

o	Make call_rcu_lazy() also work for non-offloaded CPUs.
	It would clearly need to be default-disabled for this case,
	and careful thought would be required on the distro situation,
	some of which build with CONFIG_NO_HZ_FULL=y, but whose users
	almost all refrain from providing a nohz_full boot parameter.

	In short, adding significant risk to the common case for the
	benefit of an uncommon case is not usually a particularly
	good idea.  ;-)

	And yes, proper segmentation of the cases is important.  As in
	the proper question here is "What happens to users not on ChromeOS
	or Android?"  So the fact that the common case probably is
	Android is not relevant to this line of thought.

o	Make call_rcu_lazy() work on a per-CPU basis, so that a lazy
	CPU cannot be de-offloaded.

o	Make call_rcu_lazy() work on a per-CPU basis, but make flushing
	the lazy queue part of the de-offloading process.
	Careful with the static branch in this case, since different
	CPUs might have different reactions to call_rcu_lazy().

o	Others' ideas here!

							Thanx, Paul
diff mbox series

Patch

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index c19dd5a2c05c..55ffc61beed1 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -16,6 +16,10 @@  enum { sysctl_hung_task_timeout_secs = 0 };
 
 extern unsigned int sysctl_sched_child_runs_first;
 
+extern unsigned int sysctl_rcu_lazy;
+extern unsigned int sysctl_rcu_lazy_batch;
+extern unsigned int sysctl_rcu_lazy_jiffies;
+
 enum sched_tunable_scaling {
 	SCHED_TUNABLESCALING_NONE,
 	SCHED_TUNABLESCALING_LOG,
diff --git a/kernel/rcu/lazy.c b/kernel/rcu/lazy.c
index 55e406cfc528..0af9fb67c92b 100644
--- a/kernel/rcu/lazy.c
+++ b/kernel/rcu/lazy.c
@@ -12,6 +12,10 @@ 
 // How much to wait before flushing?
 #define MAX_LAZY_JIFFIES	10000
 
+unsigned int sysctl_rcu_lazy_batch = MAX_LAZY_BATCH;
+unsigned int sysctl_rcu_lazy_jiffies = MAX_LAZY_JIFFIES;
+unsigned int sysctl_rcu_lazy = 1;
+
 // We cast lazy_rcu_head to rcu_head and back. This keeps the API simple while
 // allowing us to use lockless list node in the head. Also, we use BUILD_BUG_ON
 // later to ensure that rcu_head and lazy_rcu_head are of the same size.
@@ -49,6 +53,10 @@  void call_rcu_lazy(struct rcu_head *head_rcu, rcu_callback_t func)
 	struct lazy_rcu_head *head = (struct lazy_rcu_head *)head_rcu;
 	struct rcu_lazy_pcp *rlp;
 
+	if (!sysctl_rcu_lazy) {
+		return call_rcu(head_rcu, func);
+	}
+
 	preempt_disable();
         rlp = this_cpu_ptr(&rcu_lazy_pcp_ins);
 	preempt_enable();
@@ -67,11 +75,11 @@  void call_rcu_lazy(struct rcu_head *head_rcu, rcu_callback_t func)
 	llist_add(&head->llist_node, &rlp->head);
 
 	// Flush queue if too big
-	if (atomic_inc_return(&rlp->count) >= MAX_LAZY_BATCH) {
+	if (atomic_inc_return(&rlp->count) >= sysctl_rcu_lazy_batch) {
 		lazy_rcu_flush_cpu(rlp);
 	} else {
 		if (!delayed_work_pending(&rlp->work)) {
-			schedule_delayed_work(&rlp->work, MAX_LAZY_JIFFIES);
+			schedule_delayed_work(&rlp->work, sysctl_rcu_lazy_jiffies);
 		}
 	}
 }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5ae443b2882e..2ba830ca71ec 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1659,6 +1659,29 @@  static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_RCU_LAZY
+	{
+		.procname	= "rcu_lazy",
+		.data		= &sysctl_rcu_lazy,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "rcu_lazy_batch",
+		.data		= &sysctl_rcu_lazy_batch,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "rcu_lazy_jiffies",
+		.data		= &sysctl_rcu_lazy_jiffies,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif
 #ifdef CONFIG_SCHEDSTATS
 	{
 		.procname	= "sched_schedstats",