
[v5,06/18] rcu: Introduce call_rcu_lazy() API implementation

Message ID 20220901221720.1105021-7-joel@joelfernandes.org (mailing list archive)
State New, archived
Series Implement call_rcu_lazy() and miscellaneous fixes

Commit Message

Joel Fernandes Sept. 1, 2022, 10:17 p.m. UTC
Implement timer-based RCU lazy callback batching. The batch is flushed
whenever a certain amount of time has passed, or the batch on a
particular CPU grows too big. Also memory pressure will flush it in a
future patch.

To handle several corner cases automagically (such as rcu_barrier() and
hotplug), we re-use bypass lists to handle lazy CBs. The bypass list
length has the lazy CB length included in it. A separate lazy CB length
counter is also introduced to keep track of the number of lazy CBs.

Suggested-by: Paul McKenney <paulmck@kernel.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
 include/linux/rcupdate.h   |   6 ++
 kernel/rcu/Kconfig         |   8 ++
 kernel/rcu/rcu.h           |  11 +++
 kernel/rcu/rcu_segcblist.c |   2 +-
 kernel/rcu/tree.c          | 130 +++++++++++++++---------
 kernel/rcu/tree.h          |  13 ++-
 kernel/rcu/tree_nocb.h     | 198 ++++++++++++++++++++++++++++++-------
 7 files changed, 280 insertions(+), 88 deletions(-)
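
For reference, a minimal usage sketch of the API this patch adds. The struct,
field, and callback names below are made up for illustration and are not from
the series; the point is only that call_rcu_lazy() is meant as a drop-in for
call_rcu() when the callback (typically a kfree()-style reclaim) is not
latency-sensitive:

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct foo {
		struct rcu_head rh;
		int data;
	};

	/* Invoked some time after a grace period; frees the object. */
	static void foo_reclaim(struct rcu_head *rhp)
	{
		kfree(container_of(rhp, struct foo, rh));
	}

	static void foo_release(struct foo *fp)
	{
		/*
		 * Freeing is not urgent, so let RCU batch this callback with
		 * other lazy ones and flush after a delay, under memory
		 * pressure, or when the per-CPU batch grows too big.
		 */
		call_rcu_lazy(&fp->rh, foo_reclaim);
	}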

Comments

Frederic Weisbecker Sept. 2, 2022, 3:21 p.m. UTC | #1
On Thu, Sep 01, 2022 at 10:17:08PM +0000, Joel Fernandes (Google) wrote:
> Implement timer-based RCU lazy callback batching. The batch is flushed
> whenever a certain amount of time has passed, or the batch on a
> particular CPU grows too big. Also memory pressure will flush it in a
> future patch.
> 
> To handle several corner cases automagically (such as rcu_barrier() and
> hotplug), we re-use bypass lists to handle lazy CBs. The bypass list
> length has the lazy CB length included in it. A separate lazy CB length
> counter is also introduced to keep track of the number of lazy CBs.
> 
> Suggested-by: Paul McKenney <paulmck@kernel.org>
> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> ---
>  include/linux/rcupdate.h   |   6 ++
>  kernel/rcu/Kconfig         |   8 ++
>  kernel/rcu/rcu.h           |  11 +++
>  kernel/rcu/rcu_segcblist.c |   2 +-
>  kernel/rcu/tree.c          | 130 +++++++++++++++---------
>  kernel/rcu/tree.h          |  13 ++-
>  kernel/rcu/tree_nocb.h     | 198 ++++++++++++++++++++++++++++++-------
>  7 files changed, 280 insertions(+), 88 deletions(-)
> 
> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> index 08605ce7379d..82e8a07e0856 100644
> --- a/include/linux/rcupdate.h
> +++ b/include/linux/rcupdate.h
> @@ -108,6 +108,12 @@ static inline int rcu_preempt_depth(void)
>  
>  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
>  
> +#ifdef CONFIG_RCU_LAZY
> +void call_rcu_lazy(struct rcu_head *head, rcu_callback_t func);
> +#else
> +#define call_rcu_lazy(head, func) call_rcu(head, func)

What about an inline instead?

> +#endif
> +
>  /* Internal to kernel */
>  void rcu_init(void);
>  extern int rcu_scheduler_active;
> diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
> index d471d22a5e21..3128d01427cb 100644
> --- a/kernel/rcu/Kconfig
> +++ b/kernel/rcu/Kconfig
> @@ -311,4 +311,12 @@ config TASKS_TRACE_RCU_READ_MB
>  	  Say N here if you hate read-side memory barriers.
>  	  Take the default if you are unsure.
>  
> +config RCU_LAZY
> +	bool "RCU callback lazy invocation functionality"
> +	depends on RCU_NOCB_CPU
> +	default n
> +	help
> +	  To save power, batch RCU callbacks and flush after delay, memory
> +	  pressure or callback list growing too big.
> +
>  endmenu # "RCU Subsystem"
> diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
> index be5979da07f5..94675f14efe8 100644
> --- a/kernel/rcu/rcu.h
> +++ b/kernel/rcu/rcu.h
> @@ -474,6 +474,14 @@ enum rcutorture_type {
>  	INVALID_RCU_FLAVOR
>  };
>  
> +#if defined(CONFIG_RCU_LAZY)
> +unsigned long rcu_lazy_get_jiffies_till_flush(void);
> +void rcu_lazy_set_jiffies_till_flush(unsigned long j);
> +#else
> +static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; }
> +static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { }
> +#endif
> +
>  #if defined(CONFIG_TREE_RCU)
>  void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
>  			    unsigned long *gp_seq);
> @@ -483,6 +491,8 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
>  			       unsigned long c_old,
>  			       unsigned long c);
>  void rcu_gp_set_torture_wait(int duration);
> +void rcu_force_call_rcu_to_lazy(bool force);
> +
>  #else
>  static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
>  					  int *flags, unsigned long *gp_seq)
> @@ -501,6 +511,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
>  	do { } while (0)
>  #endif
>  static inline void rcu_gp_set_torture_wait(int duration) { }
> +static inline void rcu_force_call_rcu_to_lazy(bool force) { }
>  #endif
>  
>  #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
> diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
> index c54ea2b6a36b..55b50e592986 100644
> --- a/kernel/rcu/rcu_segcblist.c
> +++ b/kernel/rcu/rcu_segcblist.c
> @@ -38,7 +38,7 @@ void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp)
>   * element of the second rcu_cblist structure, but ensuring that the second
>   * rcu_cblist structure, if initially non-empty, always appears non-empty
>   * throughout the process.  If rdp is NULL, the second rcu_cblist structure
> - * is instead initialized to empty.
> + * is instead initialized to empty. Also account for lazy_len for lazy CBs.

Leftover?

> @@ -3904,7 +3943,8 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
>  	rdp->barrier_head.func = rcu_barrier_callback;
>  	debug_rcu_head_queue(&rdp->barrier_head);
>  	rcu_nocb_lock(rdp);
> -	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
> +	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false,
> +		     /* wake gp thread */ true));

It's a bad sign when you need to start commenting your boolean parameters :)
Perhaps use a single two-bit flag instead of two booleans, for readability?

Also, that's a subtle change whose purpose isn't explained. It means that
rcu_barrier_entrain() used to wait for the bypass timer in the worst case,
but now we force rcuog into it immediately. Should that be a separate change?

> diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
> index 31068dd31315..7e97a7b6e046 100644
> --- a/kernel/rcu/tree_nocb.h
> +++ b/kernel/rcu/tree_nocb.h
> @@ -256,6 +256,31 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
>  	return __wake_nocb_gp(rdp_gp, rdp, force, flags);
>  }
>  
> +/*
> + * LAZY_FLUSH_JIFFIES decides the maximum amount of time that
> + * can elapse before lazy callbacks are flushed. Lazy callbacks
> + * could be flushed much earlier for a number of other reasons
> + * however, LAZY_FLUSH_JIFFIES will ensure no lazy callbacks are
> + * left unsubmitted to RCU after those many jiffies.
> + */
> +#define LAZY_FLUSH_JIFFIES (10 * HZ)
> +unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES;

It seems this can be made static?

> @@ -265,23 +290,39 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
>  {
>  	unsigned long flags;
>  	struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
> +	unsigned long mod_jif = 0;
>  
>  	raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
>  
>  	/*
> -	 * Bypass wakeup overrides previous deferments. In case
> -	 * of callback storm, no need to wake up too early.
> +	 * Bypass and lazy wakeup overrides previous deferments. In case of
> +	 * callback storm, no need to wake up too early.
>  	 */
> -	if (waketype == RCU_NOCB_WAKE_BYPASS) {
> -		mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
> -		WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
> -	} else {
> +	switch (waketype) {
> +	case RCU_NOCB_WAKE_LAZY:
> +		if (rdp->nocb_defer_wakeup != RCU_NOCB_WAKE_LAZY)
> +			mod_jif = jiffies_till_flush;
> +		break;

Ah, so this timer overrides all the others, and it's a several-seconds
timer. Then I'll have a related concern on nocb_gp_wait().

> +
> +	case RCU_NOCB_WAKE_BYPASS:
> +		mod_jif = 2;
> +		break;
> +
> +	case RCU_NOCB_WAKE:
> +	case RCU_NOCB_WAKE_FORCE:
> +		// For these, make it wake up the soonest if we
> +		// were in a bypass or lazy sleep before.
>  		if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE)
> -			mod_timer(&rdp_gp->nocb_timer, jiffies + 1);
> -		if (rdp_gp->nocb_defer_wakeup < waketype)
> -			WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
> +			mod_jif = 1;
> +		break;
>  	}
>  
> +	if (mod_jif)
> +		mod_timer(&rdp_gp->nocb_timer, jiffies + mod_jif);
> +
> +	if (rdp_gp->nocb_defer_wakeup < waketype)
> +		WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);

So RCU_NOCB_WAKE_BYPASS and RCU_NOCB_WAKE_LAZY don't override the timer state
anymore? Looks like something is missing.

> +
>  	raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
>  
>  	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
[...]
> @@ -705,12 +816,21 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
>  	my_rdp->nocb_gp_gp = needwait_gp;
>  	my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
>  
> -	if (bypass && !rcu_nocb_poll) {
> -		// At least one child with non-empty ->nocb_bypass, so set
> -		// timer in order to avoid stranding its callbacks.
> -		wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
> -				   TPS("WakeBypassIsDeferred"));
> +	// At least one child with non-empty ->nocb_bypass, so set
> +	// timer in order to avoid stranding its callbacks.
> +	if (!rcu_nocb_poll) {
> +		// If bypass list only has lazy CBs. Add a deferred
> +		// lazy wake up.
> +		if (lazy && !bypass) {
> +			wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_LAZY,
> +					TPS("WakeLazyIsDeferred"));

What if:

1) rdp(1) has lazy callbacks
2) all other rdp's have no callback at all
3) nocb_gp_wait() runs through all rdp's, everything is handled, except for
   these lazy callbacks
4) It reaches the above path, ready to arm the RCU_NOCB_WAKE_LAZY timer,
   but it hasn't yet called wake_nocb_gp_defer()
5) Oh but rdp(2) queues a non-lazy callback. Interrupts are disabled, so it defers
   the wake up to nocb_gp_wait(), arming the timer in RCU_NOCB_WAKE mode.
6) nocb_gp_wait() finally calls wake_nocb_gp_defer() and overrides the timeout
   to several seconds ahead.
7) No more callbacks queued, the non-lazy callback will have to wait several
   seconds to complete.

Or did I miss something? Note that the race exists with RCU_NOCB_WAKE_BYPASS
but it's only about one jiffy delay, not seconds.


(further review later).

Thanks.

> +		// Otherwise add a deferred bypass wake up.
> +		} else if (bypass) {
> +			wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
> +					TPS("WakeBypassIsDeferred"));
> +		}
>  	}
> +
>  	if (rcu_nocb_poll) {
>  		/* Polling, so trace if first poll in the series. */
>  		if (gotcbs)
> @@ -1036,7 +1156,7 @@ static long rcu_nocb_rdp_deoffload(void *arg)
>  	 * return false, which means that future calls to rcu_nocb_try_bypass()
>  	 * will refuse to put anything into the bypass.
>  	 */
> -	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
> +	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false, false));
>  	/*
>  	 * Start with invoking rcu_core() early. This way if the current thread
>  	 * happens to preempt an ongoing call to rcu_core() in the middle,
> @@ -1290,6 +1410,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
>  	raw_spin_lock_init(&rdp->nocb_gp_lock);
>  	timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
>  	rcu_cblist_init(&rdp->nocb_bypass);
> +	WRITE_ONCE(rdp->lazy_len, 0);
>  	mutex_init(&rdp->nocb_gp_kthread_mutex);
>  }
>  
> @@ -1571,13 +1692,14 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
>  }
>  
>  static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
> -				  unsigned long j)
> +				  unsigned long j, bool lazy, bool wakegp)
>  {
>  	return true;
>  }
>  
>  static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
> -				bool *was_alldone, unsigned long flags)
> +				bool *was_alldone, unsigned long flags,
> +				bool lazy)
>  {
>  	return false;
>  }
> -- 
> 2.37.2.789.g6183377224-goog
>
Joel Fernandes Sept. 2, 2022, 11:09 p.m. UTC | #2
Hi Frederic,

On 9/2/2022 11:21 AM, Frederic Weisbecker wrote:
> On Thu, Sep 01, 2022 at 10:17:08PM +0000, Joel Fernandes (Google) wrote:
>> Implement timer-based RCU lazy callback batching. The batch is flushed
>> whenever a certain amount of time has passed, or the batch on a
>> particular CPU grows too big. Also memory pressure will flush it in a
>> future patch.
>>
>> To handle several corner cases automagically (such as rcu_barrier() and
>> hotplug), we re-use bypass lists to handle lazy CBs. The bypass list
>> length has the lazy CB length included in it. A separate lazy CB length
>> counter is also introduced to keep track of the number of lazy CBs.
>>
>> Suggested-by: Paul McKenney <paulmck@kernel.org>
>> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
>> ---
>>  include/linux/rcupdate.h   |   6 ++
>>  kernel/rcu/Kconfig         |   8 ++
>>  kernel/rcu/rcu.h           |  11 +++
>>  kernel/rcu/rcu_segcblist.c |   2 +-
>>  kernel/rcu/tree.c          | 130 +++++++++++++++---------
>>  kernel/rcu/tree.h          |  13 ++-
>>  kernel/rcu/tree_nocb.h     | 198 ++++++++++++++++++++++++++++++-------
>>  7 files changed, 280 insertions(+), 88 deletions(-)
>>
>> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
>> index 08605ce7379d..82e8a07e0856 100644
>> --- a/include/linux/rcupdate.h
>> +++ b/include/linux/rcupdate.h
>> @@ -108,6 +108,12 @@ static inline int rcu_preempt_depth(void)
>>  
>>  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
>>  
>> +#ifdef CONFIG_RCU_LAZY
>> +void call_rcu_lazy(struct rcu_head *head, rcu_callback_t func);
>> +#else
>> +#define call_rcu_lazy(head, func) call_rcu(head, func)
> 
> Why about an inline instead?

Sure, I can do that.
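
Presumably something like the following for the !CONFIG_RCU_LAZY case, i.e. a
static inline wrapper instead of the macro (just a sketch of Frederic's
suggestion, not the respun patch):

	#ifdef CONFIG_RCU_LAZY
	void call_rcu_lazy(struct rcu_head *head, rcu_callback_t func);
	#else
	static inline void call_rcu_lazy(struct rcu_head *head,
					 rcu_callback_t func)
	{
		call_rcu(head, func);
	}
	#endif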

>> +#endif
>> +
>>  /* Internal to kernel */
>>  void rcu_init(void);
>>  extern int rcu_scheduler_active;
>> diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
>> index d471d22a5e21..3128d01427cb 100644
>> --- a/kernel/rcu/Kconfig
>> +++ b/kernel/rcu/Kconfig
>> @@ -311,4 +311,12 @@ config TASKS_TRACE_RCU_READ_MB
>>  	  Say N here if you hate read-side memory barriers.
>>  	  Take the default if you are unsure.
>>  
>> +config RCU_LAZY
>> +	bool "RCU callback lazy invocation functionality"
>> +	depends on RCU_NOCB_CPU
>> +	default n
>> +	help
>> +	  To save power, batch RCU callbacks and flush after delay, memory
>> +	  pressure or callback list growing too big.
>> +
>>  endmenu # "RCU Subsystem"
>> diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
>> index be5979da07f5..94675f14efe8 100644
>> --- a/kernel/rcu/rcu.h
>> +++ b/kernel/rcu/rcu.h
>> @@ -474,6 +474,14 @@ enum rcutorture_type {
>>  	INVALID_RCU_FLAVOR
>>  };
>>  
>> +#if defined(CONFIG_RCU_LAZY)
>> +unsigned long rcu_lazy_get_jiffies_till_flush(void);
>> +void rcu_lazy_set_jiffies_till_flush(unsigned long j);
>> +#else
>> +static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; }
>> +static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { }
>> +#endif
>> +
>>  #if defined(CONFIG_TREE_RCU)
>>  void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
>>  			    unsigned long *gp_seq);
>> @@ -483,6 +491,8 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
>>  			       unsigned long c_old,
>>  			       unsigned long c);
>>  void rcu_gp_set_torture_wait(int duration);
>> +void rcu_force_call_rcu_to_lazy(bool force);
>> +
>>  #else
>>  static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
>>  					  int *flags, unsigned long *gp_seq)
>> @@ -501,6 +511,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
>>  	do { } while (0)
>>  #endif
>>  static inline void rcu_gp_set_torture_wait(int duration) { }
>> +static inline void rcu_force_call_rcu_to_lazy(bool force) { }
>>  #endif
>>  
>>  #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
>> diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
>> index c54ea2b6a36b..55b50e592986 100644
>> --- a/kernel/rcu/rcu_segcblist.c
>> +++ b/kernel/rcu/rcu_segcblist.c
>> @@ -38,7 +38,7 @@ void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp)
>>   * element of the second rcu_cblist structure, but ensuring that the second
>>   * rcu_cblist structure, if initially non-empty, always appears non-empty
>>   * throughout the process.  If rdp is NULL, the second rcu_cblist structure
>> - * is instead initialized to empty.
>> + * is instead initialized to empty. Also account for lazy_len for lazy CBs.
> 
> Leftover?

Will fix, thanks.

>> @@ -3904,7 +3943,8 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
>>  	rdp->barrier_head.func = rcu_barrier_callback;
>>  	debug_rcu_head_queue(&rdp->barrier_head);
>>  	rcu_nocb_lock(rdp);
>> -	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
>> +	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false,
>> +		     /* wake gp thread */ true));
> 
> It's a bad sign when you need to start commenting your boolean parameters :)
> Perhaps use a single two-bit flag instead of two booleans, for readability?

That's fair. What do you mean by a 2-bit flag? Are you saying we encode the last 2
parameters to flush bypass in a u*?
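
One possible reading of that suggestion, as a sketch (the FLUSH_BP_* names are
hypothetical and not from this series): replace the two booleans with a single
flags argument so the call sites document themselves:

	/* Hypothetical flag bits for rcu_nocb_flush_bypass(). */
	#define FLUSH_BP_NONE	0x0
	#define FLUSH_BP_LAZY	0x1	/* The new CB being queued is lazy. */
	#define FLUSH_BP_WAKE	0x2	/* Wake the rcuog kthread after the flush. */

	static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
					  unsigned long j, unsigned long flush_flags);

	/* The rcu_barrier_entrain() call site would then read: */
	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, FLUSH_BP_WAKE));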

> Also that's a subtle change which purpose isn't explained. It means that
> rcu_barrier_entrain() used to wait for the bypass timer in the worst case
> but now we force rcuog into it immediately. Should that be a separate change?

It could be split, but it is laziness that amplifies the issue, so I thought I'd
keep it in the same patch. I don't mind one way or the other.

>> diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
>> index 31068dd31315..7e97a7b6e046 100644
>> --- a/kernel/rcu/tree_nocb.h
>> +++ b/kernel/rcu/tree_nocb.h
>> @@ -256,6 +256,31 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
>>  	return __wake_nocb_gp(rdp_gp, rdp, force, flags);
>>  }
>>  
>> +/*
>> + * LAZY_FLUSH_JIFFIES decides the maximum amount of time that
>> + * can elapse before lazy callbacks are flushed. Lazy callbacks
>> + * could be flushed much earlier for a number of other reasons
>> + * however, LAZY_FLUSH_JIFFIES will ensure no lazy callbacks are
>> + * left unsubmitted to RCU after those many jiffies.
>> + */
>> +#define LAZY_FLUSH_JIFFIES (10 * HZ)
>> +unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES;
> 
> It seems this can be made static?

Yes, will do.
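
That is, presumably just:

	static unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES;
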
>> @@ -265,23 +290,39 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
>>  {
>>  	unsigned long flags;
>>  	struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
>> +	unsigned long mod_jif = 0;
>>  
>>  	raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
>>  
>>  	/*
>> -	 * Bypass wakeup overrides previous deferments. In case
>> -	 * of callback storm, no need to wake up too early.
>> +	 * Bypass and lazy wakeup overrides previous deferments. In case of
>> +	 * callback storm, no need to wake up too early.
>>  	 */
>> -	if (waketype == RCU_NOCB_WAKE_BYPASS) {
>> -		mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
>> -		WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
>> -	} else {
>> +	switch (waketype) {
>> +	case RCU_NOCB_WAKE_LAZY:
>> +		if (rdp->nocb_defer_wakeup != RCU_NOCB_WAKE_LAZY)
>> +			mod_jif = jiffies_till_flush;
>> +		break;
> 
> Ah so this timer override all the others, and it's a several seconds
> timers. Then I'll have a related concern on nocb_gp_wait().

Ok I replied below.

>> +	case RCU_NOCB_WAKE_BYPASS:
>> +		mod_jif = 2;
>> +		break;
>> +
>> +	case RCU_NOCB_WAKE:
>> +	case RCU_NOCB_WAKE_FORCE:
>> +		// For these, make it wake up the soonest if we
>> +		// were in a bypass or lazy sleep before.
>>  		if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE)
>> -			mod_timer(&rdp_gp->nocb_timer, jiffies + 1);
>> -		if (rdp_gp->nocb_defer_wakeup < waketype)
>> -			WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
>> +			mod_jif = 1;
>> +		break;
>>  	}
>>  
>> +	if (mod_jif)
>> +		mod_timer(&rdp_gp->nocb_timer, jiffies + mod_jif);
>> +
>> +	if (rdp_gp->nocb_defer_wakeup < waketype)
>> +		WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
> 
> So RCU_NOCB_WAKE_BYPASS and RCU_NOCB_WAKE_LAZY don't override the timer state
> anymore? Looks like something is missing.

My goal was to make sure that a NOCB_WAKE_LAZY wake keeps the timer lazy. If I
don't do this, then when the CPU enters idle, it will immediately do a wake up via
this call:

	rcu_nocb_need_deferred_wakeup(rdp_gp, RCU_NOCB_WAKE)

That was almost always causing lazy CBs to become non-lazy, thus negating all the
benefits.

Note that bypass also has the same issue, where a bypass CB will not wait for the
intended bypass duration. To make it consistent with lazy, I made bypass also
not override nocb_defer_wakeup.

I agree it's not pretty, but it works, and I could not find any code path where it
does not. That said, I am open to ideas for changing this, and perhaps some of
these unneeded delays with bypass CBs can be split out into separate patches.

Regarding your point about the nocb_defer_wakeup state diverging from the timer
programming, that already happens here in the current code:

 283        } else {
 284                if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE)
 285                        mod_timer(&rdp_gp->nocb_timer, jiffies + 1);
 286                if (rdp_gp->nocb_defer_wakeup < waketype)
 287                        WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
 288        }

>> +
>>  	raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
>>  
>>  	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
> [...]
>> @@ -705,12 +816,21 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
>>  	my_rdp->nocb_gp_gp = needwait_gp;
>>  	my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
>>  
>> -	if (bypass && !rcu_nocb_poll) {
>> -		// At least one child with non-empty ->nocb_bypass, so set
>> -		// timer in order to avoid stranding its callbacks.
>> -		wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
>> -				   TPS("WakeBypassIsDeferred"));
>> +	// At least one child with non-empty ->nocb_bypass, so set
>> +	// timer in order to avoid stranding its callbacks.
>> +	if (!rcu_nocb_poll) {
>> +		// If bypass list only has lazy CBs. Add a deferred
>> +		// lazy wake up.
>> +		if (lazy && !bypass) {
>> +			wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_LAZY,
>> +					TPS("WakeLazyIsDeferred"));
> 
> What if:
> 
> 1) rdp(1) has lazy callbacks
> 2) all other rdp's have no callback at all
> 3) nocb_gp_wait() runs through all rdp's, everything is handled, except for
>    these lazy callbacks
> 4) It reaches the above path, ready to arm the RCU_NOCB_WAKE_LAZY timer,
>    but it hasn't yet called wake_nocb_gp_defer()
> 5) Oh but rdp(2) queues a non-lazy callback. interrupts are disabled so it defers
>    the wake up to nocb_gp_wait() with arming the timer in RCU_NOCB_WAKE.
> 6) nocb_gp_wait() finally calls wake_nocb_gp_defer() and override the timeout
>    to several seconds ahead.
> 7) No more callbacks queued, the non-lazy callback will have to wait several
>    seconds to complete.
> 
> Or did I miss something?

In theory, I can see this being an issue. In practice, I have not seen it happen.
In my view, the nocb GP thread should not go to sleep in the first place if
there are any non-bypass CBs being queued. If it does, then that seems like an
optimization-related bug.

That said, we could perhaps make wake_nocb_gp_defer() more robust by making it not
overwrite the timer if the requested wake-type is weaker than RCU_NOCB_WAKE;
however, that should not cause the going-into-idle issue I pointed out. Whether the
idle-time issue would then happen, I have no idea. But in theory, would that address
your concern above?
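
Roughly, one way to read that tweak against the hunk quoted above (an untested
sketch, not part of the posted patch): only push the timer out for a weak
(lazy/bypass) wake when no stronger deferred wakeup is already armed:

	if (mod_jif && (waketype >= RCU_NOCB_WAKE ||
			rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE))
		mod_timer(&rdp_gp->nocb_timer, jiffies + mod_jif);

That way a pending RCU_NOCB_WAKE deferral is never stretched out to the lazy
timeout by a later RCU_NOCB_WAKE_LAZY or RCU_NOCB_WAKE_BYPASS request.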

> Note that the race exists with RCU_NOCB_WAKE_BYPASS
> but it's only about one jiffy delay, not seconds.

Well, 2 jiffies. But yeah.

Thanks, so far I do not see anything that cannot be fixed on top of this patch,
but you raised some good points. Maybe we ought to rewrite the idle path some
other way so that it does not disturb lazy CBs, or something (while keeping the
timer state consistent with the programming of the timer in wake_nocb_gp_defer()).

> (further review later).

Ok thanks, looking forward to them.


- Joel




> 
> Thanks.
> 
>> +		// Otherwise add a deferred bypass wake up.
>> +		} else if (bypass) {
>> +			wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
>> +					TPS("WakeBypassIsDeferred"));
>> +		}
>>  	}
>> +
>>  	if (rcu_nocb_poll) {
>>  		/* Polling, so trace if first poll in the series. */
>>  		if (gotcbs)
>> @@ -1036,7 +1156,7 @@ static long rcu_nocb_rdp_deoffload(void *arg)
>>  	 * return false, which means that future calls to rcu_nocb_try_bypass()
>>  	 * will refuse to put anything into the bypass.
>>  	 */
>> -	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
>> +	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false, false));
>>  	/*
>>  	 * Start with invoking rcu_core() early. This way if the current thread
>>  	 * happens to preempt an ongoing call to rcu_core() in the middle,
>> @@ -1290,6 +1410,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
>>  	raw_spin_lock_init(&rdp->nocb_gp_lock);
>>  	timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
>>  	rcu_cblist_init(&rdp->nocb_bypass);
>> +	WRITE_ONCE(rdp->lazy_len, 0);
>>  	mutex_init(&rdp->nocb_gp_kthread_mutex);
>>  }
>>  
>> @@ -1571,13 +1692,14 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
>>  }
>>  
>>  static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>> -				  unsigned long j)
>> +				  unsigned long j, bool lazy, bool wakegp)
>>  {
>>  	return true;
>>  }
>>  
>>  static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>> -				bool *was_alldone, unsigned long flags)
>> +				bool *was_alldone, unsigned long flags,
>> +				bool lazy)
>>  {
>>  	return false;
>>  }
>> -- 
>> 2.37.2.789.g6183377224-goog
>>
Paul E. McKenney Sept. 3, 2022, 2:03 p.m. UTC | #3
On Thu, Sep 01, 2022 at 10:17:08PM +0000, Joel Fernandes (Google) wrote:
> Implement timer-based RCU lazy callback batching. The batch is flushed
> whenever a certain amount of time has passed, or the batch on a
> particular CPU grows too big. Also memory pressure will flush it in a
> future patch.
> 
> To handle several corner cases automagically (such as rcu_barrier() and
> hotplug), we re-use bypass lists to handle lazy CBs. The bypass list
> length has the lazy CB length included in it. A separate lazy CB length
> counter is also introduced to keep track of the number of lazy CBs.
> 
> Suggested-by: Paul McKenney <paulmck@kernel.org>
> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>

I got this from TREE01 and TREE04:

kernel/rcu/tree_nocb.h:439:46: error: ‘struct rcu_data’ has no member named ‘lazy_len’

So I moved the lazy_len field out from under CONFIG_RCU_LAZY.
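
That is, the field is now declared unconditionally in struct rcu_data, since
tree_nocb.h reads ->lazy_len regardless of the config (a sketch of the resulting
fragment, matching the hunk quoted below minus the #ifdef/#endif):

	unsigned long last_fqs_resched;	/* Time of last rcu_resched(). */
	unsigned long last_sched_clock;	/* Jiffies of last rcu_sched_clock_irq(). */

	long lazy_len;			/* Length of buffered lazy callbacks. */
	int cpu;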

							Thanx, Paul

> ---
>  include/linux/rcupdate.h   |   6 ++
>  kernel/rcu/Kconfig         |   8 ++
>  kernel/rcu/rcu.h           |  11 +++
>  kernel/rcu/rcu_segcblist.c |   2 +-
>  kernel/rcu/tree.c          | 130 +++++++++++++++---------
>  kernel/rcu/tree.h          |  13 ++-
>  kernel/rcu/tree_nocb.h     | 198 ++++++++++++++++++++++++++++++-------
>  7 files changed, 280 insertions(+), 88 deletions(-)
> 
> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> index 08605ce7379d..82e8a07e0856 100644
> --- a/include/linux/rcupdate.h
> +++ b/include/linux/rcupdate.h
> @@ -108,6 +108,12 @@ static inline int rcu_preempt_depth(void)
>  
>  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
>  
> +#ifdef CONFIG_RCU_LAZY
> +void call_rcu_lazy(struct rcu_head *head, rcu_callback_t func);
> +#else
> +#define call_rcu_lazy(head, func) call_rcu(head, func)
> +#endif
> +
>  /* Internal to kernel */
>  void rcu_init(void);
>  extern int rcu_scheduler_active;
> diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
> index d471d22a5e21..3128d01427cb 100644
> --- a/kernel/rcu/Kconfig
> +++ b/kernel/rcu/Kconfig
> @@ -311,4 +311,12 @@ config TASKS_TRACE_RCU_READ_MB
>  	  Say N here if you hate read-side memory barriers.
>  	  Take the default if you are unsure.
>  
> +config RCU_LAZY
> +	bool "RCU callback lazy invocation functionality"
> +	depends on RCU_NOCB_CPU
> +	default n
> +	help
> +	  To save power, batch RCU callbacks and flush after delay, memory
> +	  pressure or callback list growing too big.
> +
>  endmenu # "RCU Subsystem"
> diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
> index be5979da07f5..94675f14efe8 100644
> --- a/kernel/rcu/rcu.h
> +++ b/kernel/rcu/rcu.h
> @@ -474,6 +474,14 @@ enum rcutorture_type {
>  	INVALID_RCU_FLAVOR
>  };
>  
> +#if defined(CONFIG_RCU_LAZY)
> +unsigned long rcu_lazy_get_jiffies_till_flush(void);
> +void rcu_lazy_set_jiffies_till_flush(unsigned long j);
> +#else
> +static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; }
> +static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { }
> +#endif
> +
>  #if defined(CONFIG_TREE_RCU)
>  void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
>  			    unsigned long *gp_seq);
> @@ -483,6 +491,8 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
>  			       unsigned long c_old,
>  			       unsigned long c);
>  void rcu_gp_set_torture_wait(int duration);
> +void rcu_force_call_rcu_to_lazy(bool force);
> +
>  #else
>  static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
>  					  int *flags, unsigned long *gp_seq)
> @@ -501,6 +511,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
>  	do { } while (0)
>  #endif
>  static inline void rcu_gp_set_torture_wait(int duration) { }
> +static inline void rcu_force_call_rcu_to_lazy(bool force) { }
>  #endif
>  
>  #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
> diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
> index c54ea2b6a36b..55b50e592986 100644
> --- a/kernel/rcu/rcu_segcblist.c
> +++ b/kernel/rcu/rcu_segcblist.c
> @@ -38,7 +38,7 @@ void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp)
>   * element of the second rcu_cblist structure, but ensuring that the second
>   * rcu_cblist structure, if initially non-empty, always appears non-empty
>   * throughout the process.  If rdp is NULL, the second rcu_cblist structure
> - * is instead initialized to empty.
> + * is instead initialized to empty. Also account for lazy_len for lazy CBs.
>   */
>  void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
>  			      struct rcu_cblist *srclp,
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 9fe581be8696..aaced29a0a71 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -2728,47 +2728,8 @@ static void check_cb_ovld(struct rcu_data *rdp)
>  	raw_spin_unlock_rcu_node(rnp);
>  }
>  
> -/**
> - * call_rcu() - Queue an RCU callback for invocation after a grace period.
> - * @head: structure to be used for queueing the RCU updates.
> - * @func: actual callback function to be invoked after the grace period
> - *
> - * The callback function will be invoked some time after a full grace
> - * period elapses, in other words after all pre-existing RCU read-side
> - * critical sections have completed.  However, the callback function
> - * might well execute concurrently with RCU read-side critical sections
> - * that started after call_rcu() was invoked.
> - *
> - * RCU read-side critical sections are delimited by rcu_read_lock()
> - * and rcu_read_unlock(), and may be nested.  In addition, but only in
> - * v5.0 and later, regions of code across which interrupts, preemption,
> - * or softirqs have been disabled also serve as RCU read-side critical
> - * sections.  This includes hardware interrupt handlers, softirq handlers,
> - * and NMI handlers.
> - *
> - * Note that all CPUs must agree that the grace period extended beyond
> - * all pre-existing RCU read-side critical section.  On systems with more
> - * than one CPU, this means that when "func()" is invoked, each CPU is
> - * guaranteed to have executed a full memory barrier since the end of its
> - * last RCU read-side critical section whose beginning preceded the call
> - * to call_rcu().  It also means that each CPU executing an RCU read-side
> - * critical section that continues beyond the start of "func()" must have
> - * executed a memory barrier after the call_rcu() but before the beginning
> - * of that RCU read-side critical section.  Note that these guarantees
> - * include CPUs that are offline, idle, or executing in user mode, as
> - * well as CPUs that are executing in the kernel.
> - *
> - * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
> - * resulting RCU callback function "func()", then both CPU A and CPU B are
> - * guaranteed to execute a full memory barrier during the time interval
> - * between the call to call_rcu() and the invocation of "func()" -- even
> - * if CPU A and CPU B are the same CPU (but again only if the system has
> - * more than one CPU).
> - *
> - * Implementation of these memory-ordering guarantees is described here:
> - * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
> - */
> -void call_rcu(struct rcu_head *head, rcu_callback_t func)
> +static void
> +__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy)
>  {
>  	static atomic_t doublefrees;
>  	unsigned long flags;
> @@ -2818,7 +2779,7 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
>  		trace_rcu_callback(rcu_state.name, head,
>  				   rcu_segcblist_n_cbs(&rdp->cblist));
>  
> -	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
> +	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy))
>  		return; // Enqueued onto ->nocb_bypass, so just leave.
>  	// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
>  	rcu_segcblist_enqueue(&rdp->cblist, head);
> @@ -2833,8 +2794,86 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
>  		local_irq_restore(flags);
>  	}
>  }
> -EXPORT_SYMBOL_GPL(call_rcu);
>  
> +#ifdef CONFIG_RCU_LAZY
> +/**
> + * call_rcu_lazy() - Lazily queue RCU callback for invocation after grace period.
> + * @head: structure to be used for queueing the RCU updates.
> + * @func: actual callback function to be invoked after the grace period
> + *
> + * The callback function will be invoked some time after a full grace
> + * period elapses, in other words after all pre-existing RCU read-side
> + * critical sections have completed.
> + *
> + * Use this API instead of call_rcu() if you don't mind the callback being
> + * invoked after very long periods of time on systems without memory pressure
> + * and on systems which are lightly loaded or mostly idle.
> + *
> + * Other than the extra delay in callbacks being invoked, this function is
> + * identical to, and reuses call_rcu()'s logic. Refer to call_rcu() for more
> + * details about memory ordering and other functionality.
> + */
> +void call_rcu_lazy(struct rcu_head *head, rcu_callback_t func)
> +{
> +	return __call_rcu_common(head, func, true);
> +}
> +EXPORT_SYMBOL_GPL(call_rcu_lazy);
> +#endif
> +
> +static bool force_call_rcu_to_lazy;
> +
> +void rcu_force_call_rcu_to_lazy(bool force)
> +{
> +	if (IS_ENABLED(CONFIG_RCU_SCALE_TEST))
> +		WRITE_ONCE(force_call_rcu_to_lazy, force);
> +}
> +EXPORT_SYMBOL_GPL(rcu_force_call_rcu_to_lazy);
> +
> +/**
> + * call_rcu() - Queue an RCU callback for invocation after a grace period.
> + * @head: structure to be used for queueing the RCU updates.
> + * @func: actual callback function to be invoked after the grace period
> + *
> + * The callback function will be invoked some time after a full grace
> + * period elapses, in other words after all pre-existing RCU read-side
> + * critical sections have completed.  However, the callback function
> + * might well execute concurrently with RCU read-side critical sections
> + * that started after call_rcu() was invoked.
> + *
> + * RCU read-side critical sections are delimited by rcu_read_lock()
> + * and rcu_read_unlock(), and may be nested.  In addition, but only in
> + * v5.0 and later, regions of code across which interrupts, preemption,
> + * or softirqs have been disabled also serve as RCU read-side critical
> + * sections.  This includes hardware interrupt handlers, softirq handlers,
> + * and NMI handlers.
> + *
> + * Note that all CPUs must agree that the grace period extended beyond
> + * all pre-existing RCU read-side critical section.  On systems with more
> + * than one CPU, this means that when "func()" is invoked, each CPU is
> + * guaranteed to have executed a full memory barrier since the end of its
> + * last RCU read-side critical section whose beginning preceded the call
> + * to call_rcu().  It also means that each CPU executing an RCU read-side
> + * critical section that continues beyond the start of "func()" must have
> + * executed a memory barrier after the call_rcu() but before the beginning
> + * of that RCU read-side critical section.  Note that these guarantees
> + * include CPUs that are offline, idle, or executing in user mode, as
> + * well as CPUs that are executing in the kernel.
> + *
> + * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
> + * resulting RCU callback function "func()", then both CPU A and CPU B are
> + * guaranteed to execute a full memory barrier during the time interval
> + * between the call to call_rcu() and the invocation of "func()" -- even
> + * if CPU A and CPU B are the same CPU (but again only if the system has
> + * more than one CPU).
> + *
> + * Implementation of these memory-ordering guarantees is described here:
> + * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
> + */
> +void call_rcu(struct rcu_head *head, rcu_callback_t func)
> +{
> +	return __call_rcu_common(head, func, force_call_rcu_to_lazy);
> +}
> +EXPORT_SYMBOL_GPL(call_rcu);
>  
>  /* Maximum number of jiffies to wait before draining a batch. */
>  #define KFREE_DRAIN_JIFFIES (5 * HZ)
> @@ -3904,7 +3943,8 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
>  	rdp->barrier_head.func = rcu_barrier_callback;
>  	debug_rcu_head_queue(&rdp->barrier_head);
>  	rcu_nocb_lock(rdp);
> -	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
> +	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false,
> +		     /* wake gp thread */ true));
>  	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
>  		atomic_inc(&rcu_state.barrier_cpu_count);
>  	} else {
> @@ -4325,7 +4365,7 @@ void rcutree_migrate_callbacks(int cpu)
>  	my_rdp = this_cpu_ptr(&rcu_data);
>  	my_rnp = my_rdp->mynode;
>  	rcu_nocb_lock(my_rdp); /* irqs already disabled. */
> -	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
> +	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies, false, false));
>  	raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
>  	/* Leverage recent GPs and set GP for new callbacks. */
>  	needwake = rcu_advance_cbs(my_rnp, rdp) ||
> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
> index d4a97e40ea9c..946d819b23fc 100644
> --- a/kernel/rcu/tree.h
> +++ b/kernel/rcu/tree.h
> @@ -263,14 +263,18 @@ struct rcu_data {
>  	unsigned long last_fqs_resched;	/* Time of last rcu_resched(). */
>  	unsigned long last_sched_clock;	/* Jiffies of last rcu_sched_clock_irq(). */
>  
> +#ifdef CONFIG_RCU_LAZY
> +	long lazy_len;			/* Length of buffered lazy callbacks. */
> +#endif
>  	int cpu;
>  };
>  
>  /* Values for nocb_defer_wakeup field in struct rcu_data. */
>  #define RCU_NOCB_WAKE_NOT	0
>  #define RCU_NOCB_WAKE_BYPASS	1
> -#define RCU_NOCB_WAKE		2
> -#define RCU_NOCB_WAKE_FORCE	3
> +#define RCU_NOCB_WAKE_LAZY	2
> +#define RCU_NOCB_WAKE		3
> +#define RCU_NOCB_WAKE_FORCE	4
>  
>  #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
>  					/* For jiffies_till_first_fqs and */
> @@ -440,9 +444,10 @@ static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
>  static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
>  static void rcu_init_one_nocb(struct rcu_node *rnp);
>  static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
> -				  unsigned long j);
> +				  unsigned long j, bool lazy, bool wakegp);
>  static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
> -				bool *was_alldone, unsigned long flags);
> +				bool *was_alldone, unsigned long flags,
> +				bool lazy);
>  static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
>  				 unsigned long flags);
>  static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
> diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
> index 31068dd31315..7e97a7b6e046 100644
> --- a/kernel/rcu/tree_nocb.h
> +++ b/kernel/rcu/tree_nocb.h
> @@ -256,6 +256,31 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
>  	return __wake_nocb_gp(rdp_gp, rdp, force, flags);
>  }
>  
> +/*
> + * LAZY_FLUSH_JIFFIES decides the maximum amount of time that
> + * can elapse before lazy callbacks are flushed. Lazy callbacks
> + * could be flushed much earlier for a number of other reasons
> + * however, LAZY_FLUSH_JIFFIES will ensure no lazy callbacks are
> + * left unsubmitted to RCU after those many jiffies.
> + */
> +#define LAZY_FLUSH_JIFFIES (10 * HZ)
> +unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES;
> +
> +#ifdef CONFIG_RCU_LAZY
> +// To be called only from test code.
> +void rcu_lazy_set_jiffies_till_flush(unsigned long jif)
> +{
> +	jiffies_till_flush = jif;
> +}
> +EXPORT_SYMBOL(rcu_lazy_set_jiffies_till_flush);
> +
> +unsigned long rcu_lazy_get_jiffies_till_flush(void)
> +{
> +	return jiffies_till_flush;
> +}
> +EXPORT_SYMBOL(rcu_lazy_get_jiffies_till_flush);
> +#endif
> +
>  /*
>   * Arrange to wake the GP kthread for this NOCB group at some future
>   * time when it is safe to do so.
> @@ -265,23 +290,39 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
>  {
>  	unsigned long flags;
>  	struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
> +	unsigned long mod_jif = 0;
>  
>  	raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
>  
>  	/*
> -	 * Bypass wakeup overrides previous deferments. In case
> -	 * of callback storm, no need to wake up too early.
> +	 * Bypass and lazy wakeup overrides previous deferments. In case of
> +	 * callback storm, no need to wake up too early.
>  	 */
> -	if (waketype == RCU_NOCB_WAKE_BYPASS) {
> -		mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
> -		WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
> -	} else {
> +	switch (waketype) {
> +	case RCU_NOCB_WAKE_LAZY:
> +		if (rdp->nocb_defer_wakeup != RCU_NOCB_WAKE_LAZY)
> +			mod_jif = jiffies_till_flush;
> +		break;
> +
> +	case RCU_NOCB_WAKE_BYPASS:
> +		mod_jif = 2;
> +		break;
> +
> +	case RCU_NOCB_WAKE:
> +	case RCU_NOCB_WAKE_FORCE:
> +		// For these, make it wake up the soonest if we
> +		// were in a bypass or lazy sleep before.
>  		if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE)
> -			mod_timer(&rdp_gp->nocb_timer, jiffies + 1);
> -		if (rdp_gp->nocb_defer_wakeup < waketype)
> -			WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
> +			mod_jif = 1;
> +		break;
>  	}
>  
> +	if (mod_jif)
> +		mod_timer(&rdp_gp->nocb_timer, jiffies + mod_jif);
> +
> +	if (rdp_gp->nocb_defer_wakeup < waketype)
> +		WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
> +
>  	raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
>  
>  	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
> @@ -293,10 +334,13 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
>   * proves to be initially empty, just return false because the no-CB GP
>   * kthread may need to be awakened in this case.
>   *
> + * Return true if there was something to be flushed and it succeeded, otherwise
> + * false.
> + *
>   * Note that this function always returns true if rhp is NULL.
>   */
>  static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
> -				     unsigned long j)
> +				     unsigned long j, bool lazy)
>  {
>  	struct rcu_cblist rcl;
>  
> @@ -310,7 +354,18 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>  	/* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
>  	if (rhp)
>  		rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
> -	rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
> +
> +	/*
> +	 * If the new CB requested was a lazy one, queue it onto the main
> +	 * ->cblist so we can take advantage of a sooner grade period.
> +	 */
> +	if (lazy && rhp) {
> +		rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, NULL);
> +		rcu_cblist_enqueue(&rcl, rhp);
> +	} else {
> +		rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
> +	}
> +
>  	rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
>  	WRITE_ONCE(rdp->nocb_bypass_first, j);
>  	rcu_nocb_bypass_unlock(rdp);
> @@ -326,13 +381,20 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>   * Note that this function always returns true if rhp is NULL.
>   */
>  static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
> -				  unsigned long j)
> +				  unsigned long j, bool lazy, bool wake_gp)
>  {
> +	bool ret;
> +
>  	if (!rcu_rdp_is_offloaded(rdp))
>  		return true;
>  	rcu_lockdep_assert_cblist_protected(rdp);
>  	rcu_nocb_bypass_lock(rdp);
> -	return rcu_nocb_do_flush_bypass(rdp, rhp, j);
> +	ret = rcu_nocb_do_flush_bypass(rdp, rhp, j, lazy);
> +
> +	if (wake_gp)
> +		wake_nocb_gp(rdp, true);
> +
> +	return ret;
>  }
>  
>  /*
> @@ -345,7 +407,7 @@ static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
>  	if (!rcu_rdp_is_offloaded(rdp) ||
>  	    !rcu_nocb_bypass_trylock(rdp))
>  		return;
> -	WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j));
> +	WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j, false));
>  }
>  
>  /*
> @@ -367,12 +429,14 @@ static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
>   * there is only one CPU in operation.
>   */
>  static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
> -				bool *was_alldone, unsigned long flags)
> +				bool *was_alldone, unsigned long flags,
> +				bool lazy)
>  {
>  	unsigned long c;
>  	unsigned long cur_gp_seq;
>  	unsigned long j = jiffies;
>  	long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
> +	bool bypass_is_lazy = (ncbs == READ_ONCE(rdp->lazy_len));
>  
>  	lockdep_assert_irqs_disabled();
>  
> @@ -417,23 +481,29 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>  	// If there hasn't yet been all that many ->cblist enqueues
>  	// this jiffy, tell the caller to enqueue onto ->cblist.  But flush
>  	// ->nocb_bypass first.
> -	if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) {
> +	// Lazy CBs throttle this back and do immediate bypass queuing.
> +	if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy && !lazy) {
>  		rcu_nocb_lock(rdp);
>  		*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
>  		if (*was_alldone)
>  			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
>  					    TPS("FirstQ"));
> -		WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j));
> +
> +		WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j, false, false));
> +
>  		WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
>  		return false; // Caller must enqueue the callback.
>  	}
>  
>  	// If ->nocb_bypass has been used too long or is too full,
>  	// flush ->nocb_bypass to ->cblist.
> -	if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) ||
> +	if ((ncbs && !bypass_is_lazy && j != READ_ONCE(rdp->nocb_bypass_first)) ||
> +	    (ncbs &&  bypass_is_lazy &&
> +		(time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush))) ||
>  	    ncbs >= qhimark) {
>  		rcu_nocb_lock(rdp);
> -		if (!rcu_nocb_flush_bypass(rdp, rhp, j)) {
> +
> +		if (!rcu_nocb_flush_bypass(rdp, rhp, j, lazy, false)) {
>  			*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
>  			if (*was_alldone)
>  				trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
> @@ -460,16 +530,29 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>  	// We need to use the bypass.
>  	rcu_nocb_wait_contended(rdp);
>  	rcu_nocb_bypass_lock(rdp);
> +
>  	ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
>  	rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
>  	rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
> +
> +	if (IS_ENABLED(CONFIG_RCU_LAZY) && lazy)
> +		WRITE_ONCE(rdp->lazy_len, rdp->lazy_len + 1);
> +
>  	if (!ncbs) {
>  		WRITE_ONCE(rdp->nocb_bypass_first, j);
>  		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
>  	}
> +
>  	rcu_nocb_bypass_unlock(rdp);
>  	smp_mb(); /* Order enqueue before wake. */
> -	if (ncbs) {
> +
> +	// We had CBs in the bypass list before. There is nothing else to do if:
> +	// There were only non-lazy CBs before, in this case, the bypass timer
> +	// or GP-thread will handle the CBs including any new lazy ones.
> +	// Or, the new CB is lazy and the old bypass-CBs were also lazy. In this
> +	// case the old lazy timer would have been setup. When that expires,
> +	// the new lazy one will be handled.
> +	if (ncbs && (!bypass_is_lazy || lazy)) {
>  		local_irq_restore(flags);
>  	} else {
>  		// No-CBs GP kthread might be indefinitely asleep, if so, wake.
> @@ -478,6 +561,10 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>  			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
>  					    TPS("FirstBQwake"));
>  			__call_rcu_nocb_wake(rdp, true, flags);
> +		} else if (bypass_is_lazy && !lazy) {
> +			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
> +					    TPS("FirstBQwakeLazy2Non"));
> +			__call_rcu_nocb_wake(rdp, true, flags);
>  		} else {
>  			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
>  					    TPS("FirstBQnoWake"));
> @@ -499,7 +586,7 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
>  {
>  	unsigned long cur_gp_seq;
>  	unsigned long j;
> -	long len;
> +	long len, lazy_len, bypass_len;
>  	struct task_struct *t;
>  
>  	// If we are being polled or there is no kthread, just leave.
> @@ -512,9 +599,16 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
>  	}
>  	// Need to actually to a wakeup.
>  	len = rcu_segcblist_n_cbs(&rdp->cblist);
> +	bypass_len = rcu_cblist_n_cbs(&rdp->nocb_bypass);
> +	lazy_len = READ_ONCE(rdp->lazy_len);
>  	if (was_alldone) {
>  		rdp->qlen_last_fqs_check = len;
> -		if (!irqs_disabled_flags(flags)) {
> +		// Only lazy CBs in bypass list
> +		if (lazy_len && bypass_len == lazy_len) {
> +			rcu_nocb_unlock_irqrestore(rdp, flags);
> +			wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY,
> +					   TPS("WakeLazy"));
> +		} else if (!irqs_disabled_flags(flags)) {
>  			/* ... if queue was empty ... */
>  			rcu_nocb_unlock_irqrestore(rdp, flags);
>  			wake_nocb_gp(rdp, false);
> @@ -604,8 +698,8 @@ static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu)
>   */
>  static void nocb_gp_wait(struct rcu_data *my_rdp)
>  {
> -	bool bypass = false;
> -	long bypass_ncbs;
> +	bool bypass = false, lazy = false;
> +	long bypass_ncbs, lazy_ncbs;
>  	int __maybe_unused cpu = my_rdp->cpu;
>  	unsigned long cur_gp_seq;
>  	unsigned long flags;
> @@ -640,24 +734,41 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
>  	 * won't be ignored for long.
>  	 */
>  	list_for_each_entry(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp) {
> +		bool flush_bypass = false;
> +
>  		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
>  		rcu_nocb_lock_irqsave(rdp, flags);
>  		lockdep_assert_held(&rdp->nocb_lock);
>  		bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
> -		if (bypass_ncbs &&
> +		lazy_ncbs = READ_ONCE(rdp->lazy_len);
> +
> +		if (bypass_ncbs && (lazy_ncbs == bypass_ncbs) &&
> +		    (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush) ||
> +		     bypass_ncbs > 2 * qhimark)) {
> +			flush_bypass = true;
> +		} else if (bypass_ncbs && (lazy_ncbs != bypass_ncbs) &&
>  		    (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
>  		     bypass_ncbs > 2 * qhimark)) {
> -			// Bypass full or old, so flush it.
> -			(void)rcu_nocb_try_flush_bypass(rdp, j);
> -			bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
> +			flush_bypass = true;
>  		} else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
>  			rcu_nocb_unlock_irqrestore(rdp, flags);
>  			continue; /* No callbacks here, try next. */
>  		}
> +
> +		if (flush_bypass) {
> +			// Bypass full or old, so flush it.
> +			(void)rcu_nocb_try_flush_bypass(rdp, j);
> +			bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
> +			lazy_ncbs = READ_ONCE(rdp->lazy_len);
> +		}
> +
>  		if (bypass_ncbs) {
>  			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
> -					    TPS("Bypass"));
> -			bypass = true;
> +				    bypass_ncbs == lazy_ncbs ? TPS("Lazy") : TPS("Bypass"));
> +			if (bypass_ncbs == lazy_ncbs)
> +				lazy = true;
> +			else
> +				bypass = true;
>  		}
>  		rnp = rdp->mynode;
>  
> @@ -705,12 +816,21 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
>  	my_rdp->nocb_gp_gp = needwait_gp;
>  	my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
>  
> -	if (bypass && !rcu_nocb_poll) {
> -		// At least one child with non-empty ->nocb_bypass, so set
> -		// timer in order to avoid stranding its callbacks.
> -		wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
> -				   TPS("WakeBypassIsDeferred"));
> +	// At least one child with non-empty ->nocb_bypass, so set
> +	// timer in order to avoid stranding its callbacks.
> +	if (!rcu_nocb_poll) {
> +		// If bypass list only has lazy CBs. Add a deferred
> +		// lazy wake up.
> +		if (lazy && !bypass) {
> +			wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_LAZY,
> +					TPS("WakeLazyIsDeferred"));
> +		// Otherwise add a deferred bypass wake up.
> +		} else if (bypass) {
> +			wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
> +					TPS("WakeBypassIsDeferred"));
> +		}
>  	}
> +
>  	if (rcu_nocb_poll) {
>  		/* Polling, so trace if first poll in the series. */
>  		if (gotcbs)
> @@ -1036,7 +1156,7 @@ static long rcu_nocb_rdp_deoffload(void *arg)
>  	 * return false, which means that future calls to rcu_nocb_try_bypass()
>  	 * will refuse to put anything into the bypass.
>  	 */
> -	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
> +	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false, false));
>  	/*
>  	 * Start with invoking rcu_core() early. This way if the current thread
>  	 * happens to preempt an ongoing call to rcu_core() in the middle,
> @@ -1290,6 +1410,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
>  	raw_spin_lock_init(&rdp->nocb_gp_lock);
>  	timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
>  	rcu_cblist_init(&rdp->nocb_bypass);
> +	WRITE_ONCE(rdp->lazy_len, 0);
>  	mutex_init(&rdp->nocb_gp_kthread_mutex);
>  }
>  
> @@ -1571,13 +1692,14 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
>  }
>  
>  static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
> -				  unsigned long j)
> +				  unsigned long j, bool lazy, bool wakegp)
>  {
>  	return true;
>  }
>  
>  static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
> -				bool *was_alldone, unsigned long flags)
> +				bool *was_alldone, unsigned long flags,
> +				bool lazy)
>  {
>  	return false;
>  }
> -- 
> 2.37.2.789.g6183377224-goog
>
Joel Fernandes Sept. 3, 2022, 2:05 p.m. UTC | #4
On 9/3/2022 10:03 AM, Paul E. McKenney wrote:
> On Thu, Sep 01, 2022 at 10:17:08PM +0000, Joel Fernandes (Google) wrote:
>> Implement timer-based RCU lazy callback batching. The batch is flushed
>> whenever a certain amount of time has passed, or the batch on a
>> particular CPU grows too big. Also memory pressure will flush it in a
>> future patch.
>>
>> To handle several corner cases automagically (such as rcu_barrier() and
>> hotplug), we re-use bypass lists to handle lazy CBs. The bypass list
>> length has the lazy CB length included in it. A separate lazy CB length
>> counter is also introduced to keep track of the number of lazy CBs.
>>
>> Suggested-by: Paul McKenney <paulmck@kernel.org>
>> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> 
> I got this from TREE01 and TREE04:
> 
> kernel/rcu/tree_nocb.h:439:46: error: ‘struct rcu_data’ has no member named ‘lazy_len’
> 
> So I moved the lazy_len field out from under CONFIG_RCU_LAZY.

Ok, thank you!

- Joel

> 
> 							Thanx, Paul
> 
>> ---
>>  include/linux/rcupdate.h   |   6 ++
>>  kernel/rcu/Kconfig         |   8 ++
>>  kernel/rcu/rcu.h           |  11 +++
>>  kernel/rcu/rcu_segcblist.c |   2 +-
>>  kernel/rcu/tree.c          | 130 +++++++++++++++---------
>>  kernel/rcu/tree.h          |  13 ++-
>>  kernel/rcu/tree_nocb.h     | 198 ++++++++++++++++++++++++++++++-------
>>  7 files changed, 280 insertions(+), 88 deletions(-)
>>
>> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
>> index 08605ce7379d..82e8a07e0856 100644
>> --- a/include/linux/rcupdate.h
>> +++ b/include/linux/rcupdate.h
>> @@ -108,6 +108,12 @@ static inline int rcu_preempt_depth(void)
>>  
>>  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
>>  
>> +#ifdef CONFIG_RCU_LAZY
>> +void call_rcu_lazy(struct rcu_head *head, rcu_callback_t func);
>> +#else
>> +#define call_rcu_lazy(head, func) call_rcu(head, func)
>> +#endif
>> +
>>  /* Internal to kernel */
>>  void rcu_init(void);
>>  extern int rcu_scheduler_active;
>> diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
>> index d471d22a5e21..3128d01427cb 100644
>> --- a/kernel/rcu/Kconfig
>> +++ b/kernel/rcu/Kconfig
>> @@ -311,4 +311,12 @@ config TASKS_TRACE_RCU_READ_MB
>>  	  Say N here if you hate read-side memory barriers.
>>  	  Take the default if you are unsure.
>>  
>> +config RCU_LAZY
>> +	bool "RCU callback lazy invocation functionality"
>> +	depends on RCU_NOCB_CPU
>> +	default n
>> +	help
>> +	  To save power, batch RCU callbacks and flush after delay, memory
>> +	  pressure or callback list growing too big.
>> +
>>  endmenu # "RCU Subsystem"
>> diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
>> index be5979da07f5..94675f14efe8 100644
>> --- a/kernel/rcu/rcu.h
>> +++ b/kernel/rcu/rcu.h
>> @@ -474,6 +474,14 @@ enum rcutorture_type {
>>  	INVALID_RCU_FLAVOR
>>  };
>>  
>> +#if defined(CONFIG_RCU_LAZY)
>> +unsigned long rcu_lazy_get_jiffies_till_flush(void);
>> +void rcu_lazy_set_jiffies_till_flush(unsigned long j);
>> +#else
>> +static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; }
>> +static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { }
>> +#endif
>> +
>>  #if defined(CONFIG_TREE_RCU)
>>  void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
>>  			    unsigned long *gp_seq);
>> @@ -483,6 +491,8 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
>>  			       unsigned long c_old,
>>  			       unsigned long c);
>>  void rcu_gp_set_torture_wait(int duration);
>> +void rcu_force_call_rcu_to_lazy(bool force);
>> +
>>  #else
>>  static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
>>  					  int *flags, unsigned long *gp_seq)
>> @@ -501,6 +511,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
>>  	do { } while (0)
>>  #endif
>>  static inline void rcu_gp_set_torture_wait(int duration) { }
>> +static inline void rcu_force_call_rcu_to_lazy(bool force) { }
>>  #endif
>>  
>>  #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
>> diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
>> index c54ea2b6a36b..55b50e592986 100644
>> --- a/kernel/rcu/rcu_segcblist.c
>> +++ b/kernel/rcu/rcu_segcblist.c
>> @@ -38,7 +38,7 @@ void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp)
>>   * element of the second rcu_cblist structure, but ensuring that the second
>>   * rcu_cblist structure, if initially non-empty, always appears non-empty
>>   * throughout the process.  If rdp is NULL, the second rcu_cblist structure
>> - * is instead initialized to empty.
>> + * is instead initialized to empty. Also account for lazy_len for lazy CBs.
>>   */
>>  void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
>>  			      struct rcu_cblist *srclp,
>> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
>> index 9fe581be8696..aaced29a0a71 100644
>> --- a/kernel/rcu/tree.c
>> +++ b/kernel/rcu/tree.c
>> @@ -2728,47 +2728,8 @@ static void check_cb_ovld(struct rcu_data *rdp)
>>  	raw_spin_unlock_rcu_node(rnp);
>>  }
>>  
>> -/**
>> - * call_rcu() - Queue an RCU callback for invocation after a grace period.
>> - * @head: structure to be used for queueing the RCU updates.
>> - * @func: actual callback function to be invoked after the grace period
>> - *
>> - * The callback function will be invoked some time after a full grace
>> - * period elapses, in other words after all pre-existing RCU read-side
>> - * critical sections have completed.  However, the callback function
>> - * might well execute concurrently with RCU read-side critical sections
>> - * that started after call_rcu() was invoked.
>> - *
>> - * RCU read-side critical sections are delimited by rcu_read_lock()
>> - * and rcu_read_unlock(), and may be nested.  In addition, but only in
>> - * v5.0 and later, regions of code across which interrupts, preemption,
>> - * or softirqs have been disabled also serve as RCU read-side critical
>> - * sections.  This includes hardware interrupt handlers, softirq handlers,
>> - * and NMI handlers.
>> - *
>> - * Note that all CPUs must agree that the grace period extended beyond
>> - * all pre-existing RCU read-side critical section.  On systems with more
>> - * than one CPU, this means that when "func()" is invoked, each CPU is
>> - * guaranteed to have executed a full memory barrier since the end of its
>> - * last RCU read-side critical section whose beginning preceded the call
>> - * to call_rcu().  It also means that each CPU executing an RCU read-side
>> - * critical section that continues beyond the start of "func()" must have
>> - * executed a memory barrier after the call_rcu() but before the beginning
>> - * of that RCU read-side critical section.  Note that these guarantees
>> - * include CPUs that are offline, idle, or executing in user mode, as
>> - * well as CPUs that are executing in the kernel.
>> - *
>> - * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
>> - * resulting RCU callback function "func()", then both CPU A and CPU B are
>> - * guaranteed to execute a full memory barrier during the time interval
>> - * between the call to call_rcu() and the invocation of "func()" -- even
>> - * if CPU A and CPU B are the same CPU (but again only if the system has
>> - * more than one CPU).
>> - *
>> - * Implementation of these memory-ordering guarantees is described here:
>> - * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
>> - */
>> -void call_rcu(struct rcu_head *head, rcu_callback_t func)
>> +static void
>> +__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy)
>>  {
>>  	static atomic_t doublefrees;
>>  	unsigned long flags;
>> @@ -2818,7 +2779,7 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
>>  		trace_rcu_callback(rcu_state.name, head,
>>  				   rcu_segcblist_n_cbs(&rdp->cblist));
>>  
>> -	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
>> +	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy))
>>  		return; // Enqueued onto ->nocb_bypass, so just leave.
>>  	// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
>>  	rcu_segcblist_enqueue(&rdp->cblist, head);
>> @@ -2833,8 +2794,86 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
>>  		local_irq_restore(flags);
>>  	}
>>  }
>> -EXPORT_SYMBOL_GPL(call_rcu);
>>  
>> +#ifdef CONFIG_RCU_LAZY
>> +/**
>> + * call_rcu_lazy() - Lazily queue RCU callback for invocation after grace period.
>> + * @head: structure to be used for queueing the RCU updates.
>> + * @func: actual callback function to be invoked after the grace period
>> + *
>> + * The callback function will be invoked some time after a full grace
>> + * period elapses, in other words after all pre-existing RCU read-side
>> + * critical sections have completed.
>> + *
>> + * Use this API instead of call_rcu() if you don't mind the callback being
>> + * invoked after very long periods of time on systems without memory pressure
>> + * and on systems which are lightly loaded or mostly idle.
>> + *
>> + * Other than the extra delay in callbacks being invoked, this function is
>> + * identical to, and reuses call_rcu()'s logic. Refer to call_rcu() for more
>> + * details about memory ordering and other functionality.
>> + */
>> +void call_rcu_lazy(struct rcu_head *head, rcu_callback_t func)
>> +{
>> +	return __call_rcu_common(head, func, true);
>> +}
>> +EXPORT_SYMBOL_GPL(call_rcu_lazy);
>> +#endif
>> +
>> +static bool force_call_rcu_to_lazy;
>> +
>> +void rcu_force_call_rcu_to_lazy(bool force)
>> +{
>> +	if (IS_ENABLED(CONFIG_RCU_SCALE_TEST))
>> +		WRITE_ONCE(force_call_rcu_to_lazy, force);
>> +}
>> +EXPORT_SYMBOL_GPL(rcu_force_call_rcu_to_lazy);
>> +
>> +/**
>> + * call_rcu() - Queue an RCU callback for invocation after a grace period.
>> + * @head: structure to be used for queueing the RCU updates.
>> + * @func: actual callback function to be invoked after the grace period
>> + *
>> + * The callback function will be invoked some time after a full grace
>> + * period elapses, in other words after all pre-existing RCU read-side
>> + * critical sections have completed.  However, the callback function
>> + * might well execute concurrently with RCU read-side critical sections
>> + * that started after call_rcu() was invoked.
>> + *
>> + * RCU read-side critical sections are delimited by rcu_read_lock()
>> + * and rcu_read_unlock(), and may be nested.  In addition, but only in
>> + * v5.0 and later, regions of code across which interrupts, preemption,
>> + * or softirqs have been disabled also serve as RCU read-side critical
>> + * sections.  This includes hardware interrupt handlers, softirq handlers,
>> + * and NMI handlers.
>> + *
>> + * Note that all CPUs must agree that the grace period extended beyond
>> + * all pre-existing RCU read-side critical section.  On systems with more
>> + * than one CPU, this means that when "func()" is invoked, each CPU is
>> + * guaranteed to have executed a full memory barrier since the end of its
>> + * last RCU read-side critical section whose beginning preceded the call
>> + * to call_rcu().  It also means that each CPU executing an RCU read-side
>> + * critical section that continues beyond the start of "func()" must have
>> + * executed a memory barrier after the call_rcu() but before the beginning
>> + * of that RCU read-side critical section.  Note that these guarantees
>> + * include CPUs that are offline, idle, or executing in user mode, as
>> + * well as CPUs that are executing in the kernel.
>> + *
>> + * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
>> + * resulting RCU callback function "func()", then both CPU A and CPU B are
>> + * guaranteed to execute a full memory barrier during the time interval
>> + * between the call to call_rcu() and the invocation of "func()" -- even
>> + * if CPU A and CPU B are the same CPU (but again only if the system has
>> + * more than one CPU).
>> + *
>> + * Implementation of these memory-ordering guarantees is described here:
>> + * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
>> + */
>> +void call_rcu(struct rcu_head *head, rcu_callback_t func)
>> +{
>> +	return __call_rcu_common(head, func, force_call_rcu_to_lazy);
>> +}
>> +EXPORT_SYMBOL_GPL(call_rcu);
>>  
>>  /* Maximum number of jiffies to wait before draining a batch. */
>>  #define KFREE_DRAIN_JIFFIES (5 * HZ)
>> @@ -3904,7 +3943,8 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
>>  	rdp->barrier_head.func = rcu_barrier_callback;
>>  	debug_rcu_head_queue(&rdp->barrier_head);
>>  	rcu_nocb_lock(rdp);
>> -	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
>> +	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false,
>> +		     /* wake gp thread */ true));
>>  	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
>>  		atomic_inc(&rcu_state.barrier_cpu_count);
>>  	} else {
>> @@ -4325,7 +4365,7 @@ void rcutree_migrate_callbacks(int cpu)
>>  	my_rdp = this_cpu_ptr(&rcu_data);
>>  	my_rnp = my_rdp->mynode;
>>  	rcu_nocb_lock(my_rdp); /* irqs already disabled. */
>> -	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
>> +	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies, false, false));
>>  	raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
>>  	/* Leverage recent GPs and set GP for new callbacks. */
>>  	needwake = rcu_advance_cbs(my_rnp, rdp) ||
>> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
>> index d4a97e40ea9c..946d819b23fc 100644
>> --- a/kernel/rcu/tree.h
>> +++ b/kernel/rcu/tree.h
>> @@ -263,14 +263,18 @@ struct rcu_data {
>>  	unsigned long last_fqs_resched;	/* Time of last rcu_resched(). */
>>  	unsigned long last_sched_clock;	/* Jiffies of last rcu_sched_clock_irq(). */
>>  
>> +#ifdef CONFIG_RCU_LAZY
>> +	long lazy_len;			/* Length of buffered lazy callbacks. */
>> +#endif
>>  	int cpu;
>>  };
>>  
>>  /* Values for nocb_defer_wakeup field in struct rcu_data. */
>>  #define RCU_NOCB_WAKE_NOT	0
>>  #define RCU_NOCB_WAKE_BYPASS	1
>> -#define RCU_NOCB_WAKE		2
>> -#define RCU_NOCB_WAKE_FORCE	3
>> +#define RCU_NOCB_WAKE_LAZY	2
>> +#define RCU_NOCB_WAKE		3
>> +#define RCU_NOCB_WAKE_FORCE	4
>>  
>>  #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
>>  					/* For jiffies_till_first_fqs and */
>> @@ -440,9 +444,10 @@ static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
>>  static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
>>  static void rcu_init_one_nocb(struct rcu_node *rnp);
>>  static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>> -				  unsigned long j);
>> +				  unsigned long j, bool lazy, bool wakegp);
>>  static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>> -				bool *was_alldone, unsigned long flags);
>> +				bool *was_alldone, unsigned long flags,
>> +				bool lazy);
>>  static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
>>  				 unsigned long flags);
>>  static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
>> diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
>> index 31068dd31315..7e97a7b6e046 100644
>> --- a/kernel/rcu/tree_nocb.h
>> +++ b/kernel/rcu/tree_nocb.h
>> @@ -256,6 +256,31 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
>>  	return __wake_nocb_gp(rdp_gp, rdp, force, flags);
>>  }
>>  
>> +/*
>> + * LAZY_FLUSH_JIFFIES decides the maximum amount of time that
>> + * can elapse before lazy callbacks are flushed. Lazy callbacks
>> + * could be flushed much earlier for a number of other reasons
>> + * however, LAZY_FLUSH_JIFFIES will ensure no lazy callbacks are
>> + * left unsubmitted to RCU after those many jiffies.
>> + */
>> +#define LAZY_FLUSH_JIFFIES (10 * HZ)
>> +unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES;
>> +
>> +#ifdef CONFIG_RCU_LAZY
>> +// To be called only from test code.
>> +void rcu_lazy_set_jiffies_till_flush(unsigned long jif)
>> +{
>> +	jiffies_till_flush = jif;
>> +}
>> +EXPORT_SYMBOL(rcu_lazy_set_jiffies_till_flush);
>> +
>> +unsigned long rcu_lazy_get_jiffies_till_flush(void)
>> +{
>> +	return jiffies_till_flush;
>> +}
>> +EXPORT_SYMBOL(rcu_lazy_get_jiffies_till_flush);
>> +#endif
>> +
>>  /*
>>   * Arrange to wake the GP kthread for this NOCB group at some future
>>   * time when it is safe to do so.
>> @@ -265,23 +290,39 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
>>  {
>>  	unsigned long flags;
>>  	struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
>> +	unsigned long mod_jif = 0;
>>  
>>  	raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
>>  
>>  	/*
>> -	 * Bypass wakeup overrides previous deferments. In case
>> -	 * of callback storm, no need to wake up too early.
>> +	 * Bypass and lazy wakeup overrides previous deferments. In case of
>> +	 * callback storm, no need to wake up too early.
>>  	 */
>> -	if (waketype == RCU_NOCB_WAKE_BYPASS) {
>> -		mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
>> -		WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
>> -	} else {
>> +	switch (waketype) {
>> +	case RCU_NOCB_WAKE_LAZY:
>> +		if (rdp->nocb_defer_wakeup != RCU_NOCB_WAKE_LAZY)
>> +			mod_jif = jiffies_till_flush;
>> +		break;
>> +
>> +	case RCU_NOCB_WAKE_BYPASS:
>> +		mod_jif = 2;
>> +		break;
>> +
>> +	case RCU_NOCB_WAKE:
>> +	case RCU_NOCB_WAKE_FORCE:
>> +		// For these, make it wake up the soonest if we
>> +		// were in a bypass or lazy sleep before.
>>  		if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE)
>> -			mod_timer(&rdp_gp->nocb_timer, jiffies + 1);
>> -		if (rdp_gp->nocb_defer_wakeup < waketype)
>> -			WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
>> +			mod_jif = 1;
>> +		break;
>>  	}
>>  
>> +	if (mod_jif)
>> +		mod_timer(&rdp_gp->nocb_timer, jiffies + mod_jif);
>> +
>> +	if (rdp_gp->nocb_defer_wakeup < waketype)
>> +		WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
>> +
>>  	raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
>>  
>>  	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
>> @@ -293,10 +334,13 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
>>   * proves to be initially empty, just return false because the no-CB GP
>>   * kthread may need to be awakened in this case.
>>   *
>> + * Return true if there was something to be flushed and it succeeded, otherwise
>> + * false.
>> + *
>>   * Note that this function always returns true if rhp is NULL.
>>   */
>>  static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>> -				     unsigned long j)
>> +				     unsigned long j, bool lazy)
>>  {
>>  	struct rcu_cblist rcl;
>>  
>> @@ -310,7 +354,18 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>>  	/* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
>>  	if (rhp)
>>  		rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
>> -	rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
>> +
>> +	/*
>> +	 * If the new CB requested was a lazy one, queue it onto the main
>> +	 * ->cblist so we can take advantage of a sooner grace period.
>> +	 */
>> +	if (lazy && rhp) {
>> +		rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, NULL);
>> +		rcu_cblist_enqueue(&rcl, rhp);
>> +	} else {
>> +		rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
>> +	}
>> +
>>  	rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
>>  	WRITE_ONCE(rdp->nocb_bypass_first, j);
>>  	rcu_nocb_bypass_unlock(rdp);
>> @@ -326,13 +381,20 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>>   * Note that this function always returns true if rhp is NULL.
>>   */
>>  static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>> -				  unsigned long j)
>> +				  unsigned long j, bool lazy, bool wake_gp)
>>  {
>> +	bool ret;
>> +
>>  	if (!rcu_rdp_is_offloaded(rdp))
>>  		return true;
>>  	rcu_lockdep_assert_cblist_protected(rdp);
>>  	rcu_nocb_bypass_lock(rdp);
>> -	return rcu_nocb_do_flush_bypass(rdp, rhp, j);
>> +	ret = rcu_nocb_do_flush_bypass(rdp, rhp, j, lazy);
>> +
>> +	if (wake_gp)
>> +		wake_nocb_gp(rdp, true);
>> +
>> +	return ret;
>>  }
>>  
>>  /*
>> @@ -345,7 +407,7 @@ static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
>>  	if (!rcu_rdp_is_offloaded(rdp) ||
>>  	    !rcu_nocb_bypass_trylock(rdp))
>>  		return;
>> -	WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j));
>> +	WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j, false));
>>  }
>>  
>>  /*
>> @@ -367,12 +429,14 @@ static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
>>   * there is only one CPU in operation.
>>   */
>>  static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>> -				bool *was_alldone, unsigned long flags)
>> +				bool *was_alldone, unsigned long flags,
>> +				bool lazy)
>>  {
>>  	unsigned long c;
>>  	unsigned long cur_gp_seq;
>>  	unsigned long j = jiffies;
>>  	long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
>> +	bool bypass_is_lazy = (ncbs == READ_ONCE(rdp->lazy_len));
>>  
>>  	lockdep_assert_irqs_disabled();
>>  
>> @@ -417,23 +481,29 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>>  	// If there hasn't yet been all that many ->cblist enqueues
>>  	// this jiffy, tell the caller to enqueue onto ->cblist.  But flush
>>  	// ->nocb_bypass first.
>> -	if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) {
>> +	// Lazy CBs throttle this back and do immediate bypass queuing.
>> +	if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy && !lazy) {
>>  		rcu_nocb_lock(rdp);
>>  		*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
>>  		if (*was_alldone)
>>  			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
>>  					    TPS("FirstQ"));
>> -		WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j));
>> +
>> +		WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j, false, false));
>> +
>>  		WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
>>  		return false; // Caller must enqueue the callback.
>>  	}
>>  
>>  	// If ->nocb_bypass has been used too long or is too full,
>>  	// flush ->nocb_bypass to ->cblist.
>> -	if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) ||
>> +	if ((ncbs && !bypass_is_lazy && j != READ_ONCE(rdp->nocb_bypass_first)) ||
>> +	    (ncbs &&  bypass_is_lazy &&
>> +		(time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush))) ||
>>  	    ncbs >= qhimark) {
>>  		rcu_nocb_lock(rdp);
>> -		if (!rcu_nocb_flush_bypass(rdp, rhp, j)) {
>> +
>> +		if (!rcu_nocb_flush_bypass(rdp, rhp, j, lazy, false)) {
>>  			*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
>>  			if (*was_alldone)
>>  				trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
>> @@ -460,16 +530,29 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>>  	// We need to use the bypass.
>>  	rcu_nocb_wait_contended(rdp);
>>  	rcu_nocb_bypass_lock(rdp);
>> +
>>  	ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
>>  	rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
>>  	rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
>> +
>> +	if (IS_ENABLED(CONFIG_RCU_LAZY) && lazy)
>> +		WRITE_ONCE(rdp->lazy_len, rdp->lazy_len + 1);
>> +
>>  	if (!ncbs) {
>>  		WRITE_ONCE(rdp->nocb_bypass_first, j);
>>  		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
>>  	}
>> +
>>  	rcu_nocb_bypass_unlock(rdp);
>>  	smp_mb(); /* Order enqueue before wake. */
>> -	if (ncbs) {
>> +
>> +	// We had CBs in the bypass list before. There is nothing else to do if:
>> +	// There were only non-lazy CBs before, in this case, the bypass timer
>> +	// or GP-thread will handle the CBs including any new lazy ones.
>> +	// Or, the new CB is lazy and the old bypass-CBs were also lazy. In this
>> +	// case the old lazy timer would have been setup. When that expires,
>> +	// the new lazy one will be handled.
>> +	if (ncbs && (!bypass_is_lazy || lazy)) {
>>  		local_irq_restore(flags);
>>  	} else {
>>  		// No-CBs GP kthread might be indefinitely asleep, if so, wake.
>> @@ -478,6 +561,10 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>>  			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
>>  					    TPS("FirstBQwake"));
>>  			__call_rcu_nocb_wake(rdp, true, flags);
>> +		} else if (bypass_is_lazy && !lazy) {
>> +			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
>> +					    TPS("FirstBQwakeLazy2Non"));
>> +			__call_rcu_nocb_wake(rdp, true, flags);
>>  		} else {
>>  			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
>>  					    TPS("FirstBQnoWake"));
>> @@ -499,7 +586,7 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
>>  {
>>  	unsigned long cur_gp_seq;
>>  	unsigned long j;
>> -	long len;
>> +	long len, lazy_len, bypass_len;
>>  	struct task_struct *t;
>>  
>>  	// If we are being polled or there is no kthread, just leave.
>> @@ -512,9 +599,16 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
>>  	}
>>  	// Need to actually to a wakeup.
>>  	len = rcu_segcblist_n_cbs(&rdp->cblist);
>> +	bypass_len = rcu_cblist_n_cbs(&rdp->nocb_bypass);
>> +	lazy_len = READ_ONCE(rdp->lazy_len);
>>  	if (was_alldone) {
>>  		rdp->qlen_last_fqs_check = len;
>> -		if (!irqs_disabled_flags(flags)) {
>> +		// Only lazy CBs in bypass list
>> +		if (lazy_len && bypass_len == lazy_len) {
>> +			rcu_nocb_unlock_irqrestore(rdp, flags);
>> +			wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY,
>> +					   TPS("WakeLazy"));
>> +		} else if (!irqs_disabled_flags(flags)) {
>>  			/* ... if queue was empty ... */
>>  			rcu_nocb_unlock_irqrestore(rdp, flags);
>>  			wake_nocb_gp(rdp, false);
>> @@ -604,8 +698,8 @@ static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu)
>>   */
>>  static void nocb_gp_wait(struct rcu_data *my_rdp)
>>  {
>> -	bool bypass = false;
>> -	long bypass_ncbs;
>> +	bool bypass = false, lazy = false;
>> +	long bypass_ncbs, lazy_ncbs;
>>  	int __maybe_unused cpu = my_rdp->cpu;
>>  	unsigned long cur_gp_seq;
>>  	unsigned long flags;
>> @@ -640,24 +734,41 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
>>  	 * won't be ignored for long.
>>  	 */
>>  	list_for_each_entry(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp) {
>> +		bool flush_bypass = false;
>> +
>>  		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
>>  		rcu_nocb_lock_irqsave(rdp, flags);
>>  		lockdep_assert_held(&rdp->nocb_lock);
>>  		bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
>> -		if (bypass_ncbs &&
>> +		lazy_ncbs = READ_ONCE(rdp->lazy_len);
>> +
>> +		if (bypass_ncbs && (lazy_ncbs == bypass_ncbs) &&
>> +		    (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush) ||
>> +		     bypass_ncbs > 2 * qhimark)) {
>> +			flush_bypass = true;
>> +		} else if (bypass_ncbs && (lazy_ncbs != bypass_ncbs) &&
>>  		    (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
>>  		     bypass_ncbs > 2 * qhimark)) {
>> -			// Bypass full or old, so flush it.
>> -			(void)rcu_nocb_try_flush_bypass(rdp, j);
>> -			bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
>> +			flush_bypass = true;
>>  		} else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
>>  			rcu_nocb_unlock_irqrestore(rdp, flags);
>>  			continue; /* No callbacks here, try next. */
>>  		}
>> +
>> +		if (flush_bypass) {
>> +			// Bypass full or old, so flush it.
>> +			(void)rcu_nocb_try_flush_bypass(rdp, j);
>> +			bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
>> +			lazy_ncbs = READ_ONCE(rdp->lazy_len);
>> +		}
>> +
>>  		if (bypass_ncbs) {
>>  			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
>> -					    TPS("Bypass"));
>> -			bypass = true;
>> +				    bypass_ncbs == lazy_ncbs ? TPS("Lazy") : TPS("Bypass"));
>> +			if (bypass_ncbs == lazy_ncbs)
>> +				lazy = true;
>> +			else
>> +				bypass = true;
>>  		}
>>  		rnp = rdp->mynode;
>>  
>> @@ -705,12 +816,21 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
>>  	my_rdp->nocb_gp_gp = needwait_gp;
>>  	my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
>>  
>> -	if (bypass && !rcu_nocb_poll) {
>> -		// At least one child with non-empty ->nocb_bypass, so set
>> -		// timer in order to avoid stranding its callbacks.
>> -		wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
>> -				   TPS("WakeBypassIsDeferred"));
>> +	// At least one child with non-empty ->nocb_bypass, so set
>> +	// timer in order to avoid stranding its callbacks.
>> +	if (!rcu_nocb_poll) {
>> +		// If bypass list only has lazy CBs. Add a deferred
>> +		// lazy wake up.
>> +		if (lazy && !bypass) {
>> +			wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_LAZY,
>> +					TPS("WakeLazyIsDeferred"));
>> +		// Otherwise add a deferred bypass wake up.
>> +		} else if (bypass) {
>> +			wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
>> +					TPS("WakeBypassIsDeferred"));
>> +		}
>>  	}
>> +
>>  	if (rcu_nocb_poll) {
>>  		/* Polling, so trace if first poll in the series. */
>>  		if (gotcbs)
>> @@ -1036,7 +1156,7 @@ static long rcu_nocb_rdp_deoffload(void *arg)
>>  	 * return false, which means that future calls to rcu_nocb_try_bypass()
>>  	 * will refuse to put anything into the bypass.
>>  	 */
>> -	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
>> +	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false, false));
>>  	/*
>>  	 * Start with invoking rcu_core() early. This way if the current thread
>>  	 * happens to preempt an ongoing call to rcu_core() in the middle,
>> @@ -1290,6 +1410,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
>>  	raw_spin_lock_init(&rdp->nocb_gp_lock);
>>  	timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
>>  	rcu_cblist_init(&rdp->nocb_bypass);
>> +	WRITE_ONCE(rdp->lazy_len, 0);
>>  	mutex_init(&rdp->nocb_gp_kthread_mutex);
>>  }
>>  
>> @@ -1571,13 +1692,14 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
>>  }
>>  
>>  static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>> -				  unsigned long j)
>> +				  unsigned long j, bool lazy, bool wakegp)
>>  {
>>  	return true;
>>  }
>>  
>>  static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>> -				bool *was_alldone, unsigned long flags)
>> +				bool *was_alldone, unsigned long flags,
>> +				bool lazy)
>>  {
>>  	return false;
>>  }
>> -- 
>> 2.37.2.789.g6183377224-goog
>>
Joel Fernandes Sept. 3, 2022, 10 p.m. UTC | #5
On Fri, Sep 02, 2022 at 05:21:32PM +0200, Frederic Weisbecker wrote:
[..] 
> > +
> >  	raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
> >  
> >  	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
> [...]
> > @@ -705,12 +816,21 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
> >  	my_rdp->nocb_gp_gp = needwait_gp;
> >  	my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
> >  
> > -	if (bypass && !rcu_nocb_poll) {
> > -		// At least one child with non-empty ->nocb_bypass, so set
> > -		// timer in order to avoid stranding its callbacks.
> > -		wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
> > -				   TPS("WakeBypassIsDeferred"));
> > +	// At least one child with non-empty ->nocb_bypass, so set
> > +	// timer in order to avoid stranding its callbacks.
> > +	if (!rcu_nocb_poll) {
> > +		// If bypass list only has lazy CBs. Add a deferred
> > +		// lazy wake up.
> > +		if (lazy && !bypass) {
> > +			wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_LAZY,
> > +					TPS("WakeLazyIsDeferred"));
> 
> What if:
> 
> 1) rdp(1) has lazy callbacks
> 2) all other rdp's have no callback at all
> 3) nocb_gp_wait() runs through all rdp's, everything is handled, except for
>    these lazy callbacks
> 4) It reaches the above path, ready to arm the RCU_NOCB_WAKE_LAZY timer,
>    but it hasn't yet called wake_nocb_gp_defer()
> 5) Oh but rdp(2) queues a non-lazy callback. interrupts are disabled so it defers
>    the wake up to nocb_gp_wait() with arming the timer in RCU_NOCB_WAKE.
> 6) nocb_gp_wait() finally calls wake_nocb_gp_defer() and override the timeout
>    to several seconds ahead.
> 7) No more callbacks queued, the non-lazy callback will have to wait several
>    seconds to complete.
> 
> Or did I miss something? Note that the race exists with RCU_NOCB_WAKE_BYPASS
> but it's only about one jiffy delay, not seconds.
> 

So I think the patch below should fix that. But I have not tested it at all,
and it could very well have issues. In particular, there is a likelihood of a
wake-up while holding a lock, which I'm not sure is safe given the scheduler
locks. I'll test it next week. Let me know your thoughts, though.

---8<-----------------------

From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Subject: [PATCH] rcu: Fix race where wake_nocb_gp_defer() lazy wake can
 overwrite a non-lazy wake

Fix this by holding nocb_gp_lock while observing the state of all rdps. If any
rdp queued a non-lazy CB, we will do a wake up of the main GP thread.

This should address the race Frederic reported (which can affect both
non-lazy CBs using the bypass list and lazy CBs, though lazy CBs much
more noticeably).

Quoting from Frederic's email:

1) rdp(1) has lazy callbacks
2) all other rdp's have no callback at all
3) nocb_gp_wait() runs through all rdp's, everything is handled, except for
   these lazy callbacks
4) It reaches the above path, ready to arm the RCU_NOCB_WAKE_LAZY timer,
   but it hasn't yet called wake_nocb_gp_defer()
5) Oh but rdp(2) queues a non-lazy callback. interrupts are disabled so it defers
   the wake up to nocb_gp_wait() with arming the timer in RCU_NOCB_WAKE.
6) nocb_gp_wait() finally calls wake_nocb_gp_defer() and override the timeout
   to several seconds ahead.
7) No more callbacks queued, the non-lazy callback will have to wait several
   seconds to complete.

Here, the nocb gp lock is held when #4 happens. So the deferred wakeup
on #5 has to wait till #4 finishes.

Reported-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
 kernel/rcu/tree_nocb.h | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 8b46442e4473..6690ece8fe20 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -285,14 +285,15 @@ EXPORT_SYMBOL(rcu_lazy_get_jiffies_till_flush);
  * Arrange to wake the GP kthread for this NOCB group at some future
  * time when it is safe to do so.
  */
-static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
-			       const char *reason)
+static void __wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
+			       const char *reason, bool locked)
 {
 	unsigned long flags;
 	struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
 	unsigned long mod_jif = 0;
 
-	raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+	if (!locked)
+		raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
 
 	/*
 	 * Bypass and lazy wakeup overrides previous deferments. In case of
@@ -323,11 +324,23 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
 	if (rdp_gp->nocb_defer_wakeup < waketype)
 		WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
 
-	raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+	if (!locked)
+		raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
 
 	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
 }
 
+static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
+		const char *reason) {
+	__wake_nocb_gp_defer(rdp, waketype, reason, false);
+}
+
+
+static void wake_nocb_gp_defer_locked(struct rcu_data *rdp, int waketype,
+		const char *reason) {
+	__wake_nocb_gp_defer(rdp, waketype, reason, true);
+}
+
 /*
  * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
  * However, if there is a callback to be enqueued and if ->nocb_bypass
@@ -754,6 +767,8 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
 	 * is added to the list, so the skipped-over rcu_data structures
 	 * won't be ignored for long.
 	 */
+
+	raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
 	list_for_each_entry_rcu(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp, 1) {
 		bool needwake_state = false;
 		bool flush_bypass = false;
@@ -855,14 +870,15 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
 		// If bypass list only has lazy CBs. Add a deferred
 		// lazy wake up.
 		if (lazy && !bypass) {
-			wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_LAZY,
+			wake_nocb_gp_defer_locked(my_rdp, RCU_NOCB_WAKE_LAZY,
 					TPS("WakeLazyIsDeferred"));
 		// Otherwise add a deferred bypass wake up.
 		} else if (bypass) {
-			wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
+			wake_nocb_gp_defer_locked(my_rdp, RCU_NOCB_WAKE_BYPASS,
 					TPS("WakeBypassIsDeferred"));
 		}
 	}
+	raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
 
 	if (rcu_nocb_poll) {
 		/* Polling, so trace if first poll in the series. */
Frederic Weisbecker Sept. 4, 2022, 9:01 p.m. UTC | #6
On Sat, Sep 03, 2022 at 10:00:29PM +0000, Joel Fernandes wrote:
> On Fri, Sep 02, 2022 at 05:21:32PM +0200, Frederic Weisbecker wrote:
> +
> +	raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);

This holds the lock across the whole group iteration, potentially contending
with call_rcu(), the idle loop, and resume to userspace on every rdp in the group...

How about not overwriting timers, and instead only setting the RCU_NOCB_WAKE_LAZY
timer when the deferral state was previously RCU_NOCB_WAKE_NOT? After all, if the
timer is already armed, it's because we have regular callbacks queued, and thus we
don't need to wait before processing the lazy callbacks since we are going to start
a grace period anyway.
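
[ For illustration, a minimal sketch of that guard, assuming the v5 switch
  structure of wake_nocb_gp_defer() from this patch; this is not posted code,
  and jiffies_till_flush plus the RCU_NOCB_WAKE_* constants are the ones the
  series defines: ]

static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
			       const char *reason)
{
	unsigned long flags;
	struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;

	raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);

	switch (waketype) {
	case RCU_NOCB_WAKE_LAZY:
		// Arm the long lazy timeout only if no wake at all is pending.
		if (rdp_gp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
			mod_timer(&rdp_gp->nocb_timer, jiffies + jiffies_till_flush);
		break;
	case RCU_NOCB_WAKE_BYPASS:
		mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
		break;
	case RCU_NOCB_WAKE:
	case RCU_NOCB_WAKE_FORCE:
		// Wake up soonest if we were in a bypass or lazy sleep before.
		if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE)
			mod_timer(&rdp_gp->nocb_timer, jiffies + 1);
		break;
	}

	if (rdp_gp->nocb_defer_wakeup < waketype)
		WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);

	raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);

	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
}

With that, a pending RCU_NOCB_WAKE (or stronger) deferral is never pushed out
to the multi-second lazy timeout, and no lock needs to be held across the
whole rdp-group walk.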

Thanks.


>  	list_for_each_entry_rcu(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp, 1) {
>  		bool needwake_state = false;
>  		bool flush_bypass = false;
> @@ -855,14 +870,15 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
>  		// If bypass list only has lazy CBs. Add a deferred
>  		// lazy wake up.
>  		if (lazy && !bypass) {
> -			wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_LAZY,
> +			wake_nocb_gp_defer_locked(my_rdp, RCU_NOCB_WAKE_LAZY,
>  					TPS("WakeLazyIsDeferred"));
>  		// Otherwise add a deferred bypass wake up.
>  		} else if (bypass) {
> -			wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
> +			wake_nocb_gp_defer_locked(my_rdp, RCU_NOCB_WAKE_BYPASS,
>  					TPS("WakeBypassIsDeferred"));
>  		}
>  	}
> +	raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
>  
>  	if (rcu_nocb_poll) {
>  		/* Polling, so trace if first poll in the series. */
> -- 
> 2.37.2.789.g6183377224-goog
>
Frederic Weisbecker Sept. 5, 2022, 12:59 p.m. UTC | #7
On Fri, Sep 02, 2022 at 07:09:39PM -0400, Joel Fernandes wrote:
> On 9/2/2022 11:21 AM, Frederic Weisbecker wrote:
> >> @@ -3904,7 +3943,8 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
> >>  	rdp->barrier_head.func = rcu_barrier_callback;
> >>  	debug_rcu_head_queue(&rdp->barrier_head);
> >>  	rcu_nocb_lock(rdp);
> >> -	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
> >> +	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false,
> >> +		     /* wake gp thread */ true));
> > 
> > It's a bad sign when you need to start commenting your boolean parameters :)
> > Perhaps use a single two-bit flag instead of two booleans, for readability?
> 
> That's fair, what do you mean 2-bit flag? Are you saying, we encode the last 2
> parameters to flush bypass in a u*?

Yeah exactly. Such as rcu_nocb_flush_bypass(rdp, NULL, jiffies, FLUSH_LAZY | FLUSH_WAKE)

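[ For illustration, one way the flag-based signature could look; the
  FLUSH_BP_* names below are assumptions for this sketch, not taken from a
  posted patch: ]

#define FLUSH_BP_NONE	0x0
#define FLUSH_BP_LAZY	0x1	/* rhp being enqueued is a lazy callback. */
#define FLUSH_BP_WAKE	0x2	/* Wake the rcuog kthread after flushing. */

static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
				  unsigned long j, unsigned long flush_flags)
{
	bool ret;

	if (!rcu_rdp_is_offloaded(rdp))
		return true;
	rcu_lockdep_assert_cblist_protected(rdp);
	rcu_nocb_bypass_lock(rdp);
	ret = rcu_nocb_do_flush_bypass(rdp, rhp, j,
				       !!(flush_flags & FLUSH_BP_LAZY));

	if (flush_flags & FLUSH_BP_WAKE)
		wake_nocb_gp(rdp, true);

	return ret;
}

A call site such as rcu_barrier_entrain() then reads
rcu_nocb_flush_bypass(rdp, NULL, jiffies, FLUSH_BP_WAKE), which documents
itself without needing comments on boolean arguments.
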
> 
> > Also that's a subtle change which purpose isn't explained. It means that
> > rcu_barrier_entrain() used to wait for the bypass timer in the worst case
> > but now we force rcuog into it immediately. Should that be a separate change?
> 
> It could be split, but it is laziness that amplifies the issue so I thought of
> keeping it in the same patch. I don't mind one way or the other.

Ok then, let's keep it here, but please add a comment explaining the reason
for the force wake.

> >> +	case RCU_NOCB_WAKE_BYPASS:
> >> +		mod_jif = 2;
> >> +		break;
> >> +
> >> +	case RCU_NOCB_WAKE:
> >> +	case RCU_NOCB_WAKE_FORCE:
> >> +		// For these, make it wake up the soonest if we
> >> +		// were in a bypass or lazy sleep before.
> >>  		if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE)
> >> -			mod_timer(&rdp_gp->nocb_timer, jiffies + 1);
> >> -		if (rdp_gp->nocb_defer_wakeup < waketype)
> >> -			WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
> >> +			mod_jif = 1;
> >> +		break;
> >>  	}
> >>  
> >> +	if (mod_jif)
> >> +		mod_timer(&rdp_gp->nocb_timer, jiffies + mod_jif);
> >> +
> >> +	if (rdp_gp->nocb_defer_wakeup < waketype)
> >> +		WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
> > 
> > So RCU_NOCB_WAKE_BYPASS and RCU_NOCB_WAKE_LAZY don't override the timer state
> > anymore? Looks like something is missing.
> 
> My goal was to make sure that NOCB_WAKE_LAZY wake keeps the timer lazy. If I
> don't do this, then when CPU enters idle, it will immediately do a wake up via
> this call:
> 
> 	rcu_nocb_need_deferred_wakeup(rdp_gp, RCU_NOCB_WAKE)

But if the timer is in RCU_NOCB_WAKE_LAZY mode, that shouldn't be a problem.

> 
> That was almost always causing lazy CBs to be non-lazy thus negating all the
> benefits.
> 
> Note that bypass will also have same issue where the bypass CB will not wait for
> intended bypass duration. To make it consistent with lazy, I made bypass also
> not override nocb_defer_wakeup.

I'm surprised because rcu_nocb_flush_deferred_wakeup() should only do the wake up
if the timer is RCU_NOCB_WAKE or RCU_NOCB_WAKE_FORCE. Or is that code buggy
somehow?

Actually, your change modifies the timer delay without changing the timer
mode, which may short-circuit the rcu_nocb_flush_deferred_wakeup() check and
actually make the wakeup happen early upon idle-loop entry.

Or am I missing something?
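
[ For reference, the check being discussed; this paraphrases the helper as it
  exists at the base of this series: ]

/*
 * Is a deferred wakeup of at least the given level pending?  The idle-entry
 * path (rcu_nocb_flush_deferred_wakeup()) asks for RCU_NOCB_WAKE, so a
 * deferral recorded as RCU_NOCB_WAKE_BYPASS or RCU_NOCB_WAKE_LAZY does not
 * by itself trigger a wakeup when the CPU goes idle.
 */
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
{
	return READ_ONCE(rdp->nocb_defer_wakeup) >= level;
}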


> 
> I agree its not pretty, but it works and I could not find any code path where it
> does not work. That said, I am open to ideas for changing this and perhaps some
> of these unneeded delays with bypass CBs can be split into separate patches.
> 
> Regarding your point about nocb_defer_wakeup state diverging from the timer
> programming, that happens anyway here in current code:
> 
>  283        } else {
>  284                if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE)
>  285                        mod_timer(&rdp_gp->nocb_timer, jiffies + 1);
>  286                if (rdp_gp->nocb_defer_wakeup < waketype)
>  287                        WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
>  288        }

How so?

> >> @@ -705,12 +816,21 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
> >>  	my_rdp->nocb_gp_gp = needwait_gp;
> >>  	my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
> >>  
> >> -	if (bypass && !rcu_nocb_poll) {
> >> -		// At least one child with non-empty ->nocb_bypass, so set
> >> -		// timer in order to avoid stranding its callbacks.
> >> -		wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
> >> -				   TPS("WakeBypassIsDeferred"));
> >> +	// At least one child with non-empty ->nocb_bypass, so set
> >> +	// timer in order to avoid stranding its callbacks.
> >> +	if (!rcu_nocb_poll) {
> >> +		// If bypass list only has lazy CBs. Add a deferred
> >> +		// lazy wake up.
> >> +		if (lazy && !bypass) {
> >> +			wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_LAZY,
> >> +					TPS("WakeLazyIsDeferred"));
> > 
> > What if:
> > 
> > 1) rdp(1) has lazy callbacks
> > 2) all other rdp's have no callback at all
> > 3) nocb_gp_wait() runs through all rdp's, everything is handled, except for
> >    these lazy callbacks
> > 4) It reaches the above path, ready to arm the RCU_NOCB_WAKE_LAZY timer,
> >    but it hasn't yet called wake_nocb_gp_defer()
> > 5) Oh but rdp(2) queues a non-lazy callback. interrupts are disabled so it defers
> >    the wake up to nocb_gp_wait() with arming the timer in RCU_NOCB_WAKE.
> > 6) nocb_gp_wait() finally calls wake_nocb_gp_defer() and override the timeout
> >    to several seconds ahead.
> > 7) No more callbacks queued, the non-lazy callback will have to wait several
> >    seconds to complete.
> > 
> > Or did I miss something?
> 
> In theory, I can see this being an issue. In practice, I have not seen it to
> be.

What matters is that the issue looks plausible.

> In my view, the nocb GP thread should not go to sleep in the first place if
> there are any non-bypass CBs being queued. If it does, then that seems an
> optimization-related bug.

Yeah, it's a constraint introduced by the optimized delayed wake up.
But why would RCU_NOCB_WAKE_LAZY need to overwrite RCU_NOCB_WAKE in the first
place?

> 
> That said, we can make wake_nocb_gp_defer() more robust perhaps by making it not
> overwrite the timer if the wake-type requested is weaker than RCU_NOCB_WAKE,
> however that should not cause the going-into-idle issues I pointed. Whether the
> idle time issue will happen, I have no idea. But in theory, will that address
> your concern above?

Yes, but I'm still confused by this idle time issue.

> 
> > Note that the race exists with RCU_NOCB_WAKE_BYPASS
> > but it's only about one jiffy delay, not seconds.
> 
> Well, 2 jiffies. But yeah.
> 
> Thanks, so far I do not see anything that cannot be fixed on top of this patch
> but you raised some good points. Maybe we ought to rewrite the idle path to not
> disturb lazy CBs in a different way, or something (while keeping the timer state
> consistent with the programming of the timer in wake_nocb_gp_defer()).

I'd rather see an updated patch (not the whole patchset, just this one) than
deltas, just to make sure I'm not missing something in the whole picture.

Thanks.
Joel Fernandes Sept. 5, 2022, 8:18 p.m. UTC | #8
Hi Frederic,

On 9/5/2022 8:59 AM, Frederic Weisbecker wrote:
>>> Also that's a subtle change which purpose isn't explained. It means that
>>> rcu_barrier_entrain() used to wait for the bypass timer in the worst case
>>> but now we force rcuog into it immediately. Should that be a separate change?
>> It could be split, but it is laziness that amplifies the issue so I thought of
>> keeping it in the same patch. I don't mind one way or the other.
> Ok then lets keep it here but please add a comment for the reason to
> force wake here.

Ok will do, thanks.

>>>> +	case RCU_NOCB_WAKE_BYPASS:
>>>> +		mod_jif = 2;
>>>> +		break;
>>>> +
>>>> +	case RCU_NOCB_WAKE:
>>>> +	case RCU_NOCB_WAKE_FORCE:
>>>> +		// For these, make it wake up the soonest if we
>>>> +		// were in a bypass or lazy sleep before.
>>>>  		if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE)
>>>> -			mod_timer(&rdp_gp->nocb_timer, jiffies + 1);
>>>> -		if (rdp_gp->nocb_defer_wakeup < waketype)
>>>> -			WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
>>>> +			mod_jif = 1;
>>>> +		break;
>>>>  	}
>>>>  
>>>> +	if (mod_jif)
>>>> +		mod_timer(&rdp_gp->nocb_timer, jiffies + mod_jif);
>>>> +
>>>> +	if (rdp_gp->nocb_defer_wakeup < waketype)
>>>> +		WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
>>> So RCU_NOCB_WAKE_BYPASS and RCU_NOCB_WAKE_LAZY don't override the timer state
>>> anymore? Looks like something is missing.
>> My goal was to make sure that NOCB_WAKE_LAZY wake keeps the timer lazy. If I
>> don't do this, then when CPU enters idle, it will immediately do a wake up via
>> this call:
>>
>> 	rcu_nocb_need_deferred_wakeup(rdp_gp, RCU_NOCB_WAKE)
> But if the timer is in RCU_NOCB_WAKE_LAZY mode, that shouldn't be a problem.
> 
>> That was almost always causing lazy CBs to be non-lazy thus negating all the
>> benefits.
>>
>> Note that bypass will also have same issue where the bypass CB will not wait for
>> intended bypass duration. To make it consistent with lazy, I made bypass also
>> not override nocb_defer_wakeup.
> I'm surprised because rcu_nocb_flush_deferred_wakeup() should only do the wake up
> if the timer is RCU_NOCB_WAKE or RCU_NOCB_WAKE_FORCE. Or is that code buggy
> somehow?
> Actually your change is modifying the timer delay without changing the timer
> mode, which may shortcut rcu_nocb_flush_deferred_wakeup() check and actually
> make it perform early upon idle loop entry.
> 
> Or am I missing something?
> 

You could very well have a point, and I am not sure now (I happened to 'forget'
the issue once the code was working). I distinctly remember not being able to
keep callbacks lazy without doing this. Maybe there is some other path. I am
kicking myself for not commenting enough in the code or changelog about the issue.

I will test again sync'ing the lazy timer and the ->nocb_defer_wakeup field
properly and see if I can trigger the issue.

Thanks!

 - Joel
Joel Fernandes Sept. 5, 2022, 8:20 p.m. UTC | #9
Hi Frederic,

On 9/4/2022 5:01 PM, Frederic Weisbecker wrote:
> On Sat, Sep 03, 2022 at 10:00:29PM +0000, Joel Fernandes wrote:
>> On Fri, Sep 02, 2022 at 05:21:32PM +0200, Frederic Weisbecker wrote:
>> +
>> +	raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
> 
> This is locking during the whole group iteration potentially contending call_rcu(),
> idle loop, resume to userspace on all rdp in the group...
> 
> How about not overwriting timers instead and only set the RCU_NOCB_WAKE_LAZY
> timer when it is previously in RCU_NOCB_WAKE_NOT state? After all if the timer is armed,
> it's because we have regular callbacks queued and thus we don't need to wait
> before processing the lazy callbacks since we are going to start a grace period
> anyway.
> 
> Thanks.

I like your idea better. I think it will work well to resolve the race you
described.

Thanks!

 - Joel


> 
> 
>>  	list_for_each_entry_rcu(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp, 1) {
>>  		bool needwake_state = false;
>>  		bool flush_bypass = false;
>> @@ -855,14 +870,15 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
>>  		// If bypass list only has lazy CBs. Add a deferred
>>  		// lazy wake up.
>>  		if (lazy && !bypass) {
>> -			wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_LAZY,
>> +			wake_nocb_gp_defer_locked(my_rdp, RCU_NOCB_WAKE_LAZY,
>>  					TPS("WakeLazyIsDeferred"));
>>  		// Otherwise add a deferred bypass wake up.
>>  		} else if (bypass) {
>> -			wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
>> +			wake_nocb_gp_defer_locked(my_rdp, RCU_NOCB_WAKE_BYPASS,
>>  					TPS("WakeBypassIsDeferred"));
>>  		}
>>  	}
>> +	raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
>>  
>>  	if (rcu_nocb_poll) {
>>  		/* Polling, so trace if first poll in the series. */
>> -- 
>> 2.37.2.789.g6183377224-goog
>>
Joel Fernandes Sept. 5, 2022, 8:32 p.m. UTC | #10
On 9/5/2022 8:59 AM, Frederic Weisbecker wrote:
> I'd rather see an updated patch (not the whole patchset but just this one) rather
> than deltas, just to make sure I'm not missing something in the whole picture.
> 
> Thanks.

There is also the previous patch, which needs the fix-up you suggested.

I will reply to the individual patches with in-line patches (for you), as well as
refresh the whole series, since it might be easier for Paul to apply them together.

thanks,

- Joel
Joel Fernandes Sept. 6, 2022, 3:05 a.m. UTC | #11
On Fri, Sep 02, 2022 at 05:21:32PM +0200, Frederic Weisbecker wrote:
> On Thu, Sep 01, 2022 at 10:17:08PM +0000, Joel Fernandes (Google) wrote:
> > Implement timer-based RCU lazy callback batching. The batch is flushed
> > whenever a certain amount of time has passed, or the batch on a
> > particular CPU grows too big. Also memory pressure will flush it in a
> > future patch.
> > 
> > To handle several corner cases automagically (such as rcu_barrier() and
> > hotplug), we re-use bypass lists to handle lazy CBs. The bypass list
> > length has the lazy CB length included in it. A separate lazy CB length
> > counter is also introduced to keep track of the number of lazy CBs.
> > 
> > Suggested-by: Paul McKenney <paulmck@kernel.org>
> > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > ---

Here is the updated version of this patch for further testing and review.
Paul, you could consider updating your test branch. I have tested it on
ChromeOS as well as with rcuscale. The laziness and boot times are looking good.
There was at least one bug that I fixed, which was introduced by moving the
length field to rcu_data. Thanks a lot, Frederic, for the review comments.
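
[ For context, the shape of that fix, sketched under the assumption that the
  reset lives in the bypass-flush path; the actual hunk would be in the
  tree_nocb.h changes of the patch below: ]

	/* Inside rcu_nocb_do_flush_bypass(), after draining ->nocb_bypass: */
	if (lazy && rhp) {
		/* Lazy CBs throttle; enqueue rhp behind the flushed batch. */
		rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, NULL);
		rcu_cblist_enqueue(&rcl, rhp);
	} else {
		rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
	}
	/* The bypass list is now empty, so its lazy count must be zero too. */
	WRITE_ONCE(rdp->lazy_len, 0);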

I will look at the rcu torture issue next... I suspect the length field issue
may have been causing it.

---8<-----------------------

From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Subject: [PATCH v6] rcu: Introduce call_rcu_lazy() API implementation

Implement timer-based RCU lazy callback batching. The batch is flushed
whenever a certain amount of time has passed, or the batch on a
particular CPU grows too big. Also memory pressure will flush it in a
future patch.

To handle several corner cases automagically (such as rcu_barrier() and
hotplug), we re-use bypass lists to handle lazy CBs. The bypass list
length has the lazy CB length included in it. A separate lazy CB length
counter is also introduced to keep track of the number of lazy CBs.

v5->v6:

[ Frederic Weisbecker: Program the lazy timer only if WAKE_NOT, since other
  deferral levels wake much earlier, so for those it is not needed. ]

[ Frederic Weisbecker: Use flush flags to keep the bypass API code clean. ]

[ Joel: Fix issue where I was not resetting lazy_len after moving it to rdp ]

Suggested-by: Paul McKenney <paulmck@kernel.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
 include/linux/rcupdate.h |   6 ++
 kernel/rcu/Kconfig       |   8 ++
 kernel/rcu/rcu.h         |  11 +++
 kernel/rcu/tree.c        | 133 +++++++++++++++++++----------
 kernel/rcu/tree.h        |  17 +++-
 kernel/rcu/tree_nocb.h   | 175 ++++++++++++++++++++++++++++++++-------
 6 files changed, 269 insertions(+), 81 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 08605ce7379d..82e8a07e0856 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -108,6 +108,12 @@ static inline int rcu_preempt_depth(void)
 
 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
 
+#ifdef CONFIG_RCU_LAZY
+void call_rcu_lazy(struct rcu_head *head, rcu_callback_t func);
+#else
+#define call_rcu_lazy(head, func) call_rcu(head, func)
+#endif
+
 /* Internal to kernel */
 void rcu_init(void);
 extern int rcu_scheduler_active;
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index d471d22a5e21..3128d01427cb 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -311,4 +311,12 @@ config TASKS_TRACE_RCU_READ_MB
 	  Say N here if you hate read-side memory barriers.
 	  Take the default if you are unsure.
 
+config RCU_LAZY
+	bool "RCU callback lazy invocation functionality"
+	depends on RCU_NOCB_CPU
+	default n
+	help
+	  To save power, batch RCU callbacks and flush after delay, memory
+	  pressure or callback list growing too big.
+
 endmenu # "RCU Subsystem"
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index be5979da07f5..94675f14efe8 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -474,6 +474,14 @@ enum rcutorture_type {
 	INVALID_RCU_FLAVOR
 };
 
+#if defined(CONFIG_RCU_LAZY)
+unsigned long rcu_lazy_get_jiffies_till_flush(void);
+void rcu_lazy_set_jiffies_till_flush(unsigned long j);
+#else
+static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; }
+static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { }
+#endif
+
 #if defined(CONFIG_TREE_RCU)
 void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
 			    unsigned long *gp_seq);
@@ -483,6 +491,8 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
 			       unsigned long c_old,
 			       unsigned long c);
 void rcu_gp_set_torture_wait(int duration);
+void rcu_force_call_rcu_to_lazy(bool force);
+
 #else
 static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
 					  int *flags, unsigned long *gp_seq)
@@ -501,6 +511,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
 	do { } while (0)
 #endif
 static inline void rcu_gp_set_torture_wait(int duration) { }
+static inline void rcu_force_call_rcu_to_lazy(bool force) { }
 #endif
 
 #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9fe581be8696..dbd25b8c080e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2728,47 +2728,8 @@ static void check_cb_ovld(struct rcu_data *rdp)
 	raw_spin_unlock_rcu_node(rnp);
 }
 
-/**
- * call_rcu() - Queue an RCU callback for invocation after a grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual callback function to be invoked after the grace period
- *
- * The callback function will be invoked some time after a full grace
- * period elapses, in other words after all pre-existing RCU read-side
- * critical sections have completed.  However, the callback function
- * might well execute concurrently with RCU read-side critical sections
- * that started after call_rcu() was invoked.
- *
- * RCU read-side critical sections are delimited by rcu_read_lock()
- * and rcu_read_unlock(), and may be nested.  In addition, but only in
- * v5.0 and later, regions of code across which interrupts, preemption,
- * or softirqs have been disabled also serve as RCU read-side critical
- * sections.  This includes hardware interrupt handlers, softirq handlers,
- * and NMI handlers.
- *
- * Note that all CPUs must agree that the grace period extended beyond
- * all pre-existing RCU read-side critical section.  On systems with more
- * than one CPU, this means that when "func()" is invoked, each CPU is
- * guaranteed to have executed a full memory barrier since the end of its
- * last RCU read-side critical section whose beginning preceded the call
- * to call_rcu().  It also means that each CPU executing an RCU read-side
- * critical section that continues beyond the start of "func()" must have
- * executed a memory barrier after the call_rcu() but before the beginning
- * of that RCU read-side critical section.  Note that these guarantees
- * include CPUs that are offline, idle, or executing in user mode, as
- * well as CPUs that are executing in the kernel.
- *
- * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
- * resulting RCU callback function "func()", then both CPU A and CPU B are
- * guaranteed to execute a full memory barrier during the time interval
- * between the call to call_rcu() and the invocation of "func()" -- even
- * if CPU A and CPU B are the same CPU (but again only if the system has
- * more than one CPU).
- *
- * Implementation of these memory-ordering guarantees is described here:
- * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
- */
-void call_rcu(struct rcu_head *head, rcu_callback_t func)
+static void
+__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy)
 {
 	static atomic_t doublefrees;
 	unsigned long flags;
@@ -2818,7 +2779,7 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
 		trace_rcu_callback(rcu_state.name, head,
 				   rcu_segcblist_n_cbs(&rdp->cblist));
 
-	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
+	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy))
 		return; // Enqueued onto ->nocb_bypass, so just leave.
 	// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
 	rcu_segcblist_enqueue(&rdp->cblist, head);
@@ -2833,8 +2794,86 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
 		local_irq_restore(flags);
 	}
 }
-EXPORT_SYMBOL_GPL(call_rcu);
 
+#ifdef CONFIG_RCU_LAZY
+/**
+ * call_rcu_lazy() - Lazily queue RCU callback for invocation after grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all pre-existing RCU read-side
+ * critical sections have completed.
+ *
+ * Use this API instead of call_rcu() if you don't mind the callback being
+ * invoked after very long periods of time on systems without memory pressure
+ * and on systems which are lightly loaded or mostly idle.
+ *
+ * Other than the extra delay in callbacks being invoked, this function is
+ * identical to, and reuses call_rcu()'s logic. Refer to call_rcu() for more
+ * details about memory ordering and other functionality.
+ */
+void call_rcu_lazy(struct rcu_head *head, rcu_callback_t func)
+{
+	return __call_rcu_common(head, func, true);
+}
+EXPORT_SYMBOL_GPL(call_rcu_lazy);
+#endif
+
+static bool force_call_rcu_to_lazy;
+
+void rcu_force_call_rcu_to_lazy(bool force)
+{
+	if (IS_ENABLED(CONFIG_RCU_SCALE_TEST))
+		WRITE_ONCE(force_call_rcu_to_lazy, force);
+}
+EXPORT_SYMBOL_GPL(rcu_force_call_rcu_to_lazy);
+
+/**
+ * call_rcu() - Queue an RCU callback for invocation after a grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all pre-existing RCU read-side
+ * critical sections have completed.  However, the callback function
+ * might well execute concurrently with RCU read-side critical sections
+ * that started after call_rcu() was invoked.
+ *
+ * RCU read-side critical sections are delimited by rcu_read_lock()
+ * and rcu_read_unlock(), and may be nested.  In addition, but only in
+ * v5.0 and later, regions of code across which interrupts, preemption,
+ * or softirqs have been disabled also serve as RCU read-side critical
+ * sections.  This includes hardware interrupt handlers, softirq handlers,
+ * and NMI handlers.
+ *
+ * Note that all CPUs must agree that the grace period extended beyond
+ * all pre-existing RCU read-side critical section.  On systems with more
+ * than one CPU, this means that when "func()" is invoked, each CPU is
+ * guaranteed to have executed a full memory barrier since the end of its
+ * last RCU read-side critical section whose beginning preceded the call
+ * to call_rcu().  It also means that each CPU executing an RCU read-side
+ * critical section that continues beyond the start of "func()" must have
+ * executed a memory barrier after the call_rcu() but before the beginning
+ * of that RCU read-side critical section.  Note that these guarantees
+ * include CPUs that are offline, idle, or executing in user mode, as
+ * well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
+ * resulting RCU callback function "func()", then both CPU A and CPU B are
+ * guaranteed to execute a full memory barrier during the time interval
+ * between the call to call_rcu() and the invocation of "func()" -- even
+ * if CPU A and CPU B are the same CPU (but again only if the system has
+ * more than one CPU).
+ *
+ * Implementation of these memory-ordering guarantees is described here:
+ * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
+ */
+void call_rcu(struct rcu_head *head, rcu_callback_t func)
+{
+	return __call_rcu_common(head, func, force_call_rcu_to_lazy);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
 
 /* Maximum number of jiffies to wait before draining a batch. */
 #define KFREE_DRAIN_JIFFIES (5 * HZ)
@@ -3904,7 +3943,11 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
 	rdp->barrier_head.func = rcu_barrier_callback;
 	debug_rcu_head_queue(&rdp->barrier_head);
 	rcu_nocb_lock(rdp);
-	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
+        /*
+         * Flush the bypass list, but also wake up the GP thread as otherwise
+         * bypass/lazy CBs may not be noticed, and can cause really long delays!
+         */
+	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, FLUSH_BP_WAKE));
 	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
 		atomic_inc(&rcu_state.barrier_cpu_count);
 	} else {
@@ -4325,7 +4368,7 @@ void rcutree_migrate_callbacks(int cpu)
 	my_rdp = this_cpu_ptr(&rcu_data);
 	my_rnp = my_rdp->mynode;
 	rcu_nocb_lock(my_rdp); /* irqs already disabled. */
-	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
+	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies, FLUSH_BP_NONE));
 	raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
 	/* Leverage recent GPs and set GP for new callbacks. */
 	needwake = rcu_advance_cbs(my_rnp, rdp) ||
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index d4a97e40ea9c..361c41d642c7 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -263,14 +263,16 @@ struct rcu_data {
 	unsigned long last_fqs_resched;	/* Time of last rcu_resched(). */
 	unsigned long last_sched_clock;	/* Jiffies of last rcu_sched_clock_irq(). */
 
+	long lazy_len;			/* Length of buffered lazy callbacks. */
 	int cpu;
 };
 
 /* Values for nocb_defer_wakeup field in struct rcu_data. */
 #define RCU_NOCB_WAKE_NOT	0
 #define RCU_NOCB_WAKE_BYPASS	1
-#define RCU_NOCB_WAKE		2
-#define RCU_NOCB_WAKE_FORCE	3
+#define RCU_NOCB_WAKE_LAZY	2
+#define RCU_NOCB_WAKE		3
+#define RCU_NOCB_WAKE_FORCE	4
 
 #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
 					/* For jiffies_till_first_fqs and */
@@ -439,10 +441,17 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
 static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
 static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
 static void rcu_init_one_nocb(struct rcu_node *rnp);
+
+#define FLUSH_BP_NONE 0
+/* Is the CB being enqueued after the flush, a lazy CB? */
+#define FLUSH_BP_LAZY BIT(0)
+/* Wake up nocb-GP thread after flush? */
+#define FLUSH_BP_WAKE BIT(1)
 static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-				  unsigned long j);
+				  unsigned long j, unsigned long flush_flags);
 static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-				bool *was_alldone, unsigned long flags);
+				bool *was_alldone, unsigned long flags,
+				bool lazy);
 static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
 				 unsigned long flags);
 static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 4dc86274b3e8..b201606f7c4f 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -256,6 +256,31 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
 	return __wake_nocb_gp(rdp_gp, rdp, force, flags);
 }
 
+/*
+ * LAZY_FLUSH_JIFFIES decides the maximum amount of time that
+ * can elapse before lazy callbacks are flushed. Lazy callbacks
+ * could be flushed much earlier for a number of other reasons;
+ * however, LAZY_FLUSH_JIFFIES ensures that no lazy callbacks are
+ * left unsubmitted to RCU for longer than that many jiffies.
+ */
+#define LAZY_FLUSH_JIFFIES (10 * HZ)
+unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES;
+
+#ifdef CONFIG_RCU_LAZY
+// To be called only from test code.
+void rcu_lazy_set_jiffies_till_flush(unsigned long jif)
+{
+	jiffies_till_flush = jif;
+}
+EXPORT_SYMBOL(rcu_lazy_set_jiffies_till_flush);
+
+unsigned long rcu_lazy_get_jiffies_till_flush(void)
+{
+	return jiffies_till_flush;
+}
+EXPORT_SYMBOL(rcu_lazy_get_jiffies_till_flush);
+#endif
+
 /*
  * Arrange to wake the GP kthread for this NOCB group at some future
  * time when it is safe to do so.
@@ -269,10 +294,14 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
 	raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
 
 	/*
-	 * Bypass wakeup overrides previous deferments. In case
-	 * of callback storm, no need to wake up too early.
+	 * Bypass wakeup overrides previous deferments. In case of
+	 * callback storm, no need to wake up too early.
 	 */
-	if (waketype == RCU_NOCB_WAKE_BYPASS) {
+	if (waketype == RCU_NOCB_WAKE_LAZY
+		&& READ_ONCE(rdp->nocb_defer_wakeup) == RCU_NOCB_WAKE_NOT) {
+		mod_timer(&rdp_gp->nocb_timer, jiffies + jiffies_till_flush);
+		WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+	} else if (waketype == RCU_NOCB_WAKE_BYPASS) {
 		mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
 		WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
 	} else {
@@ -293,12 +322,16 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
  * proves to be initially empty, just return false because the no-CB GP
  * kthread may need to be awakened in this case.
  *
+ * Return true if there was something to be flushed and it succeeded, otherwise
+ * false.
+ *
  * Note that this function always returns true if rhp is NULL.
  */
 static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-				     unsigned long j)
+				     unsigned long j, unsigned long flush_flags)
 {
 	struct rcu_cblist rcl;
+	bool lazy = flush_flags & FLUSH_BP_LAZY;
 
 	WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp));
 	rcu_lockdep_assert_cblist_protected(rdp);
@@ -310,7 +343,20 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 	/* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
 	if (rhp)
 		rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
-	rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
+
+	/*
+	 * If the new CB requested was a lazy one, queue it onto the main
+	 * ->cblist so we can take advantage of an earlier grace period.
+	 */
+	if (lazy && rhp) {
+		rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, NULL);
+		rcu_cblist_enqueue(&rcl, rhp);
+		WRITE_ONCE(rdp->lazy_len, 0);
+	} else {
+		rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
+		WRITE_ONCE(rdp->lazy_len, 0);
+	}
+
 	rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
 	WRITE_ONCE(rdp->nocb_bypass_first, j);
 	rcu_nocb_bypass_unlock(rdp);
@@ -326,13 +372,20 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
  * Note that this function always returns true if rhp is NULL.
  */
 static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-				  unsigned long j)
+				  unsigned long j, unsigned long flush_flags)
 {
+	bool ret;
+
 	if (!rcu_rdp_is_offloaded(rdp))
 		return true;
 	rcu_lockdep_assert_cblist_protected(rdp);
 	rcu_nocb_bypass_lock(rdp);
-	return rcu_nocb_do_flush_bypass(rdp, rhp, j);
+	ret = rcu_nocb_do_flush_bypass(rdp, rhp, j, flush_flags);
+
+	if (flush_flags & FLUSH_BP_WAKE)
+		wake_nocb_gp(rdp, true);
+
+	return ret;
 }
 
 /*
@@ -345,7 +398,7 @@ static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
 	if (!rcu_rdp_is_offloaded(rdp) ||
 	    !rcu_nocb_bypass_trylock(rdp))
 		return;
-	WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j));
+	WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j, FLUSH_BP_NONE));
 }
 
 /*
@@ -367,12 +420,14 @@ static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
  * there is only one CPU in operation.
  */
 static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-				bool *was_alldone, unsigned long flags)
+				bool *was_alldone, unsigned long flags,
+				bool lazy)
 {
 	unsigned long c;
 	unsigned long cur_gp_seq;
 	unsigned long j = jiffies;
 	long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+	bool bypass_is_lazy = (ncbs == READ_ONCE(rdp->lazy_len));
 
 	lockdep_assert_irqs_disabled();
 
@@ -417,25 +472,30 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 	// If there hasn't yet been all that many ->cblist enqueues
 	// this jiffy, tell the caller to enqueue onto ->cblist.  But flush
 	// ->nocb_bypass first.
-	if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) {
+	// Lazy CBs throttle this back and do immediate bypass queuing.
+	if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy && !lazy) {
 		rcu_nocb_lock(rdp);
 		*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
 		if (*was_alldone)
 			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
 					    TPS("FirstQ"));
-		WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j));
+
+		WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j, FLUSH_BP_NONE));
 		WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
 		return false; // Caller must enqueue the callback.
 	}
 
 	// If ->nocb_bypass has been used too long or is too full,
 	// flush ->nocb_bypass to ->cblist.
-	if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) ||
+	if ((ncbs && !bypass_is_lazy && j != READ_ONCE(rdp->nocb_bypass_first)) ||
+	    (ncbs &&  bypass_is_lazy &&
+		(time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush))) ||
 	    ncbs >= qhimark) {
 		rcu_nocb_lock(rdp);
 		*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
 
-		if (!rcu_nocb_flush_bypass(rdp, rhp, j)) {
+		if (!rcu_nocb_flush_bypass(rdp, rhp, j,
+					   lazy ? FLUSH_BP_LAZY : FLUSH_BP_NONE)) {
 			if (*was_alldone)
 				trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
 						    TPS("FirstQ"));
@@ -461,16 +521,29 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 	// We need to use the bypass.
 	rcu_nocb_wait_contended(rdp);
 	rcu_nocb_bypass_lock(rdp);
+
 	ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
 	rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
 	rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
+
+	if (IS_ENABLED(CONFIG_RCU_LAZY) && lazy)
+		WRITE_ONCE(rdp->lazy_len, rdp->lazy_len + 1);
+
 	if (!ncbs) {
 		WRITE_ONCE(rdp->nocb_bypass_first, j);
 		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
 	}
+
 	rcu_nocb_bypass_unlock(rdp);
 	smp_mb(); /* Order enqueue before wake. */
-	if (ncbs) {
+
+	// We had CBs in the bypass list before. There is nothing else to do if:
+	// There were only non-lazy CBs before, in this case, the bypass timer
+	// or GP-thread will handle the CBs including any new lazy ones.
+	// Or, the new CB is lazy and the old bypass-CBs were also lazy. In this
+	// case the old lazy timer would have been setup. When that expires,
+	// the new lazy one will be handled.
+	if (ncbs && (!bypass_is_lazy || lazy)) {
 		local_irq_restore(flags);
 	} else {
 		// No-CBs GP kthread might be indefinitely asleep, if so, wake.
@@ -479,6 +552,10 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
 					    TPS("FirstBQwake"));
 			__call_rcu_nocb_wake(rdp, true, flags);
+		} else if (bypass_is_lazy && !lazy) {
+			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+					    TPS("FirstBQwakeLazy2Non"));
+			__call_rcu_nocb_wake(rdp, true, flags);
 		} else {
 			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
 					    TPS("FirstBQnoWake"));
@@ -500,7 +577,7 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
 {
 	unsigned long cur_gp_seq;
 	unsigned long j;
-	long len;
+	long len, lazy_len, bypass_len;
 	struct task_struct *t;
 
 	// If we are being polled or there is no kthread, just leave.
@@ -513,9 +590,16 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
 	}
 	// Need to actually do a wakeup.
 	len = rcu_segcblist_n_cbs(&rdp->cblist);
+	bypass_len = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+	lazy_len = READ_ONCE(rdp->lazy_len);
 	if (was_alldone) {
 		rdp->qlen_last_fqs_check = len;
-		if (!irqs_disabled_flags(flags)) {
+		// Only lazy CBs in bypass list
+		if (lazy_len && bypass_len == lazy_len) {
+			rcu_nocb_unlock_irqrestore(rdp, flags);
+			wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY,
+					   TPS("WakeLazy"));
+		} else if (!irqs_disabled_flags(flags)) {
 			/* ... if queue was empty ... */
 			rcu_nocb_unlock_irqrestore(rdp, flags);
 			wake_nocb_gp(rdp, false);
@@ -605,8 +689,8 @@ static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu)
  */
 static void nocb_gp_wait(struct rcu_data *my_rdp)
 {
-	bool bypass = false;
-	long bypass_ncbs;
+	bool bypass = false, lazy = false;
+	long bypass_ncbs, lazy_ncbs;
 	int __maybe_unused cpu = my_rdp->cpu;
 	unsigned long cur_gp_seq;
 	unsigned long flags;
@@ -641,24 +725,41 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
 	 * won't be ignored for long.
 	 */
 	list_for_each_entry(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp) {
+		bool flush_bypass = false;
+
 		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
 		rcu_nocb_lock_irqsave(rdp, flags);
 		lockdep_assert_held(&rdp->nocb_lock);
 		bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
-		if (bypass_ncbs &&
+		lazy_ncbs = READ_ONCE(rdp->lazy_len);
+
+		if (bypass_ncbs && (lazy_ncbs == bypass_ncbs) &&
+		    (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush) ||
+		     bypass_ncbs > 2 * qhimark)) {
+			flush_bypass = true;
+		} else if (bypass_ncbs && (lazy_ncbs != bypass_ncbs) &&
 		    (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
 		     bypass_ncbs > 2 * qhimark)) {
-			// Bypass full or old, so flush it.
-			(void)rcu_nocb_try_flush_bypass(rdp, j);
-			bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+			flush_bypass = true;
 		} else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
 			rcu_nocb_unlock_irqrestore(rdp, flags);
 			continue; /* No callbacks here, try next. */
 		}
+
+		if (flush_bypass) {
+			// Bypass full or old, so flush it.
+			(void)rcu_nocb_try_flush_bypass(rdp, j);
+			bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+			lazy_ncbs = READ_ONCE(rdp->lazy_len);
+		}
+
 		if (bypass_ncbs) {
 			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
-					    TPS("Bypass"));
-			bypass = true;
+				    bypass_ncbs == lazy_ncbs ? TPS("Lazy") : TPS("Bypass"));
+			if (bypass_ncbs == lazy_ncbs)
+				lazy = true;
+			else
+				bypass = true;
 		}
 		rnp = rdp->mynode;
 
@@ -706,12 +807,21 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
 	my_rdp->nocb_gp_gp = needwait_gp;
 	my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
 
-	if (bypass && !rcu_nocb_poll) {
-		// At least one child with non-empty ->nocb_bypass, so set
-		// timer in order to avoid stranding its callbacks.
-		wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
-				   TPS("WakeBypassIsDeferred"));
+	// At least one child with non-empty ->nocb_bypass, so set
+	// timer in order to avoid stranding its callbacks.
+	if (!rcu_nocb_poll) {
+		// If the bypass list only has lazy CBs, add a deferred
+		// lazy wake up.
+		if (lazy && !bypass) {
+			wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_LAZY,
+					TPS("WakeLazyIsDeferred"));
+		// Otherwise add a deferred bypass wake up.
+		} else if (bypass) {
+			wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
+					TPS("WakeBypassIsDeferred"));
+		}
 	}
+
 	if (rcu_nocb_poll) {
 		/* Polling, so trace if first poll in the series. */
 		if (gotcbs)
@@ -1037,7 +1147,7 @@ static long rcu_nocb_rdp_deoffload(void *arg)
 	 * return false, which means that future calls to rcu_nocb_try_bypass()
 	 * will refuse to put anything into the bypass.
 	 */
-	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
+	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, FLUSH_BP_NONE));
 	/*
 	 * Start with invoking rcu_core() early. This way if the current thread
 	 * happens to preempt an ongoing call to rcu_core() in the middle,
@@ -1291,6 +1401,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
 	raw_spin_lock_init(&rdp->nocb_gp_lock);
 	timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
 	rcu_cblist_init(&rdp->nocb_bypass);
+	WRITE_ONCE(rdp->lazy_len, 0);
 	mutex_init(&rdp->nocb_gp_kthread_mutex);
 }
 
@@ -1572,13 +1683,13 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
 }
 
 static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-				  unsigned long j)
+				  unsigned long j, unsigned long flush_flags)
 {
 	return true;
 }
 
 static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-				bool *was_alldone, unsigned long flags)
+				bool *was_alldone, unsigned long flags, bool lazy)
 {
 	return false;
 }
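
For illustration, here is a minimal usage sketch of the new API as documented in the
patch above. The foo_entry structure and its helpers are hypothetical, invented for
this example; only call_rcu_lazy(), struct rcu_head, container_of() and kfree() come
from the kernel itself.

/* Hypothetical cache entry whose freeing is not latency sensitive. */
struct foo_entry {
	struct rcu_head rhp;
	unsigned long key;
};

/* Invoked after a grace period; safe to free the memory now. */
static void foo_entry_free_cb(struct rcu_head *rhp)
{
	struct foo_entry *e = container_of(rhp, struct foo_entry, rhp);

	kfree(e);
}

static void foo_entry_release(struct foo_entry *e)
{
	/*
	 * Freeing can wait: let RCU batch this callback and defer the
	 * grace-period machinery, possibly for up to LAZY_FLUSH_JIFFIES.
	 */
	call_rcu_lazy(&e->rhp, foo_entry_free_cb);
}

Callers whose callbacks are latency sensitive would keep using plain call_rcu(); per
the kernel-doc above, call_rcu_lazy() is only for callbacks that can tolerate very
long delays on lightly loaded or mostly idle systems.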
Paul E. McKenney Sept. 6, 2022, 8:55 a.m. UTC | #12
On Mon, Sep 05, 2022 at 04:32:26PM -0400, Joel Fernandes wrote:
> 
> 
> On 9/5/2022 8:59 AM, Frederic Weisbecker wrote:
> > I'd rather see an updated patch (not the whole patchset but just this one) rather
> > than deltas, just to make sure I'm not missing something in the whole picture.
> > 
> > Thanks.
> 
> There is also the previous patch which needs a fix up you suggested.
> 
> I will reply to individual patches with in-line patch (for you), as well as
> refreshing whole series since it might be easier for Paul to apply them together.

To say nothing of the greater probability that the result will match
your intent.  ;-)

							Thanx, Paul
Frederic Weisbecker Sept. 6, 2022, 3:17 p.m. UTC | #13
On Tue, Sep 06, 2022 at 03:05:46AM +0000, Joel Fernandes wrote:
> diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
> index 4dc86274b3e8..b201606f7c4f 100644
> --- a/kernel/rcu/tree_nocb.h
> +++ b/kernel/rcu/tree_nocb.h
> @@ -256,6 +256,31 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
>  	return __wake_nocb_gp(rdp_gp, rdp, force, flags);
>  }
>  
> +/*
> + * LAZY_FLUSH_JIFFIES decides the maximum amount of time that
> + * can elapse before lazy callbacks are flushed. Lazy callbacks
> + * could be flushed much earlier for a number of other reasons;
> + * however, LAZY_FLUSH_JIFFIES ensures that no lazy callbacks are
> + * left unsubmitted to RCU for longer than that many jiffies.
> + */
> +#define LAZY_FLUSH_JIFFIES (10 * HZ)
> +unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES;

Still not static.

> @@ -293,12 +322,16 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
>   * proves to be initially empty, just return false because the no-CB GP
>   * kthread may need to be awakened in this case.
>   *
> + * Return true if there was something to be flushed and it succeeded, otherwise
> + * false.
> + *

This kind of contradicts the comment that follows. Not sure you need to add
that line because the existing comment seems to cover it.

>   * Note that this function always returns true if rhp is NULL.

>   */
>  static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
> -				     unsigned long j)
> +				     unsigned long j, unsigned long flush_flags)
>  {
>  	struct rcu_cblist rcl;
> +	bool lazy = flush_flags & FLUSH_BP_LAZY;
>  
>  	WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp));
>  	rcu_lockdep_assert_cblist_protected(rdp);
> @@ -326,13 +372,20 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>   * Note that this function always returns true if rhp is NULL.
>   */
>  static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
> -				  unsigned long j)
> +				  unsigned long j, unsigned long flush_flags)
>  {
> +	bool ret;
> +
>  	if (!rcu_rdp_is_offloaded(rdp))
>  		return true;
>  	rcu_lockdep_assert_cblist_protected(rdp);
>  	rcu_nocb_bypass_lock(rdp);
> -	return rcu_nocb_do_flush_bypass(rdp, rhp, j);
> +	ret = rcu_nocb_do_flush_bypass(rdp, rhp, j, flush_flags);
> +
> +	if (flush_flags & FLUSH_BP_WAKE)
> +		wake_nocb_gp(rdp, true);

Why the true above?

Also should we check if the wake up is really necessary (otherwise it means we
force a wake up for all rdp's from rcu_barrier())?

       was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
       ret = rcu_nocb_do_flush_bypass(rdp, rhp, j, flush_flags);
       if (was_alldone && rcu_segcblist_pend_cbs(&rdp->cblist))
       	  wake_nocb_gp(rdp, false);


> @@ -461,16 +521,29 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>  	// We need to use the bypass.
>  	rcu_nocb_wait_contended(rdp);
>  	rcu_nocb_bypass_lock(rdp);
> +
>  	ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
>  	rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
>  	rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
> +
> +	if (IS_ENABLED(CONFIG_RCU_LAZY) && lazy)
> +		WRITE_ONCE(rdp->lazy_len, rdp->lazy_len + 1);
> +
>  	if (!ncbs) {
>  		WRITE_ONCE(rdp->nocb_bypass_first, j);
>  		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
>  	}
> +
>  	rcu_nocb_bypass_unlock(rdp);
>  	smp_mb(); /* Order enqueue before wake. */
> -	if (ncbs) {
> +
> +	// We had CBs in the bypass list before. There is nothing else to do if:
> +	// There were only non-lazy CBs before, in this case, the bypass timer

Kind of misleading. I would replace "There were only non-lazy CBs before" with
"There was at least one non-lazy CBs before".

> +	// or GP-thread will handle the CBs including any new lazy ones.
> +	// Or, the new CB is lazy and the old bypass-CBs were also lazy. In this
> +	// case the old lazy timer would have been setup. When that expires,
> +	// the new lazy one will be handled.
> +	if (ncbs && (!bypass_is_lazy || lazy)) {
>  		local_irq_restore(flags);
>  	} else {
>  		// No-CBs GP kthread might be indefinitely asleep, if so, wake.
> @@ -479,6 +552,10 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>  			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
>  					    TPS("FirstBQwake"));
>  			__call_rcu_nocb_wake(rdp, true, flags);
> +		} else if (bypass_is_lazy && !lazy) {
> +			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
> +					    TPS("FirstBQwakeLazy2Non"));
> +			__call_rcu_nocb_wake(rdp, true, flags);

Not sure we need this chunk. Since there are pending callbacks anyway,
nocb_gp_wait() should be handling them and it will set the appropriate
timer on the next loop.

Thanks.

>  		} else {
>  			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
>  					    TPS("FirstBQnoWake"));
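
For concreteness, here is a sketch of how the check suggested above could be folded
into rcu_nocb_flush_bypass(). The function, the lock helpers and the FLUSH_BP_WAKE
flag are taken from the patch; the exact placement of the was_alldone sampling is an
assumption of this sketch, not something the patch implements.

static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
				  unsigned long j, unsigned long flush_flags)
{
	bool ret;
	bool was_alldone = false;

	if (!rcu_rdp_is_offloaded(rdp))
		return true;
	rcu_lockdep_assert_cblist_protected(rdp);
	rcu_nocb_bypass_lock(rdp);

	/* Sample emptiness before the flush makes the bypass CBs visible. */
	if (flush_flags & FLUSH_BP_WAKE)
		was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);

	ret = rcu_nocb_do_flush_bypass(rdp, rhp, j, flush_flags);

	/* Wake rcuog only if the flush turned an empty list into a non-empty one. */
	if ((flush_flags & FLUSH_BP_WAKE) && was_alldone &&
	    rcu_segcblist_pend_cbs(&rdp->cblist))
		wake_nocb_gp(rdp, false);

	return ret;
}

That would confine the extra rcuog wakeups triggered from rcu_barrier_entrain() to
those rdp's whose flush actually made new callbacks visible.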
Joel Fernandes Sept. 6, 2022, 4:15 p.m. UTC | #14
On 9/6/2022 11:17 AM, Frederic Weisbecker wrote:
> On Tue, Sep 06, 2022 at 03:05:46AM +0000, Joel Fernandes wrote:
>> diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
>> index 4dc86274b3e8..b201606f7c4f 100644
>> --- a/kernel/rcu/tree_nocb.h
>> +++ b/kernel/rcu/tree_nocb.h
>> @@ -256,6 +256,31 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
>>  	return __wake_nocb_gp(rdp_gp, rdp, force, flags);
>>  }
>>  
>> +/*
>> + * LAZY_FLUSH_JIFFIES decides the maximum amount of time that
>> + * can elapse before lazy callbacks are flushed. Lazy callbacks
>> + * could be flushed much earlier for a number of other reasons
>> + * however, LAZY_FLUSH_JIFFIES will ensure no lazy callbacks are
>> + * left unsubmitted to RCU after those many jiffies.
>> + */
>> +#define LAZY_FLUSH_JIFFIES (10 * HZ)
>> +unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES;
> 
> Still not static.

Oops will fix.

>> @@ -293,12 +322,16 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
>>   * proves to be initially empty, just return false because the no-CB GP
>>   * kthread may need to be awakened in this case.
>>   *
>> + * Return true if there was something to be flushed and it succeeded, otherwise
>> + * false.
>> + *
> 
> This kind of contradicts the comment that follows. Not sure you need to add
> that line because the existing comment seems to cover it.
> 
>>   * Note that this function always returns true if rhp is NULL.
> 
>>   */
>>  static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>> -				     unsigned long j)
>> +				     unsigned long j, unsigned long flush_flags)
>>  {
>>  	struct rcu_cblist rcl;
>> +	bool lazy = flush_flags & FLUSH_BP_LAZY;
>>  
>>  	WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp));
>>  	rcu_lockdep_assert_cblist_protected(rdp);
>> @@ -326,13 +372,20 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>>   * Note that this function always returns true if rhp is NULL.
>>   */
>>  static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>> -				  unsigned long j)
>> +				  unsigned long j, unsigned long flush_flags)
>>  {
>> +	bool ret;
>> +
>>  	if (!rcu_rdp_is_offloaded(rdp))
>>  		return true;
>>  	rcu_lockdep_assert_cblist_protected(rdp);
>>  	rcu_nocb_bypass_lock(rdp);
>> -	return rcu_nocb_do_flush_bypass(rdp, rhp, j);
>> +	ret = rcu_nocb_do_flush_bypass(rdp, rhp, j, flush_flags);
>> +
>> +	if (flush_flags & FLUSH_BP_WAKE)
>> +		wake_nocb_gp(rdp, true);
> 
> Why the true above?
> 
> Also should we check if the wake up is really necessary (otherwise it means we
> force a wake up for all rdp's from rcu_barrier())?

Good point. I need to look into your suggested optimization here more. It is
possible the wake up is not needed some of the time, but considering that
rcu_barrier() is a slow path, I will probably aim for robustness here over
mathematically minimal code. And the cost of a missed wake up here can be
serious, as I saw with the RCU_SCALE test I added. Also, the number of rcuog
thread wakeups scales as the square root of the number of CPUs because of the
rdp grouping, right?

Still, it'd be good to not do something unnecessary, so your point is well
taken. I'll spend some time on this and get back to you.

>        was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
>        ret = rcu_nocb_do_flush_bypass(rdp, rhp, j, flush_flags);
>        if (was_alldone && rcu_segcblist_pend_cbs(&rdp->cblist))
>        	  wake_nocb_gp(rdp, false);
> 
> 
>> @@ -461,16 +521,29 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>>  	// We need to use the bypass.
>>  	rcu_nocb_wait_contended(rdp);
>>  	rcu_nocb_bypass_lock(rdp);
>> +
>>  	ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
>>  	rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
>>  	rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
>> +
>> +	if (IS_ENABLED(CONFIG_RCU_LAZY) && lazy)
>> +		WRITE_ONCE(rdp->lazy_len, rdp->lazy_len + 1);
>> +
>>  	if (!ncbs) {
>>  		WRITE_ONCE(rdp->nocb_bypass_first, j);
>>  		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
>>  	}
>> +
>>  	rcu_nocb_bypass_unlock(rdp);
>>  	smp_mb(); /* Order enqueue before wake. */
>> -	if (ncbs) {
>> +
>> +	// We had CBs in the bypass list before. There is nothing else to do if:
>> +	// There were only non-lazy CBs before, in this case, the bypass timer
> 
> Kind of misleading. I would replace "There were only non-lazy CBs before" with
> "There was at least one non-lazy CBs before".

I really mean "There were only non-lazy CBs ever queued in the bypass list
before". That's the bypass_is_lazy variable. So I did not fully understand your
suggested comment change.

>> +	// or GP-thread will handle the CBs including any new lazy ones.
>> +	// Or, the new CB is lazy and the old bypass-CBs were also lazy. In this
>> +	// case the old lazy timer would have been setup. When that expires,
>> +	// the new lazy one will be handled.
>> +	if (ncbs && (!bypass_is_lazy || lazy)) {
>>  		local_irq_restore(flags);
>>  	} else {
>>  		// No-CBs GP kthread might be indefinitely asleep, if so, wake.
>> @@ -479,6 +552,10 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>>  			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
>>  					    TPS("FirstBQwake"));
>>  			__call_rcu_nocb_wake(rdp, true, flags);
>> +		} else if (bypass_is_lazy && !lazy) {
>> +			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
>> +					    TPS("FirstBQwakeLazy2Non"));
>> +			__call_rcu_nocb_wake(rdp, true, flags);
> 
> Not sure we need this chunk. Since there are pending callbacks anyway,
> nocb_gp_wait() should be handling them and it will set the appropriate
> timer on the next loop.

We do because those pending callbacks could be because of a bypass list flush
and not because there were pending CBs before, right? I do recall missed wake
ups of non-lazy CBs, and them having to wait for the full lazy timer duration
and slowing down synchronize_rcu() which is on the ChromeOS boot critical path!

Thanks,

 - Joel
Joel Fernandes Sept. 6, 2022, 4:16 p.m. UTC | #15
On 9/6/2022 4:55 AM, Paul E. McKenney wrote:
> On Mon, Sep 05, 2022 at 04:32:26PM -0400, Joel Fernandes wrote:
>>
>>
>> On 9/5/2022 8:59 AM, Frederic Weisbecker wrote:
>>> I'd rather see an updated patch (not the whole patchset but just this one) rather
>>> than deltas, just to make sure I'm not missing something in the whole picture.
>>>
>>> Thanks.
>>
>> There is also the previous patch which needs a fix up you suggested.
>>
>> I will reply to individual patches with in-line patch (for you), as well as
>> refreshing whole series since it might be easier for Paul to apply them together.
> 
> To say nothing of the greater probability that the result will match
> your intent.  ;-)

It is shaping up well ;-)

thanks,

 - Joel
Joel Fernandes Sept. 6, 2022, 4:31 p.m. UTC | #16
On 9/6/2022 12:15 PM, Joel Fernandes wrote:
>>> @@ -461,16 +521,29 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>>>  	// We need to use the bypass.
>>>  	rcu_nocb_wait_contended(rdp);
>>>  	rcu_nocb_bypass_lock(rdp);
>>> +
>>>  	ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
>>>  	rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
>>>  	rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
>>> +
>>> +	if (IS_ENABLED(CONFIG_RCU_LAZY) && lazy)
>>> +		WRITE_ONCE(rdp->lazy_len, rdp->lazy_len + 1);
>>> +
>>>  	if (!ncbs) {
>>>  		WRITE_ONCE(rdp->nocb_bypass_first, j);
>>>  		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
>>>  	}
>>> +
>>>  	rcu_nocb_bypass_unlock(rdp);
>>>  	smp_mb(); /* Order enqueue before wake. */
>>> -	if (ncbs) {
>>> +
>>> +	// We had CBs in the bypass list before. There is nothing else to do if:
>>> +	// There were only non-lazy CBs before, in this case, the bypass timer
>> Kind of misleading. I would replace "There were only non-lazy CBs before" with
>> "There was at least one non-lazy CBs before".
> I really mean "There were only non-lazy CBs ever queued in the bypass list
> before". That's the bypass_is_lazy variable. So I did not fully understand your
> suggested comment change.
> 
>>> +	// or GP-thread will handle the CBs including any new lazy ones.
>>> +	// Or, the new CB is lazy and the old bypass-CBs were also lazy. In this
>>> +	// case the old lazy timer would have been setup. When that expires,
>>> +	// the new lazy one will be handled.
>>> +	if (ncbs && (!bypass_is_lazy || lazy)) {
>>>  		local_irq_restore(flags);
>>>  	} else {
>>>  		// No-CBs GP kthread might be indefinitely asleep, if so, wake.
>>> @@ -479,6 +552,10 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>>>  			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
>>>  					    TPS("FirstBQwake"));
>>>  			__call_rcu_nocb_wake(rdp, true, flags);
>>> +		} else if (bypass_is_lazy && !lazy) {
>>> +			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
>>> +					    TPS("FirstBQwakeLazy2Non"));
>>> +			__call_rcu_nocb_wake(rdp, true, flags);
>>
>> Not sure we need this chunk. Since there are pending callbacks anyway,
>> nocb_gp_wait() should be handling them and it will set the appropriate
>> timer on the next loop.
>
> We do because those pending callbacks could be because of a bypass list flush
> and not because there were pending CBs before, right? I do recall missed wake
> ups of non-lazy CBs, and them having to wait for the full lazy timer duration
> and slowing down synchronize_rcu() which is on the ChromeOS boot critical path!
> 

Just to add more details, consider the series of events:

1. Only lazy CBs are ever queued. Timer is armed for multiple seconds.
rcu_segcblist_pend_cbs remains false.

2. First non-lazy CB triggers the code that does the bypass rate-limit thing.

3. The bypass list is flushed because it is a non-lazy CB and we need to start
GP processing soon.

4. Due to flush, rcu_segcblist_pend_cbs() is now true.

5. We reach this "else if" clause because bypass_is_lazy means only lazy CBs
were ever buffered. We need to reprogram the timer or do an immediate wake up.
That's the intention of __call_rcu_nocb_wake().

I really saw #1 and #2 trigger during boot up itself and cause a multi-second
boot regression.

The chunk is needed to handle this case. I indeed do not see a boot regression
any more. Did I miss something?

Thanks,

 - Joel
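
For reference, the decision being described is the was_alldone path of
__call_rcu_nocb_wake() from the patch, condensed below into a sketch (everything here
is lifted from the patch; only surrounding code is omitted). It shows why a bypass
list holding nothing but lazy CBs gets a deferred, jiffies_till_flush-scale wakeup,
while any other newly-pending work wakes the rcuog kthread right away:

	len = rcu_segcblist_n_cbs(&rdp->cblist);
	bypass_len = rcu_cblist_n_cbs(&rdp->nocb_bypass);
	lazy_len = READ_ONCE(rdp->lazy_len);
	if (was_alldone) {
		rdp->qlen_last_fqs_check = len;
		// Bypass list holds only lazy CBs: a long deferral is acceptable.
		if (lazy_len && bypass_len == lazy_len) {
			rcu_nocb_unlock_irqrestore(rdp, flags);
			wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY,
					   TPS("WakeLazy"));
		// Otherwise something non-lazy just became pending: wake rcuog now.
		} else if (!irqs_disabled_flags(flags)) {
			rcu_nocb_unlock_irqrestore(rdp, flags);
			wake_nocb_gp(rdp, false);
		}
	}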
Joel Fernandes Sept. 6, 2022, 4:38 p.m. UTC | #17
On 9/6/2022 12:31 PM, Joel Fernandes wrote:
> 
> 
> On 9/6/2022 12:15 PM, Joel Fernandes wrote:
>>>> @@ -461,16 +521,29 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>>>>  	// We need to use the bypass.
>>>>  	rcu_nocb_wait_contended(rdp);
>>>>  	rcu_nocb_bypass_lock(rdp);
>>>> +
>>>>  	ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
>>>>  	rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
>>>>  	rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
>>>> +
>>>> +	if (IS_ENABLED(CONFIG_RCU_LAZY) && lazy)
>>>> +		WRITE_ONCE(rdp->lazy_len, rdp->lazy_len + 1);
>>>> +
>>>>  	if (!ncbs) {
>>>>  		WRITE_ONCE(rdp->nocb_bypass_first, j);
>>>>  		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
>>>>  	}
>>>> +
>>>>  	rcu_nocb_bypass_unlock(rdp);
>>>>  	smp_mb(); /* Order enqueue before wake. */
>>>> -	if (ncbs) {
>>>> +
>>>> +	// We had CBs in the bypass list before. There is nothing else to do if:
>>>> +	// There were only non-lazy CBs before, in this case, the bypass timer
>>> Kind of misleading. I would replace "There were only non-lazy CBs before" with
>>> "There was at least one non-lazy CBs before".
>> I really mean "There were only non-lazy CBs ever queued in the bypass list
>> before". That's the bypass_is_lazy variable. So I did not fully understand your
>> suggested comment change.
>>
>>>> +	// or GP-thread will handle the CBs including any new lazy ones.
>>>> +	// Or, the new CB is lazy and the old bypass-CBs were also lazy. In this
>>>> +	// case the old lazy timer would have been setup. When that expires,
>>>> +	// the new lazy one will be handled.
>>>> +	if (ncbs && (!bypass_is_lazy || lazy)) {
>>>>  		local_irq_restore(flags);
>>>>  	} else {
>>>>  		// No-CBs GP kthread might be indefinitely asleep, if so, wake.
>>>> @@ -479,6 +552,10 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>>>>  			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
>>>>  					    TPS("FirstBQwake"));
>>>>  			__call_rcu_nocb_wake(rdp, true, flags);
>>>> +		} else if (bypass_is_lazy && !lazy) {
>>>> +			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
>>>> +					    TPS("FirstBQwakeLazy2Non"));
>>>> +			__call_rcu_nocb_wake(rdp, true, flags);
>>>
>>> Not sure we need this chunk. Since there are pending callbacks anyway,
>>> nocb_gp_wait() should be handling them and it will set the appropriate
>>> timer on the next loop.
>>
>> We do because those pending callbacks could be because of a bypass list flush
>> and not because there were pending CBs before, right? I do recall missed wake
>> ups of non-lazy CBs, and them having to wait for the full lazy timer duration
>> and slowing down synchronize_rcu() which is on the ChromeOS boot critical path!
>>
> 
> Just to add more details, consider the series of events:
> 
> 1. Only lazy CBs are ever queued. Timer is armed for multiple seconds.
> rcu_segcblist_pend_cbs remains false.
> 
> 2. First non-lazy CB triggers the code that does the bypass rate-limit thing.
> 
> 3. The bypass list is flushed because it is a non-lazy CB and we need to start
> GP processing soon.

Correcting the events, #3 does not happen if we got here.

> 
> 4. Due to flush, rcu_segcblist_pend_cbs() is now true.

So rcu_segcblist_pend_cbs() cannot be true.

> 5. We reach this "else if" clause because bypass_is_lazy means only lazy CBs
> were ever buffered. We need to reprogram the timer or do an immediate wake up.
> That's the intention of __call_rcu_nocb_wake().
> 
> I really saw #1 and #2 trigger during boot up itself and cause a multi-second
> boot regression.

So maybe this hunk is not needed any more and the boot regression is
fine. I can try to drop this hunk and run the tests again...

Thanks!

 - Joel
Joel Fernandes Sept. 6, 2022, 4:43 p.m. UTC | #18
On 9/6/2022 12:38 PM, Joel Fernandes wrote:
> 
> 
> On 9/6/2022 12:31 PM, Joel Fernandes wrote:
>>
>>
>> On 9/6/2022 12:15 PM, Joel Fernandes wrote:
>>>>> @@ -461,16 +521,29 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>>>>>  	// We need to use the bypass.
>>>>>  	rcu_nocb_wait_contended(rdp);
>>>>>  	rcu_nocb_bypass_lock(rdp);
>>>>> +
>>>>>  	ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
>>>>>  	rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
>>>>>  	rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
>>>>> +
>>>>> +	if (IS_ENABLED(CONFIG_RCU_LAZY) && lazy)
>>>>> +		WRITE_ONCE(rdp->lazy_len, rdp->lazy_len + 1);
>>>>> +
>>>>>  	if (!ncbs) {
>>>>>  		WRITE_ONCE(rdp->nocb_bypass_first, j);
>>>>>  		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
>>>>>  	}
>>>>> +
>>>>>  	rcu_nocb_bypass_unlock(rdp);
>>>>>  	smp_mb(); /* Order enqueue before wake. */
>>>>> -	if (ncbs) {
>>>>> +
>>>>> +	// We had CBs in the bypass list before. There is nothing else to do if:
>>>>> +	// There were only non-lazy CBs before, in this case, the bypass timer
>>>> Kind of misleading. I would replace "There were only non-lazy CBs before" with
>>>> "There was at least one non-lazy CBs before".
>>> I really mean "There were only non-lazy CBs ever queued in the bypass list
>>> before". That's the bypass_is_lazy variable. So I did not fully understand your
>>> suggested comment change.
>>>
>>>>> +	// or GP-thread will handle the CBs including any new lazy ones.
>>>>> +	// Or, the new CB is lazy and the old bypass-CBs were also lazy. In this
>>>>> +	// case the old lazy timer would have been setup. When that expires,
>>>>> +	// the new lazy one will be handled.
>>>>> +	if (ncbs && (!bypass_is_lazy || lazy)) {
>>>>>  		local_irq_restore(flags);
>>>>>  	} else {
>>>>>  		// No-CBs GP kthread might be indefinitely asleep, if so, wake.
>>>>> @@ -479,6 +552,10 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>>>>>  			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
>>>>>  					    TPS("FirstBQwake"));
>>>>>  			__call_rcu_nocb_wake(rdp, true, flags);
>>>>> +		} else if (bypass_is_lazy && !lazy) {
>>>>> +			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
>>>>> +					    TPS("FirstBQwakeLazy2Non"));
>>>>> +			__call_rcu_nocb_wake(rdp, true, flags);
>>>>
>>>> Not sure we need this chunk. Since there are pending callbacks anyway,
>>>> nocb_gp_wait() should be handling them and it will set the appropriate
>>>> timer on the next loop.
>>>
>>> We do because those pending callbacks could be because of a bypass list flush
>>> and not because there were pending CBs before, right? I do recall missed wake
>>> ups of non-lazy CBs, and them having to wait for the full lazy timer duration
>>> and slowing down synchronize_rcu() which is on the ChromeOS boot critical path!
>>>
>>
>> Just to add more details, consider the series of events:
>>
>> 1. Only lazy CBs are ever queued. Timer is armed for multiple seconds.
>> rcu_segcblist_pend_cbs remains false.
>>
>> 2. First non-lazy CB triggers the code that does the bypass rate-limit thing.
>>
>> 3. The bypass list is flushed because it is a non-lazy CB and we need to start
>> GP processing soon.
> 
> Correcting the events, #3 does not happen if we got here.
> 
>>
>> 4. Due to flush, rcu_segcblist_pend_cbs() is now true.
> 
> So rcu_segcblist_pend_cbs() cannot be true.
> 
>> 5. We reach this "else if" clause because bypass_is_lazy means only lazy CBs
>> were ever buffered. We need to reprogram the timer or do an immediate wake up.
>> That's the intention of __call_rcu_nocb_wake().
>>
>> I really saw #1 and #2 trigger during boot up itself and cause a multi-second
>> boot regression.
> 
> So maybe this hunk is not needed any more and the boot regression is
> fine. I can try to drop this hunk and run the tests again...

Ah, now I know why I got confused. I *used* to flush the bypass list before when
!lazy CBs showed up. Paul suggested this is overkill. In this old overkill
method, I was missing a wake up which was likely causing the boot regression.
Forcing a wake up fixed that. Now in v5 I make it such that I don't do the flush
on a !lazy rate-limit.

I am sorry for the confusion. Either way, in my defense, this is just an extra
bit of code that I have to delete. This code is hard. I have mostly relied on
test-driven development. But now, thanks to this review, I am learning the
code more and more...

Thanks,

 - Joel
Paul E. McKenney Sept. 6, 2022, 5:05 p.m. UTC | #19
On Tue, Sep 06, 2022 at 12:16:19PM -0400, Joel Fernandes wrote:
> On 9/6/2022 4:55 AM, Paul E. McKenney wrote:
> > On Mon, Sep 05, 2022 at 04:32:26PM -0400, Joel Fernandes wrote:
> >>
> >>
> >> On 9/5/2022 8:59 AM, Frederic Weisbecker wrote:
> >>> I'd rather see an updated patch (not the whole patchset but just this one) rather
> >>> than deltas, just to make sure I'm not missing something in the whole picture.
> >>>
> >>> Thanks.
> >>
> >> There is also the previous patch which needs a fix up you suggested.
> >>
> >> I will reply to individual patches with in-line patch (for you), as well as
> >> refreshing whole series since it might be easier for Paul to apply them together.
> > 
> > To say nothing of the greater probability that the result will match
> > your intent.  ;-)
> 
> It is shaping up well ;-)

Very good, looking forward to the next round!

							Thanx, Paul
Uladzislau Rezki Sept. 6, 2022, 6:16 p.m. UTC | #20
> On Fri, Sep 02, 2022 at 05:21:32PM +0200, Frederic Weisbecker wrote:
> > On Thu, Sep 01, 2022 at 10:17:08PM +0000, Joel Fernandes (Google) wrote:
> > > Implement timer-based RCU lazy callback batching. The batch is flushed
> > > whenever a certain amount of time has passed, or the batch on a
> > > particular CPU grows too big. Also memory pressure will flush it in a
> > > future patch.
> > > 
> > > To handle several corner cases automagically (such as rcu_barrier() and
> > > hotplug), we re-use bypass lists to handle lazy CBs. The bypass list
> > > length has the lazy CB length included in it. A separate lazy CB length
> > > counter is also introduced to keep track of the number of lazy CBs.
> > > 
> > > Suggested-by: Paul McKenney <paulmck@kernel.org>
> > > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > > ---
> 
> Here is the updated version of this patch for further testing and review.
> Paul, you could consider updating your test branch. I have tested it in
> ChromeOS as well, and rcuscale. The laziness and boot times are looking good.
> There was at least one bug that I fixed that got introduced with the moving
> of the length field to rcu_data.  Thanks a lot Frederic for the review
> comments.
> 
> I will look at the rcu torture issue next... I suspect the length field issue
> may have been causing it.
> 
> ---8<-----------------------
> 
> From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
> Subject: [PATCH v6] rcu: Introduce call_rcu_lazy() API implementation
> 
> Implement timer-based RCU lazy callback batching. The batch is flushed
> whenever a certain amount of time has passed, or the batch on a
> particular CPU grows too big. Also memory pressure will flush it in a
> future patch.
> 
> To handle several corner cases automagically (such as rcu_barrier() and
> hotplug), we re-use bypass lists to handle lazy CBs. The bypass list
> length has the lazy CB length included in it. A separate lazy CB length
> counter is also introduced to keep track of the number of lazy CBs.
> 
> v5->v6:
> 
> [ Frederic Weisbecker: Program the lazy timer only if WAKE_NOT, since other
>   deferral levels wake much earlier, so for those it is not needed. ]
> 
> [ Frederic Weisbecker: Use flush flags to keep bypass API code clean. ]
> 
> [ Joel: Fix issue where I was not resetting lazy_len after moving it to rdp ]
> 
> Suggested-by: Paul McKenney <paulmck@kernel.org>
> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> ---
>  include/linux/rcupdate.h |   6 ++
>  kernel/rcu/Kconfig       |   8 ++
>  kernel/rcu/rcu.h         |  11 +++
>  kernel/rcu/tree.c        | 133 +++++++++++++++++++----------
>  kernel/rcu/tree.h        |  17 +++-
>  kernel/rcu/tree_nocb.h   | 175 ++++++++++++++++++++++++++++++++-------
>  6 files changed, 269 insertions(+), 81 deletions(-)
> 
> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> index 08605ce7379d..82e8a07e0856 100644
> --- a/include/linux/rcupdate.h
> +++ b/include/linux/rcupdate.h
> @@ -108,6 +108,12 @@ static inline int rcu_preempt_depth(void)
>  
>  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
>  
> +#ifdef CONFIG_RCU_LAZY
> +void call_rcu_lazy(struct rcu_head *head, rcu_callback_t func);
> +#else
> +#define call_rcu_lazy(head, func) call_rcu(head, func)
> +#endif
> +
>  /* Internal to kernel */
>  void rcu_init(void);
>  extern int rcu_scheduler_active;
> diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
> index d471d22a5e21..3128d01427cb 100644
> --- a/kernel/rcu/Kconfig
> +++ b/kernel/rcu/Kconfig
> @@ -311,4 +311,12 @@ config TASKS_TRACE_RCU_READ_MB
>  	  Say N here if you hate read-side memory barriers.
>  	  Take the default if you are unsure.
>  
> +config RCU_LAZY
> +	bool "RCU callback lazy invocation functionality"
> +	depends on RCU_NOCB_CPU
> +	default n
> +	help
> +	  To save power, batch RCU callbacks and flush after delay, memory
> +	  pressure or callback list growing too big.
> +
>  endmenu # "RCU Subsystem"
> diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
> index be5979da07f5..94675f14efe8 100644
> --- a/kernel/rcu/rcu.h
> +++ b/kernel/rcu/rcu.h
> @@ -474,6 +474,14 @@ enum rcutorture_type {
>  	INVALID_RCU_FLAVOR
>  };
>  
> +#if defined(CONFIG_RCU_LAZY)
> +unsigned long rcu_lazy_get_jiffies_till_flush(void);
> +void rcu_lazy_set_jiffies_till_flush(unsigned long j);
> +#else
> +static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; }
> +static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { }
> +#endif
> +
>  #if defined(CONFIG_TREE_RCU)
>  void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
>  			    unsigned long *gp_seq);
> @@ -483,6 +491,8 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
>  			       unsigned long c_old,
>  			       unsigned long c);
>  void rcu_gp_set_torture_wait(int duration);
> +void rcu_force_call_rcu_to_lazy(bool force);
> +
>  #else
>  static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
>  					  int *flags, unsigned long *gp_seq)
> @@ -501,6 +511,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
>  	do { } while (0)
>  #endif
>  static inline void rcu_gp_set_torture_wait(int duration) { }
> +static inline void rcu_force_call_rcu_to_lazy(bool force) { }
>  #endif
>  
>  #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 9fe581be8696..dbd25b8c080e 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -2728,47 +2728,8 @@ static void check_cb_ovld(struct rcu_data *rdp)
>  	raw_spin_unlock_rcu_node(rnp);
>  }
>  
> -/**
> - * call_rcu() - Queue an RCU callback for invocation after a grace period.
> - * @head: structure to be used for queueing the RCU updates.
> - * @func: actual callback function to be invoked after the grace period
> - *
> - * The callback function will be invoked some time after a full grace
> - * period elapses, in other words after all pre-existing RCU read-side
> - * critical sections have completed.  However, the callback function
> - * might well execute concurrently with RCU read-side critical sections
> - * that started after call_rcu() was invoked.
> - *
> - * RCU read-side critical sections are delimited by rcu_read_lock()
> - * and rcu_read_unlock(), and may be nested.  In addition, but only in
> - * v5.0 and later, regions of code across which interrupts, preemption,
> - * or softirqs have been disabled also serve as RCU read-side critical
> - * sections.  This includes hardware interrupt handlers, softirq handlers,
> - * and NMI handlers.
> - *
> - * Note that all CPUs must agree that the grace period extended beyond
> - * all pre-existing RCU read-side critical section.  On systems with more
> - * than one CPU, this means that when "func()" is invoked, each CPU is
> - * guaranteed to have executed a full memory barrier since the end of its
> - * last RCU read-side critical section whose beginning preceded the call
> - * to call_rcu().  It also means that each CPU executing an RCU read-side
> - * critical section that continues beyond the start of "func()" must have
> - * executed a memory barrier after the call_rcu() but before the beginning
> - * of that RCU read-side critical section.  Note that these guarantees
> - * include CPUs that are offline, idle, or executing in user mode, as
> - * well as CPUs that are executing in the kernel.
> - *
> - * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
> - * resulting RCU callback function "func()", then both CPU A and CPU B are
> - * guaranteed to execute a full memory barrier during the time interval
> - * between the call to call_rcu() and the invocation of "func()" -- even
> - * if CPU A and CPU B are the same CPU (but again only if the system has
> - * more than one CPU).
> - *
> - * Implementation of these memory-ordering guarantees is described here:
> - * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
> - */
> -void call_rcu(struct rcu_head *head, rcu_callback_t func)
> +static void
> +__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy)
>  {
>  	static atomic_t doublefrees;
>  	unsigned long flags;
> @@ -2818,7 +2779,7 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
>  		trace_rcu_callback(rcu_state.name, head,
>  				   rcu_segcblist_n_cbs(&rdp->cblist));
>  
> -	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
> +	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy))
>  		return; // Enqueued onto ->nocb_bypass, so just leave.
>  	// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
>  	rcu_segcblist_enqueue(&rdp->cblist, head);
> @@ -2833,8 +2794,86 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
>  		local_irq_restore(flags);
>  	}
>  }
> -EXPORT_SYMBOL_GPL(call_rcu);
>  
> +#ifdef CONFIG_RCU_LAZY
> +/**
> + * call_rcu_lazy() - Lazily queue RCU callback for invocation after grace period.
> + * @head: structure to be used for queueing the RCU updates.
> + * @func: actual callback function to be invoked after the grace period
> + *
> + * The callback function will be invoked some time after a full grace
> + * period elapses, in other words after all pre-existing RCU read-side
> + * critical sections have completed.
> + *
> + * Use this API instead of call_rcu() if you don't mind the callback being
> + * invoked after very long periods of time on systems without memory pressure
> + * and on systems which are lightly loaded or mostly idle.
> + *
> + * Other than the extra delay in callbacks being invoked, this function is
> + * identical to, and reuses call_rcu()'s logic. Refer to call_rcu() for more
> + * details about memory ordering and other functionality.
> + */
> +void call_rcu_lazy(struct rcu_head *head, rcu_callback_t func)
> +{
> +	return __call_rcu_common(head, func, true);
> +}
> +EXPORT_SYMBOL_GPL(call_rcu_lazy);
> +#endif
> +
> +static bool force_call_rcu_to_lazy;
> +
> +void rcu_force_call_rcu_to_lazy(bool force)
> +{
> +	if (IS_ENABLED(CONFIG_RCU_SCALE_TEST))
> +		WRITE_ONCE(force_call_rcu_to_lazy, force);
> +}
> +EXPORT_SYMBOL_GPL(rcu_force_call_rcu_to_lazy);
> +
> +/**
> + * call_rcu() - Queue an RCU callback for invocation after a grace period.
> + * @head: structure to be used for queueing the RCU updates.
> + * @func: actual callback function to be invoked after the grace period
> + *
> + * The callback function will be invoked some time after a full grace
> + * period elapses, in other words after all pre-existing RCU read-side
> + * critical sections have completed.  However, the callback function
> + * might well execute concurrently with RCU read-side critical sections
> + * that started after call_rcu() was invoked.
> + *
> + * RCU read-side critical sections are delimited by rcu_read_lock()
> + * and rcu_read_unlock(), and may be nested.  In addition, but only in
> + * v5.0 and later, regions of code across which interrupts, preemption,
> + * or softirqs have been disabled also serve as RCU read-side critical
> + * sections.  This includes hardware interrupt handlers, softirq handlers,
> + * and NMI handlers.
> + *
> + * Note that all CPUs must agree that the grace period extended beyond
> + * all pre-existing RCU read-side critical section.  On systems with more
> + * than one CPU, this means that when "func()" is invoked, each CPU is
> + * guaranteed to have executed a full memory barrier since the end of its
> + * last RCU read-side critical section whose beginning preceded the call
> + * to call_rcu().  It also means that each CPU executing an RCU read-side
> + * critical section that continues beyond the start of "func()" must have
> + * executed a memory barrier after the call_rcu() but before the beginning
> + * of that RCU read-side critical section.  Note that these guarantees
> + * include CPUs that are offline, idle, or executing in user mode, as
> + * well as CPUs that are executing in the kernel.
> + *
> + * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
> + * resulting RCU callback function "func()", then both CPU A and CPU B are
> + * guaranteed to execute a full memory barrier during the time interval
> + * between the call to call_rcu() and the invocation of "func()" -- even
> + * if CPU A and CPU B are the same CPU (but again only if the system has
> + * more than one CPU).
> + *
> + * Implementation of these memory-ordering guarantees is described here:
> + * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
> + */
> +void call_rcu(struct rcu_head *head, rcu_callback_t func)
> +{
> +	return __call_rcu_common(head, func, force_call_rcu_to_lazy);
> +}
> +EXPORT_SYMBOL_GPL(call_rcu);
>  
>  /* Maximum number of jiffies to wait before draining a batch. */
>  #define KFREE_DRAIN_JIFFIES (5 * HZ)
> @@ -3904,7 +3943,11 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
>  	rdp->barrier_head.func = rcu_barrier_callback;
>  	debug_rcu_head_queue(&rdp->barrier_head);
>  	rcu_nocb_lock(rdp);
> -	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
> +        /*
> +         * Flush the bypass list, but also wake up the GP thread as otherwise
> +         * bypass/lazy CBs maynot be noticed, and can cause real long delays!
> +         */
> +	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, FLUSH_BP_WAKE));
>  	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
>  		atomic_inc(&rcu_state.barrier_cpu_count);
>  	} else {
> @@ -4325,7 +4368,7 @@ void rcutree_migrate_callbacks(int cpu)
>  	my_rdp = this_cpu_ptr(&rcu_data);
>  	my_rnp = my_rdp->mynode;
>  	rcu_nocb_lock(my_rdp); /* irqs already disabled. */
> -	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
> +	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies, FLUSH_BP_NONE));
>  	raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
>  	/* Leverage recent GPs and set GP for new callbacks. */
>  	needwake = rcu_advance_cbs(my_rnp, rdp) ||
> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
> index d4a97e40ea9c..361c41d642c7 100644
> --- a/kernel/rcu/tree.h
> +++ b/kernel/rcu/tree.h
> @@ -263,14 +263,16 @@ struct rcu_data {
>  	unsigned long last_fqs_resched;	/* Time of last rcu_resched(). */
>  	unsigned long last_sched_clock;	/* Jiffies of last rcu_sched_clock_irq(). */
>  
> +	long lazy_len;			/* Length of buffered lazy callbacks. */
>  	int cpu;
>  };
>  
>  /* Values for nocb_defer_wakeup field in struct rcu_data. */
>  #define RCU_NOCB_WAKE_NOT	0
>  #define RCU_NOCB_WAKE_BYPASS	1
> -#define RCU_NOCB_WAKE		2
> -#define RCU_NOCB_WAKE_FORCE	3
> +#define RCU_NOCB_WAKE_LAZY	2
> +#define RCU_NOCB_WAKE		3
> +#define RCU_NOCB_WAKE_FORCE	4
>  
>  #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
>  					/* For jiffies_till_first_fqs and */
> @@ -439,10 +441,17 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
>  static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
>  static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
>  static void rcu_init_one_nocb(struct rcu_node *rnp);
> +
> +#define FLUSH_BP_NONE 0
> +/* Is the CB being enqueued after the flush, a lazy CB? */
> +#define FLUSH_BP_LAZY BIT(0)
> +/* Wake up nocb-GP thread after flush? */
> +#define FLUSH_BP_WAKE BIT(1)
>  static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
> -				  unsigned long j);
> +				  unsigned long j, unsigned long flush_flags);
>  static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
> -				bool *was_alldone, unsigned long flags);
> +				bool *was_alldone, unsigned long flags,
> +				bool lazy);
>  static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
>  				 unsigned long flags);
>  static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
> diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
> index 4dc86274b3e8..b201606f7c4f 100644
> --- a/kernel/rcu/tree_nocb.h
> +++ b/kernel/rcu/tree_nocb.h
> @@ -256,6 +256,31 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
>  	return __wake_nocb_gp(rdp_gp, rdp, force, flags);
>  }
>  
> +/*
> + * LAZY_FLUSH_JIFFIES decides the maximum amount of time that
> + * can elapse before lazy callbacks are flushed. Lazy callbacks
> + * could be flushed much earlier for a number of other reasons
> + * however, LAZY_FLUSH_JIFFIES will ensure no lazy callbacks are
> + * left unsubmitted to RCU after those many jiffies.
> + */
> +#define LAZY_FLUSH_JIFFIES (10 * HZ)
> +unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES;
> +
> +#ifdef CONFIG_RCU_LAZY
> +// To be called only from test code.
> +void rcu_lazy_set_jiffies_till_flush(unsigned long jif)
> +{
> +	jiffies_till_flush = jif;
> +}
> +EXPORT_SYMBOL(rcu_lazy_set_jiffies_till_flush);
> +
> +unsigned long rcu_lazy_get_jiffies_till_flush(void)
> +{
> +	return jiffies_till_flush;
> +}
> +EXPORT_SYMBOL(rcu_lazy_get_jiffies_till_flush);
> +#endif
> +
>  /*
>   * Arrange to wake the GP kthread for this NOCB group at some future
>   * time when it is safe to do so.
> @@ -269,10 +294,14 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
>  	raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
>  
>  	/*
> -	 * Bypass wakeup overrides previous deferments. In case
> -	 * of callback storm, no need to wake up too early.
> +	 * Bypass wakeup overrides previous deferments. In case of
> +	 * callback storm, no need to wake up too early.
>  	 */
> -	if (waketype == RCU_NOCB_WAKE_BYPASS) {
> +	if (waketype == RCU_NOCB_WAKE_LAZY
> +		&& READ_ONCE(rdp->nocb_defer_wakeup) == RCU_NOCB_WAKE_NOT) {
> +		mod_timer(&rdp_gp->nocb_timer, jiffies + jiffies_till_flush);
> +		WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
> +	} else if (waketype == RCU_NOCB_WAKE_BYPASS) {
>  		mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
>  		WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
>  	} else {
>
Joel, I have a question here. I see that a lazy callback just makes the GP
start later, whereas the regular call_rcu() API instead pulls it forward.
Could you please clarify how this handles a sequence like:

CPU:

1) task A -> call_rcu_lazy();
2) task B -> call_rcu();

That is, does task B override the lazy decision to run the GP later and
initiate it sooner?

Or am I missing something here?

--
Uladzislau Rezki
Joel Fernandes Sept. 6, 2022, 6:21 p.m. UTC | #21
On 9/6/2022 2:16 PM, Uladzislau Rezki wrote:
>> On Fri, Sep 02, 2022 at 05:21:32PM +0200, Frederic Weisbecker wrote:
>>> On Thu, Sep 01, 2022 at 10:17:08PM +0000, Joel Fernandes (Google) wrote:
>>>> Implement timer-based RCU lazy callback batching. The batch is flushed
>>>> whenever a certain amount of time has passed, or the batch on a
>>>> particular CPU grows too big. Also memory pressure will flush it in a
>>>> future patch.
>>>>
>>>> To handle several corner cases automagically (such as rcu_barrier() and
>>>> hotplug), we re-use bypass lists to handle lazy CBs. The bypass list
>>>> length has the lazy CB length included in it. A separate lazy CB length
>>>> counter is also introduced to keep track of the number of lazy CBs.
>>>>
>>>> Suggested-by: Paul McKenney <paulmck@kernel.org>
>>>> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
>>>> ---
>>
>> Here is the updated version of this patch for further testing and review.
>> Paul, you could consider updating your test branch. I have tested it in
>> ChromeOS as well, and rcuscale. The laziness and boot times are looking good.
>> There was at least one bug that I fixed that got introduced with the moving
>> of the length field to rcu_data.  Thanks a lot Frederic for the review
>> comments.
>>
>> I will look at the rcu torture issue next... I suspect the length field issue
>> may have been causing it.
>>
>> ---8<-----------------------
>>
>> From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
>> Subject: [PATCH v6] rcu: Introduce call_rcu_lazy() API implementation
>>
>> Implement timer-based RCU lazy callback batching. The batch is flushed
>> whenever a certain amount of time has passed, or the batch on a
>> particular CPU grows too big. Also memory pressure will flush it in a
>> future patch.
>>
>> To handle several corner cases automagically (such as rcu_barrier() and
>> hotplug), we re-use bypass lists to handle lazy CBs. The bypass list
>> length has the lazy CB length included in it. A separate lazy CB length
>> counter is also introduced to keep track of the number of lazy CBs.
>>
>> v5->v6:
>>
>> [ Frederic Weisbec: Program the lazy timer only if WAKE_NOT, since other
>>   deferral levels wake much earlier so for those it is not needed. ]
>>
>> [ Frederic Weisbec: Use flush flags to keep bypass API code clean. ]
>>
>> [ Joel: Fix issue where I was not resetting lazy_len after moving it to rdp ]
>>
>> Suggested-by: Paul McKenney <paulmck@kernel.org>
>> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
>> ---
>>  include/linux/rcupdate.h |   6 ++
>>  kernel/rcu/Kconfig       |   8 ++
>>  kernel/rcu/rcu.h         |  11 +++
>>  kernel/rcu/tree.c        | 133 +++++++++++++++++++----------
>>  kernel/rcu/tree.h        |  17 +++-
>>  kernel/rcu/tree_nocb.h   | 175 ++++++++++++++++++++++++++++++++-------
>>  6 files changed, 269 insertions(+), 81 deletions(-)
>>
>> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
>> index 08605ce7379d..82e8a07e0856 100644
>> --- a/include/linux/rcupdate.h
>> +++ b/include/linux/rcupdate.h
>> @@ -108,6 +108,12 @@ static inline int rcu_preempt_depth(void)
>>  
>>  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
>>  
>> +#ifdef CONFIG_RCU_LAZY
>> +void call_rcu_lazy(struct rcu_head *head, rcu_callback_t func);
>> +#else
>> +#define call_rcu_lazy(head, func) call_rcu(head, func)
>> +#endif
>> +
>>  /* Internal to kernel */
>>  void rcu_init(void);
>>  extern int rcu_scheduler_active;
>> diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
>> index d471d22a5e21..3128d01427cb 100644
>> --- a/kernel/rcu/Kconfig
>> +++ b/kernel/rcu/Kconfig
>> @@ -311,4 +311,12 @@ config TASKS_TRACE_RCU_READ_MB
>>  	  Say N here if you hate read-side memory barriers.
>>  	  Take the default if you are unsure.
>>  
>> +config RCU_LAZY
>> +	bool "RCU callback lazy invocation functionality"
>> +	depends on RCU_NOCB_CPU
>> +	default n
>> +	help
>> +	  To save power, batch RCU callbacks and flush after delay, memory
>> +	  pressure or callback list growing too big.
>> +
>>  endmenu # "RCU Subsystem"
>> diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
>> index be5979da07f5..94675f14efe8 100644
>> --- a/kernel/rcu/rcu.h
>> +++ b/kernel/rcu/rcu.h
>> @@ -474,6 +474,14 @@ enum rcutorture_type {
>>  	INVALID_RCU_FLAVOR
>>  };
>>  
>> +#if defined(CONFIG_RCU_LAZY)
>> +unsigned long rcu_lazy_get_jiffies_till_flush(void);
>> +void rcu_lazy_set_jiffies_till_flush(unsigned long j);
>> +#else
>> +static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; }
>> +static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { }
>> +#endif
>> +
>>  #if defined(CONFIG_TREE_RCU)
>>  void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
>>  			    unsigned long *gp_seq);
>> @@ -483,6 +491,8 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
>>  			       unsigned long c_old,
>>  			       unsigned long c);
>>  void rcu_gp_set_torture_wait(int duration);
>> +void rcu_force_call_rcu_to_lazy(bool force);
>> +
>>  #else
>>  static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
>>  					  int *flags, unsigned long *gp_seq)
>> @@ -501,6 +511,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
>>  	do { } while (0)
>>  #endif
>>  static inline void rcu_gp_set_torture_wait(int duration) { }
>> +static inline void rcu_force_call_rcu_to_lazy(bool force) { }
>>  #endif
>>  
>>  #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
>> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
>> index 9fe581be8696..dbd25b8c080e 100644
>> --- a/kernel/rcu/tree.c
>> +++ b/kernel/rcu/tree.c
>> @@ -2728,47 +2728,8 @@ static void check_cb_ovld(struct rcu_data *rdp)
>>  	raw_spin_unlock_rcu_node(rnp);
>>  }
>>  
>> -/**
>> - * call_rcu() - Queue an RCU callback for invocation after a grace period.
>> - * @head: structure to be used for queueing the RCU updates.
>> - * @func: actual callback function to be invoked after the grace period
>> - *
>> - * The callback function will be invoked some time after a full grace
>> - * period elapses, in other words after all pre-existing RCU read-side
>> - * critical sections have completed.  However, the callback function
>> - * might well execute concurrently with RCU read-side critical sections
>> - * that started after call_rcu() was invoked.
>> - *
>> - * RCU read-side critical sections are delimited by rcu_read_lock()
>> - * and rcu_read_unlock(), and may be nested.  In addition, but only in
>> - * v5.0 and later, regions of code across which interrupts, preemption,
>> - * or softirqs have been disabled also serve as RCU read-side critical
>> - * sections.  This includes hardware interrupt handlers, softirq handlers,
>> - * and NMI handlers.
>> - *
>> - * Note that all CPUs must agree that the grace period extended beyond
>> - * all pre-existing RCU read-side critical section.  On systems with more
>> - * than one CPU, this means that when "func()" is invoked, each CPU is
>> - * guaranteed to have executed a full memory barrier since the end of its
>> - * last RCU read-side critical section whose beginning preceded the call
>> - * to call_rcu().  It also means that each CPU executing an RCU read-side
>> - * critical section that continues beyond the start of "func()" must have
>> - * executed a memory barrier after the call_rcu() but before the beginning
>> - * of that RCU read-side critical section.  Note that these guarantees
>> - * include CPUs that are offline, idle, or executing in user mode, as
>> - * well as CPUs that are executing in the kernel.
>> - *
>> - * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
>> - * resulting RCU callback function "func()", then both CPU A and CPU B are
>> - * guaranteed to execute a full memory barrier during the time interval
>> - * between the call to call_rcu() and the invocation of "func()" -- even
>> - * if CPU A and CPU B are the same CPU (but again only if the system has
>> - * more than one CPU).
>> - *
>> - * Implementation of these memory-ordering guarantees is described here:
>> - * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
>> - */
>> -void call_rcu(struct rcu_head *head, rcu_callback_t func)
>> +static void
>> +__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy)
>>  {
>>  	static atomic_t doublefrees;
>>  	unsigned long flags;
>> @@ -2818,7 +2779,7 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
>>  		trace_rcu_callback(rcu_state.name, head,
>>  				   rcu_segcblist_n_cbs(&rdp->cblist));
>>  
>> -	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
>> +	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy))
>>  		return; // Enqueued onto ->nocb_bypass, so just leave.
>>  	// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
>>  	rcu_segcblist_enqueue(&rdp->cblist, head);
>> @@ -2833,8 +2794,86 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
>>  		local_irq_restore(flags);
>>  	}
>>  }
>> -EXPORT_SYMBOL_GPL(call_rcu);
>>  
>> +#ifdef CONFIG_RCU_LAZY
>> +/**
>> + * call_rcu_lazy() - Lazily queue RCU callback for invocation after grace period.
>> + * @head: structure to be used for queueing the RCU updates.
>> + * @func: actual callback function to be invoked after the grace period
>> + *
>> + * The callback function will be invoked some time after a full grace
>> + * period elapses, in other words after all pre-existing RCU read-side
>> + * critical sections have completed.
>> + *
>> + * Use this API instead of call_rcu() if you don't mind the callback being
>> + * invoked after very long periods of time on systems without memory pressure
>> + * and on systems which are lightly loaded or mostly idle.
>> + *
>> + * Other than the extra delay in callbacks being invoked, this function is
>> + * identical to, and reuses call_rcu()'s logic. Refer to call_rcu() for more
>> + * details about memory ordering and other functionality.
>> + */
>> +void call_rcu_lazy(struct rcu_head *head, rcu_callback_t func)
>> +{
>> +	return __call_rcu_common(head, func, true);
>> +}
>> +EXPORT_SYMBOL_GPL(call_rcu_lazy);
>> +#endif
>> +
>> +static bool force_call_rcu_to_lazy;
>> +
>> +void rcu_force_call_rcu_to_lazy(bool force)
>> +{
>> +	if (IS_ENABLED(CONFIG_RCU_SCALE_TEST))
>> +		WRITE_ONCE(force_call_rcu_to_lazy, force);
>> +}
>> +EXPORT_SYMBOL_GPL(rcu_force_call_rcu_to_lazy);
>> +
>> +/**
>> + * call_rcu() - Queue an RCU callback for invocation after a grace period.
>> + * @head: structure to be used for queueing the RCU updates.
>> + * @func: actual callback function to be invoked after the grace period
>> + *
>> + * The callback function will be invoked some time after a full grace
>> + * period elapses, in other words after all pre-existing RCU read-side
>> + * critical sections have completed.  However, the callback function
>> + * might well execute concurrently with RCU read-side critical sections
>> + * that started after call_rcu() was invoked.
>> + *
>> + * RCU read-side critical sections are delimited by rcu_read_lock()
>> + * and rcu_read_unlock(), and may be nested.  In addition, but only in
>> + * v5.0 and later, regions of code across which interrupts, preemption,
>> + * or softirqs have been disabled also serve as RCU read-side critical
>> + * sections.  This includes hardware interrupt handlers, softirq handlers,
>> + * and NMI handlers.
>> + *
>> + * Note that all CPUs must agree that the grace period extended beyond
>> + * all pre-existing RCU read-side critical section.  On systems with more
>> + * than one CPU, this means that when "func()" is invoked, each CPU is
>> + * guaranteed to have executed a full memory barrier since the end of its
>> + * last RCU read-side critical section whose beginning preceded the call
>> + * to call_rcu().  It also means that each CPU executing an RCU read-side
>> + * critical section that continues beyond the start of "func()" must have
>> + * executed a memory barrier after the call_rcu() but before the beginning
>> + * of that RCU read-side critical section.  Note that these guarantees
>> + * include CPUs that are offline, idle, or executing in user mode, as
>> + * well as CPUs that are executing in the kernel.
>> + *
>> + * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
>> + * resulting RCU callback function "func()", then both CPU A and CPU B are
>> + * guaranteed to execute a full memory barrier during the time interval
>> + * between the call to call_rcu() and the invocation of "func()" -- even
>> + * if CPU A and CPU B are the same CPU (but again only if the system has
>> + * more than one CPU).
>> + *
>> + * Implementation of these memory-ordering guarantees is described here:
>> + * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
>> + */
>> +void call_rcu(struct rcu_head *head, rcu_callback_t func)
>> +{
>> +	return __call_rcu_common(head, func, force_call_rcu_to_lazy);
>> +}
>> +EXPORT_SYMBOL_GPL(call_rcu);
>>  
>>  /* Maximum number of jiffies to wait before draining a batch. */
>>  #define KFREE_DRAIN_JIFFIES (5 * HZ)
>> @@ -3904,7 +3943,11 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
>>  	rdp->barrier_head.func = rcu_barrier_callback;
>>  	debug_rcu_head_queue(&rdp->barrier_head);
>>  	rcu_nocb_lock(rdp);
>> -	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
>> +        /*
>> +         * Flush the bypass list, but also wake up the GP thread as otherwise
>> +         * bypass/lazy CBs maynot be noticed, and can cause real long delays!
>> +         */
>> +	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, FLUSH_BP_WAKE));
>>  	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
>>  		atomic_inc(&rcu_state.barrier_cpu_count);
>>  	} else {
>> @@ -4325,7 +4368,7 @@ void rcutree_migrate_callbacks(int cpu)
>>  	my_rdp = this_cpu_ptr(&rcu_data);
>>  	my_rnp = my_rdp->mynode;
>>  	rcu_nocb_lock(my_rdp); /* irqs already disabled. */
>> -	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
>> +	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies, FLUSH_BP_NONE));
>>  	raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
>>  	/* Leverage recent GPs and set GP for new callbacks. */
>>  	needwake = rcu_advance_cbs(my_rnp, rdp) ||
>> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
>> index d4a97e40ea9c..361c41d642c7 100644
>> --- a/kernel/rcu/tree.h
>> +++ b/kernel/rcu/tree.h
>> @@ -263,14 +263,16 @@ struct rcu_data {
>>  	unsigned long last_fqs_resched;	/* Time of last rcu_resched(). */
>>  	unsigned long last_sched_clock;	/* Jiffies of last rcu_sched_clock_irq(). */
>>  
>> +	long lazy_len;			/* Length of buffered lazy callbacks. */
>>  	int cpu;
>>  };
>>  
>>  /* Values for nocb_defer_wakeup field in struct rcu_data. */
>>  #define RCU_NOCB_WAKE_NOT	0
>>  #define RCU_NOCB_WAKE_BYPASS	1
>> -#define RCU_NOCB_WAKE		2
>> -#define RCU_NOCB_WAKE_FORCE	3
>> +#define RCU_NOCB_WAKE_LAZY	2
>> +#define RCU_NOCB_WAKE		3
>> +#define RCU_NOCB_WAKE_FORCE	4
>>  
>>  #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
>>  					/* For jiffies_till_first_fqs and */
>> @@ -439,10 +441,17 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
>>  static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
>>  static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
>>  static void rcu_init_one_nocb(struct rcu_node *rnp);
>> +
>> +#define FLUSH_BP_NONE 0
>> +/* Is the CB being enqueued after the flush, a lazy CB? */
>> +#define FLUSH_BP_LAZY BIT(0)
>> +/* Wake up nocb-GP thread after flush? */
>> +#define FLUSH_BP_WAKE BIT(1)
>>  static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>> -				  unsigned long j);
>> +				  unsigned long j, unsigned long flush_flags);
>>  static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>> -				bool *was_alldone, unsigned long flags);
>> +				bool *was_alldone, unsigned long flags,
>> +				bool lazy);
>>  static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
>>  				 unsigned long flags);
>>  static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
>> diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
>> index 4dc86274b3e8..b201606f7c4f 100644
>> --- a/kernel/rcu/tree_nocb.h
>> +++ b/kernel/rcu/tree_nocb.h
>> @@ -256,6 +256,31 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
>>  	return __wake_nocb_gp(rdp_gp, rdp, force, flags);
>>  }
>>  
>> +/*
>> + * LAZY_FLUSH_JIFFIES decides the maximum amount of time that
>> + * can elapse before lazy callbacks are flushed. Lazy callbacks
>> + * could be flushed much earlier for a number of other reasons
>> + * however, LAZY_FLUSH_JIFFIES will ensure no lazy callbacks are
>> + * left unsubmitted to RCU after those many jiffies.
>> + */
>> +#define LAZY_FLUSH_JIFFIES (10 * HZ)
>> +unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES;
>> +
>> +#ifdef CONFIG_RCU_LAZY
>> +// To be called only from test code.
>> +void rcu_lazy_set_jiffies_till_flush(unsigned long jif)
>> +{
>> +	jiffies_till_flush = jif;
>> +}
>> +EXPORT_SYMBOL(rcu_lazy_set_jiffies_till_flush);
>> +
>> +unsigned long rcu_lazy_get_jiffies_till_flush(void)
>> +{
>> +	return jiffies_till_flush;
>> +}
>> +EXPORT_SYMBOL(rcu_lazy_get_jiffies_till_flush);
>> +#endif
>> +
>>  /*
>>   * Arrange to wake the GP kthread for this NOCB group at some future
>>   * time when it is safe to do so.
>> @@ -269,10 +294,14 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
>>  	raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
>>  
>>  	/*
>> -	 * Bypass wakeup overrides previous deferments. In case
>> -	 * of callback storm, no need to wake up too early.
>> +	 * Bypass wakeup overrides previous deferments. In case of
>> +	 * callback storm, no need to wake up too early.
>>  	 */
>> -	if (waketype == RCU_NOCB_WAKE_BYPASS) {
>> +	if (waketype == RCU_NOCB_WAKE_LAZY
>> +		&& READ_ONCE(rdp->nocb_defer_wakeup) == RCU_NOCB_WAKE_NOT) {
>> +		mod_timer(&rdp_gp->nocb_timer, jiffies + jiffies_till_flush);
>> +		WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
>> +	} else if (waketype == RCU_NOCB_WAKE_BYPASS) {
>>  		mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
>>  		WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
>>  	} else {
>>
> Joel, i have a question here. I see that lazy callback just makes the GP
> start later where a regular call_rcu() API will instead cut it back.
> Could you please clarify how it solves the sequence like:
> 
> CPU:
> 
> 1) task A -> call_rcu_lazy();
> 2) task B -> call_rcu();
> 
> so lazily decision about running GP later a task B overrides to initiate it sooner? 

Is your question whether task B will make task A's CB run sooner? Yes, that's
right. The point of call_rcu_lazy() is to delay the GP start as much as
possible to keep the system quiet. However, if a call_rcu() comes in anyway, we
have to start a GP soon. So we might as well take advantage of that for the
lazy one, since we are paying the full price for the new GP either way.

It is kind of like RCU's existing amortizing behavior in that sense, except we
put that into overdrive to be greedy for power.
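
Concretely, the caller-visible effect is something like the sketch below. This
is only an illustration of the behavior described above, not code from the
series; struct foo and free_foo_cb() are made up for the example:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	struct rcu_head rh;
	/* payload would go here */
};

static void free_foo_cb(struct rcu_head *rh)
{
	kfree(container_of(rh, struct foo, rh));
}

static void example(struct foo *a, struct foo *b)
{
	/* Task A: the callback may sit in the per-CPU bypass list for up to
	 * jiffies_till_flush (~10 seconds) if nothing else needs a GP. */
	call_rcu_lazy(&a->rh, free_foo_cb);

	/* Task B: a non-lazy callback forces a GP "soon", and a's lazy
	 * callback simply rides along on that same grace period. */
	call_rcu(&b->rh, free_foo_cb);
}

In other words, laziness only changes how long a's callback may wait while the
CPU would otherwise stay quiet; it does not weaken any ordering guarantee.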

Does that answer your question?

thanks,

 - Joel
Frederic Weisbecker Sept. 6, 2022, 7:11 p.m. UTC | #22
On Tue, Sep 06, 2022 at 12:43:52PM -0400, Joel Fernandes wrote:
> On 9/6/2022 12:38 PM, Joel Fernandes wrote:
> Ah, now I know why I got confused. I *used* to flush the bypass list before when
> !lazy CBs showed up. Paul suggested this is overkill. In this old overkill
> method, I was missing a wake up which was likely causing the boot regression.
> Forcing a wake up fixed that. Now in v5 I make it such that I don't do the flush
> on a !lazy rate-limit.
> 
> I am sorry for the confusion. Either way, in my defense this is just an extra
> bit of code that I have to delete. This code is hard. I have mostly relied on a
> test-driven development. But now thanks to this review and I am learning the
> code more and more...

Yeah this code is hard.

Especially as it's possible to flush from both sides and queue the timer
from both sides. And both sides read the bypass/lazy counter locklessly.
But only call_rcu_*() can queue/increase the bypass size whereas only
nocb_gp_wait() can cancel the timer. Phew!

Among the many possible dances between rcu_nocb_try_bypass()
and nocb_gp_wait(), I haven't found a way yet for the timer to be
set to LAZY when it should be BYPASS (or any other kind of accident, such
as an ignored callback).

In the worst case we may arm an earlier timer than necessary
(RCU_NOCB_WAKE_BYPASS instead of RCU_NOCB_WAKE_LAZY for example).

Famous last words...


> Thanks,
> 
>  - Joel
> 
> 
>
Joel Fernandes Sept. 7, 2022, 12:06 a.m. UTC | #23
On Tue, Sep 06, 2022 at 05:17:57PM +0200, Frederic Weisbecker wrote:
> On Tue, Sep 06, 2022 at 03:05:46AM +0000, Joel Fernandes wrote:
> > diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
> > index 4dc86274b3e8..b201606f7c4f 100644
> > --- a/kernel/rcu/tree_nocb.h
> > +++ b/kernel/rcu/tree_nocb.h
> > @@ -256,6 +256,31 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
> >  	return __wake_nocb_gp(rdp_gp, rdp, force, flags);
> >  }
> >  
> > +/*
> > + * LAZY_FLUSH_JIFFIES decides the maximum amount of time that
> > + * can elapse before lazy callbacks are flushed. Lazy callbacks
> > + * could be flushed much earlier for a number of other reasons
> > + * however, LAZY_FLUSH_JIFFIES will ensure no lazy callbacks are
> > + * left unsubmitted to RCU after those many jiffies.
> > + */
> > +#define LAZY_FLUSH_JIFFIES (10 * HZ)
> > +unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES;
> 
> Still not static.
> 
> > @@ -293,12 +322,16 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
> >   * proves to be initially empty, just return false because the no-CB GP
> >   * kthread may need to be awakened in this case.
> >   *
> > + * Return true if there was something to be flushed and it succeeded, otherwise
> > + * false.
> > + *
> 
> This kind of contradict the comment that follows. Not sure you need to add
> that line because the existing comment seem to cover it.
> 
> >   * Note that this function always returns true if rhp is NULL.
> 
> >   */
> >  static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
> > -				     unsigned long j)
> > +				     unsigned long j, unsigned long flush_flags)
> >  {
> >  	struct rcu_cblist rcl;
> > +	bool lazy = flush_flags & FLUSH_BP_LAZY;
> >  
> >  	WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp));
> >  	rcu_lockdep_assert_cblist_protected(rdp);
> > @@ -326,13 +372,20 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
> >   * Note that this function always returns true if rhp is NULL.
> >   */
> >  static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
> > -				  unsigned long j)
> > +				  unsigned long j, unsigned long flush_flags)
> >  {
> > +	bool ret;
> > +
> >  	if (!rcu_rdp_is_offloaded(rdp))
> >  		return true;
> >  	rcu_lockdep_assert_cblist_protected(rdp);
> >  	rcu_nocb_bypass_lock(rdp);
> > -	return rcu_nocb_do_flush_bypass(rdp, rhp, j);
> > +	ret = rcu_nocb_do_flush_bypass(rdp, rhp, j, flush_flags);
> > +
> > +	if (flush_flags & FLUSH_BP_WAKE)
> > +		wake_nocb_gp(rdp, true);
> 
> Why the true above?
> 
> Also should we check if the wake up is really necessary (otherwise it means we
> force a wake up for all rdp's from rcu_barrier())?
> 
>        was_alldone = rcu_segcblist_pend_cbs(&rdp->cblist);
>        ret = rcu_nocb_do_flush_bypass(rdp, rhp, j, flush_flags);
>        if (was_alldone && rcu_segcblist_pend_cbs(&rdp->cblist))
>        	  wake_nocb_gp(rdp, false);

You mean something like the following, right? Though I'm wondering if it's
better to call wake_nocb_gp() from tree.c in entrain() and let that handle
the wake. That way, we can get rid of the extra FLUSH_BP flags as well and
let the flush callers deal with the wakeups (see the sketch after the diff
below).

Anyway, for testing this should be good...

---8<-----------------------

diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index bd8f39ee2cd0..e3344c262672 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -382,15 +382,19 @@ static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 				  unsigned long j, unsigned long flush_flags)
 {
 	bool ret;
+	bool was_alldone;
 
 	if (!rcu_rdp_is_offloaded(rdp))
 		return true;
 	rcu_lockdep_assert_cblist_protected(rdp);
 	rcu_nocb_bypass_lock(rdp);
+	if (flush_flags & FLUSH_BP_WAKE)
+		was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+
 	ret = rcu_nocb_do_flush_bypass(rdp, rhp, j, flush_flags);
 
-	if (flush_flags & FLUSH_BP_WAKE)
-		wake_nocb_gp(rdp, true);
+	if (flush_flags & FLUSH_BP_WAKE && was_alldone)
+		wake_nocb_gp(rdp, false);
 
 	return ret;
 }
Joel Fernandes Sept. 7, 2022, 2:56 a.m. UTC | #24
On 9/6/2022 3:11 PM, Frederic Weisbecker wrote:
> On Tue, Sep 06, 2022 at 12:43:52PM -0400, Joel Fernandes wrote:
>> On 9/6/2022 12:38 PM, Joel Fernandes wrote:
>> Ah, now I know why I got confused. I *used* to flush the bypass list before when
>> !lazy CBs showed up. Paul suggested this is overkill. In this old overkill
>> method, I was missing a wake up which was likely causing the boot regression.
>> Forcing a wake up fixed that. Now in v5 I make it such that I don't do the flush
>> on a !lazy rate-limit.
>>
>> I am sorry for the confusion. Either way, in my defense this is just an extra
>> bit of code that I have to delete. This code is hard. I have mostly relied on a
>> test-driven development. But now thanks to this review and I am learning the
>> code more and more...
> 
> Yeah this code is hard.
> 
> Especially as it's possible to flush from both sides and queue the timer
> from both sides. And both sides read the bypass/lazy counter locklessly.
> But only call_rcu_*() can queue/increase the bypass size whereas only
> nocb_gp_wait() can cancel the timer. Phew!
> 

Haha, Indeed ;-)

> Among the many possible dances between rcu_nocb_try_bypass()
> and nocb_gp_wait(), I haven't found a way yet for the timer to be
> set to LAZY when it should be BYPASS (or other kind of accident such
> as an ignored callback).
> In the worst case we may arm an earlier timer than necessary
> (RCU_NOCB_WAKE_BYPASS instead of RCU_NOCB_WAKE_LAZY for example).
> 
> Famous last words...

Agreed.

On the issue of regressions with non-lazy things being treated as lazy, I was
thinking of adding a bounded-time-check to:

[PATCH v5 08/18] rcu: Add per-CB tracing for queuing, flush and invocation.

Where, if a non-lazy CB takes an abnormally long time to be invoked (say it was
subject to a race condition), it would splat. This can be done because I am
tracking the queue time in the rcu_head in that patch.
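
For illustration, a minimal sketch of such a check is below, assuming the
enqueue timestamp recorded by patch 08/18 is passed in; the helper name and
the 5-second bound are made up here and would need tuning:

#include <linux/bug.h>
#include <linux/jiffies.h>
#include <linux/rcupdate.h>

/* Arbitrary bound: above typical GP latency, below the ~10s lazy flush. */
#define NON_LAZY_MAX_JIFFIES	(5 * HZ)

static void check_cb_delay(unsigned long queue_jiffies, bool lazy)
{
	unsigned long delay = jiffies - queue_jiffies;

	/* A non-lazy CB should never wait anywhere near the lazy timeout. */
	WARN_ONCE(!lazy && delay > NON_LAZY_MAX_JIFFIES,
		  "non-lazy RCU callback delayed by %lu jiffies\n", delay);
}

Invoking such a check from the callback-invocation path would turn a "non-lazy
CB accidentally treated as lazy" bug into an immediate splat instead of a
silent boot-time regression.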

On another note, boot-time regressions show up pretty quickly (at least on
ChromeOS) when non-lazy things become lazy, and so far, with the latest code,
it has fortunately been well behaved.

Thanks,

 - Joel
Uladzislau Rezki Sept. 7, 2022, 8:52 a.m. UTC | #25
On Tue, Sep 06, 2022 at 02:21:46PM -0400, Joel Fernandes wrote:
> 
> 
> On 9/6/2022 2:16 PM, Uladzislau Rezki wrote:
> >> On Fri, Sep 02, 2022 at 05:21:32PM +0200, Frederic Weisbecker wrote:
> >>> On Thu, Sep 01, 2022 at 10:17:08PM +0000, Joel Fernandes (Google) wrote:
> >>>> Implement timer-based RCU lazy callback batching. The batch is flushed
> >>>> whenever a certain amount of time has passed, or the batch on a
> >>>> particular CPU grows too big. Also memory pressure will flush it in a
> >>>> future patch.
> >>>>
> >>>> To handle several corner cases automagically (such as rcu_barrier() and
> >>>> hotplug), we re-use bypass lists to handle lazy CBs. The bypass list
> >>>> length has the lazy CB length included in it. A separate lazy CB length
> >>>> counter is also introduced to keep track of the number of lazy CBs.
> >>>>
> >>>> Suggested-by: Paul McKenney <paulmck@kernel.org>
> >>>> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> >>>> ---
> >>
> >> Here is the updated version of this patch for further testing and review.
> >> Paul, you could consider updating your test branch. I have tested it in
> >> ChromeOS as well, and rcuscale. The laziness and boot times are looking good.
> >> There was at least one bug that I fixed that got introduced with the moving
> >> of the length field to rcu_data.  Thanks a lot Frederic for the review
> >> comments.
> >>
> >> I will look at the rcu torture issue next... I suspect the length field issue
> >> may have been causing it.
> >>
> >> ---8<-----------------------
> >>
> >> From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
> >> Subject: [PATCH v6] rcu: Introduce call_rcu_lazy() API implementation
> >>
> >> Implement timer-based RCU lazy callback batching. The batch is flushed
> >> whenever a certain amount of time has passed, or the batch on a
> >> particular CPU grows too big. Also memory pressure will flush it in a
> >> future patch.
> >>
> >> To handle several corner cases automagically (such as rcu_barrier() and
> >> hotplug), we re-use bypass lists to handle lazy CBs. The bypass list
> >> length has the lazy CB length included in it. A separate lazy CB length
> >> counter is also introduced to keep track of the number of lazy CBs.
> >>
> >> v5->v6:
> >>
> >> [ Frederic Weisbec: Program the lazy timer only if WAKE_NOT, since other
> >>   deferral levels wake much earlier so for those it is not needed. ]
> >>
> >> [ Frederic Weisbec: Use flush flags to keep bypass API code clean. ]
> >>
> >> [ Joel: Fix issue where I was not resetting lazy_len after moving it to rdp ]
> >>
> >> Suggested-by: Paul McKenney <paulmck@kernel.org>
> >> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> >> ---
> >>  include/linux/rcupdate.h |   6 ++
> >>  kernel/rcu/Kconfig       |   8 ++
> >>  kernel/rcu/rcu.h         |  11 +++
> >>  kernel/rcu/tree.c        | 133 +++++++++++++++++++----------
> >>  kernel/rcu/tree.h        |  17 +++-
> >>  kernel/rcu/tree_nocb.h   | 175 ++++++++++++++++++++++++++++++++-------
> >>  6 files changed, 269 insertions(+), 81 deletions(-)
> >>
> >> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> >> index 08605ce7379d..82e8a07e0856 100644
> >> --- a/include/linux/rcupdate.h
> >> +++ b/include/linux/rcupdate.h
> >> @@ -108,6 +108,12 @@ static inline int rcu_preempt_depth(void)
> >>  
> >>  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
> >>  
> >> +#ifdef CONFIG_RCU_LAZY
> >> +void call_rcu_lazy(struct rcu_head *head, rcu_callback_t func);
> >> +#else
> >> +#define call_rcu_lazy(head, func) call_rcu(head, func)
> >> +#endif
> >> +
> >>  /* Internal to kernel */
> >>  void rcu_init(void);
> >>  extern int rcu_scheduler_active;
> >> diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
> >> index d471d22a5e21..3128d01427cb 100644
> >> --- a/kernel/rcu/Kconfig
> >> +++ b/kernel/rcu/Kconfig
> >> @@ -311,4 +311,12 @@ config TASKS_TRACE_RCU_READ_MB
> >>  	  Say N here if you hate read-side memory barriers.
> >>  	  Take the default if you are unsure.
> >>  
> >> +config RCU_LAZY
> >> +	bool "RCU callback lazy invocation functionality"
> >> +	depends on RCU_NOCB_CPU
> >> +	default n
> >> +	help
> >> +	  To save power, batch RCU callbacks and flush after delay, memory
> >> +	  pressure or callback list growing too big.
> >> +
> >>  endmenu # "RCU Subsystem"
> >> diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
> >> index be5979da07f5..94675f14efe8 100644
> >> --- a/kernel/rcu/rcu.h
> >> +++ b/kernel/rcu/rcu.h
> >> @@ -474,6 +474,14 @@ enum rcutorture_type {
> >>  	INVALID_RCU_FLAVOR
> >>  };
> >>  
> >> +#if defined(CONFIG_RCU_LAZY)
> >> +unsigned long rcu_lazy_get_jiffies_till_flush(void);
> >> +void rcu_lazy_set_jiffies_till_flush(unsigned long j);
> >> +#else
> >> +static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; }
> >> +static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { }
> >> +#endif
> >> +
> >>  #if defined(CONFIG_TREE_RCU)
> >>  void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
> >>  			    unsigned long *gp_seq);
> >> @@ -483,6 +491,8 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
> >>  			       unsigned long c_old,
> >>  			       unsigned long c);
> >>  void rcu_gp_set_torture_wait(int duration);
> >> +void rcu_force_call_rcu_to_lazy(bool force);
> >> +
> >>  #else
> >>  static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
> >>  					  int *flags, unsigned long *gp_seq)
> >> @@ -501,6 +511,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
> >>  	do { } while (0)
> >>  #endif
> >>  static inline void rcu_gp_set_torture_wait(int duration) { }
> >> +static inline void rcu_force_call_rcu_to_lazy(bool force) { }
> >>  #endif
> >>  
> >>  #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
> >> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> >> index 9fe581be8696..dbd25b8c080e 100644
> >> --- a/kernel/rcu/tree.c
> >> +++ b/kernel/rcu/tree.c
> >> @@ -2728,47 +2728,8 @@ static void check_cb_ovld(struct rcu_data *rdp)
> >>  	raw_spin_unlock_rcu_node(rnp);
> >>  }
> >>  
> >> -/**
> >> - * call_rcu() - Queue an RCU callback for invocation after a grace period.
> >> - * @head: structure to be used for queueing the RCU updates.
> >> - * @func: actual callback function to be invoked after the grace period
> >> - *
> >> - * The callback function will be invoked some time after a full grace
> >> - * period elapses, in other words after all pre-existing RCU read-side
> >> - * critical sections have completed.  However, the callback function
> >> - * might well execute concurrently with RCU read-side critical sections
> >> - * that started after call_rcu() was invoked.
> >> - *
> >> - * RCU read-side critical sections are delimited by rcu_read_lock()
> >> - * and rcu_read_unlock(), and may be nested.  In addition, but only in
> >> - * v5.0 and later, regions of code across which interrupts, preemption,
> >> - * or softirqs have been disabled also serve as RCU read-side critical
> >> - * sections.  This includes hardware interrupt handlers, softirq handlers,
> >> - * and NMI handlers.
> >> - *
> >> - * Note that all CPUs must agree that the grace period extended beyond
> >> - * all pre-existing RCU read-side critical section.  On systems with more
> >> - * than one CPU, this means that when "func()" is invoked, each CPU is
> >> - * guaranteed to have executed a full memory barrier since the end of its
> >> - * last RCU read-side critical section whose beginning preceded the call
> >> - * to call_rcu().  It also means that each CPU executing an RCU read-side
> >> - * critical section that continues beyond the start of "func()" must have
> >> - * executed a memory barrier after the call_rcu() but before the beginning
> >> - * of that RCU read-side critical section.  Note that these guarantees
> >> - * include CPUs that are offline, idle, or executing in user mode, as
> >> - * well as CPUs that are executing in the kernel.
> >> - *
> >> - * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
> >> - * resulting RCU callback function "func()", then both CPU A and CPU B are
> >> - * guaranteed to execute a full memory barrier during the time interval
> >> - * between the call to call_rcu() and the invocation of "func()" -- even
> >> - * if CPU A and CPU B are the same CPU (but again only if the system has
> >> - * more than one CPU).
> >> - *
> >> - * Implementation of these memory-ordering guarantees is described here:
> >> - * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
> >> - */
> >> -void call_rcu(struct rcu_head *head, rcu_callback_t func)
> >> +static void
> >> +__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy)
> >>  {
> >>  	static atomic_t doublefrees;
> >>  	unsigned long flags;
> >> @@ -2818,7 +2779,7 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
> >>  		trace_rcu_callback(rcu_state.name, head,
> >>  				   rcu_segcblist_n_cbs(&rdp->cblist));
> >>  
> >> -	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
> >> +	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy))
> >>  		return; // Enqueued onto ->nocb_bypass, so just leave.
> >>  	// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
> >>  	rcu_segcblist_enqueue(&rdp->cblist, head);
> >> @@ -2833,8 +2794,86 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
> >>  		local_irq_restore(flags);
> >>  	}
> >>  }
> >> -EXPORT_SYMBOL_GPL(call_rcu);
> >>  
> >> +#ifdef CONFIG_RCU_LAZY
> >> +/**
> >> + * call_rcu_lazy() - Lazily queue RCU callback for invocation after grace period.
> >> + * @head: structure to be used for queueing the RCU updates.
> >> + * @func: actual callback function to be invoked after the grace period
> >> + *
> >> + * The callback function will be invoked some time after a full grace
> >> + * period elapses, in other words after all pre-existing RCU read-side
> >> + * critical sections have completed.
> >> + *
> >> + * Use this API instead of call_rcu() if you don't mind the callback being
> >> + * invoked after very long periods of time on systems without memory pressure
> >> + * and on systems which are lightly loaded or mostly idle.
> >> + *
> >> + * Other than the extra delay in callbacks being invoked, this function is
> >> + * identical to, and reuses call_rcu()'s logic. Refer to call_rcu() for more
> >> + * details about memory ordering and other functionality.
> >> + */
> >> +void call_rcu_lazy(struct rcu_head *head, rcu_callback_t func)
> >> +{
> >> +	return __call_rcu_common(head, func, true);
> >> +}
> >> +EXPORT_SYMBOL_GPL(call_rcu_lazy);
> >> +#endif
> >> +
> >> +static bool force_call_rcu_to_lazy;
> >> +
> >> +void rcu_force_call_rcu_to_lazy(bool force)
> >> +{
> >> +	if (IS_ENABLED(CONFIG_RCU_SCALE_TEST))
> >> +		WRITE_ONCE(force_call_rcu_to_lazy, force);
> >> +}
> >> +EXPORT_SYMBOL_GPL(rcu_force_call_rcu_to_lazy);
> >> +
> >> +/**
> >> + * call_rcu() - Queue an RCU callback for invocation after a grace period.
> >> + * @head: structure to be used for queueing the RCU updates.
> >> + * @func: actual callback function to be invoked after the grace period
> >> + *
> >> + * The callback function will be invoked some time after a full grace
> >> + * period elapses, in other words after all pre-existing RCU read-side
> >> + * critical sections have completed.  However, the callback function
> >> + * might well execute concurrently with RCU read-side critical sections
> >> + * that started after call_rcu() was invoked.
> >> + *
> >> + * RCU read-side critical sections are delimited by rcu_read_lock()
> >> + * and rcu_read_unlock(), and may be nested.  In addition, but only in
> >> + * v5.0 and later, regions of code across which interrupts, preemption,
> >> + * or softirqs have been disabled also serve as RCU read-side critical
> >> + * sections.  This includes hardware interrupt handlers, softirq handlers,
> >> + * and NMI handlers.
> >> + *
> >> + * Note that all CPUs must agree that the grace period extended beyond
> >> + * all pre-existing RCU read-side critical section.  On systems with more
> >> + * than one CPU, this means that when "func()" is invoked, each CPU is
> >> + * guaranteed to have executed a full memory barrier since the end of its
> >> + * last RCU read-side critical section whose beginning preceded the call
> >> + * to call_rcu().  It also means that each CPU executing an RCU read-side
> >> + * critical section that continues beyond the start of "func()" must have
> >> + * executed a memory barrier after the call_rcu() but before the beginning
> >> + * of that RCU read-side critical section.  Note that these guarantees
> >> + * include CPUs that are offline, idle, or executing in user mode, as
> >> + * well as CPUs that are executing in the kernel.
> >> + *
> >> + * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
> >> + * resulting RCU callback function "func()", then both CPU A and CPU B are
> >> + * guaranteed to execute a full memory barrier during the time interval
> >> + * between the call to call_rcu() and the invocation of "func()" -- even
> >> + * if CPU A and CPU B are the same CPU (but again only if the system has
> >> + * more than one CPU).
> >> + *
> >> + * Implementation of these memory-ordering guarantees is described here:
> >> + * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
> >> + */
> >> +void call_rcu(struct rcu_head *head, rcu_callback_t func)
> >> +{
> >> +	return __call_rcu_common(head, func, force_call_rcu_to_lazy);
> >> +}
> >> +EXPORT_SYMBOL_GPL(call_rcu);
> >>  
> >>  /* Maximum number of jiffies to wait before draining a batch. */
> >>  #define KFREE_DRAIN_JIFFIES (5 * HZ)
> >> @@ -3904,7 +3943,11 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
> >>  	rdp->barrier_head.func = rcu_barrier_callback;
> >>  	debug_rcu_head_queue(&rdp->barrier_head);
> >>  	rcu_nocb_lock(rdp);
> >> -	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
> >> +        /*
> >> +         * Flush the bypass list, but also wake up the GP thread as otherwise
> >> +         * bypass/lazy CBs maynot be noticed, and can cause real long delays!
> >> +         */
> >> +	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, FLUSH_BP_WAKE));
> >>  	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
> >>  		atomic_inc(&rcu_state.barrier_cpu_count);
> >>  	} else {
> >> @@ -4325,7 +4368,7 @@ void rcutree_migrate_callbacks(int cpu)
> >>  	my_rdp = this_cpu_ptr(&rcu_data);
> >>  	my_rnp = my_rdp->mynode;
> >>  	rcu_nocb_lock(my_rdp); /* irqs already disabled. */
> >> -	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
> >> +	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies, FLUSH_BP_NONE));
> >>  	raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
> >>  	/* Leverage recent GPs and set GP for new callbacks. */
> >>  	needwake = rcu_advance_cbs(my_rnp, rdp) ||
> >> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
> >> index d4a97e40ea9c..361c41d642c7 100644
> >> --- a/kernel/rcu/tree.h
> >> +++ b/kernel/rcu/tree.h
> >> @@ -263,14 +263,16 @@ struct rcu_data {
> >>  	unsigned long last_fqs_resched;	/* Time of last rcu_resched(). */
> >>  	unsigned long last_sched_clock;	/* Jiffies of last rcu_sched_clock_irq(). */
> >>  
> >> +	long lazy_len;			/* Length of buffered lazy callbacks. */
> >>  	int cpu;
> >>  };
> >>  
> >>  /* Values for nocb_defer_wakeup field in struct rcu_data. */
> >>  #define RCU_NOCB_WAKE_NOT	0
> >>  #define RCU_NOCB_WAKE_BYPASS	1
> >> -#define RCU_NOCB_WAKE		2
> >> -#define RCU_NOCB_WAKE_FORCE	3
> >> +#define RCU_NOCB_WAKE_LAZY	2
> >> +#define RCU_NOCB_WAKE		3
> >> +#define RCU_NOCB_WAKE_FORCE	4
> >>  
> >>  #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
> >>  					/* For jiffies_till_first_fqs and */
> >> @@ -439,10 +441,17 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
> >>  static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
> >>  static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
> >>  static void rcu_init_one_nocb(struct rcu_node *rnp);
> >> +
> >> +#define FLUSH_BP_NONE 0
> >> +/* Is the CB being enqueued after the flush, a lazy CB? */
> >> +#define FLUSH_BP_LAZY BIT(0)
> >> +/* Wake up nocb-GP thread after flush? */
> >> +#define FLUSH_BP_WAKE BIT(1)
> >>  static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
> >> -				  unsigned long j);
> >> +				  unsigned long j, unsigned long flush_flags);
> >>  static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
> >> -				bool *was_alldone, unsigned long flags);
> >> +				bool *was_alldone, unsigned long flags,
> >> +				bool lazy);
> >>  static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
> >>  				 unsigned long flags);
> >>  static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
> >> diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
> >> index 4dc86274b3e8..b201606f7c4f 100644
> >> --- a/kernel/rcu/tree_nocb.h
> >> +++ b/kernel/rcu/tree_nocb.h
> >> @@ -256,6 +256,31 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
> >>  	return __wake_nocb_gp(rdp_gp, rdp, force, flags);
> >>  }
> >>  
> >> +/*
> >> + * LAZY_FLUSH_JIFFIES decides the maximum amount of time that
> >> + * can elapse before lazy callbacks are flushed. Lazy callbacks
> >> + * could be flushed much earlier for a number of other reasons
> >> + * however, LAZY_FLUSH_JIFFIES will ensure no lazy callbacks are
> >> + * left unsubmitted to RCU after those many jiffies.
> >> + */
> >> +#define LAZY_FLUSH_JIFFIES (10 * HZ)
> >> +unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES;
> >> +
> >> +#ifdef CONFIG_RCU_LAZY
> >> +// To be called only from test code.
> >> +void rcu_lazy_set_jiffies_till_flush(unsigned long jif)
> >> +{
> >> +	jiffies_till_flush = jif;
> >> +}
> >> +EXPORT_SYMBOL(rcu_lazy_set_jiffies_till_flush);
> >> +
> >> +unsigned long rcu_lazy_get_jiffies_till_flush(void)
> >> +{
> >> +	return jiffies_till_flush;
> >> +}
> >> +EXPORT_SYMBOL(rcu_lazy_get_jiffies_till_flush);
> >> +#endif
> >> +
> >>  /*
> >>   * Arrange to wake the GP kthread for this NOCB group at some future
> >>   * time when it is safe to do so.
> >> @@ -269,10 +294,14 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
> >>  	raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
> >>  
> >>  	/*
> >> -	 * Bypass wakeup overrides previous deferments. In case
> >> -	 * of callback storm, no need to wake up too early.
> >> +	 * Bypass wakeup overrides previous deferments. In case of
> >> +	 * callback storm, no need to wake up too early.
> >>  	 */
> >> -	if (waketype == RCU_NOCB_WAKE_BYPASS) {
> >> +	if (waketype == RCU_NOCB_WAKE_LAZY
> >> +		&& READ_ONCE(rdp->nocb_defer_wakeup) == RCU_NOCB_WAKE_NOT) {
> >> +		mod_timer(&rdp_gp->nocb_timer, jiffies + jiffies_till_flush);
> >> +		WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
> >> +	} else if (waketype == RCU_NOCB_WAKE_BYPASS) {
> >>  		mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
> >>  		WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
> >>  	} else {
> >>
> > Joel, i have a question here. I see that lazy callback just makes the GP
> > start later where a regular call_rcu() API will instead cut it back.
> > Could you please clarify how it solves the sequence like:
> > 
> > CPU:
> > 
> > 1) task A -> call_rcu_lazy();
> > 2) task B -> call_rcu();
> > 
> > so lazily decision about running GP later a task B overrides to initiate it sooner? 
> 
> Is your question that task B will make task A's CB run sooner? Yes that's right.
> The reason is that the point of call_rcu_lazy() is to delay GP start as much as
> possible to keep system quiet. However, if a call_rcu() comes in any way, we
> have to start a GP soon. So we might as well take advantage of that for the lazy
> one as we are paying the full price for the new GP.
> 
> It is kind of like RCU's amortizing behavior in that sense, however we put that
> into overdrive to be greedy for power.
> 
> Does that answer your question?
> 
OK, so I understood it correctly then :) IMHO, the problem with such an
approach is that we may end up driving the RCU core forward in the non-lazy
way even though we queue lazily and only one user does not.

According to my observations, our setup suffers from many wake-ups across the
CPUs, and mixing the non-lazy way with the lazy way will just degenerate into
the regular pattern.

I think adding some kind of high-water mark on "entry" would solve it. Any
thoughts here?

Joel, do you see that v6 makes the system more idle from an RCU perspective on
the Intel SoC?

On my side, I will port v6 onto the 5.10 kernel and run some tests today.

Thanks!

--
Uladzislau Rezki
Frederic Weisbecker Sept. 7, 2022, 9:40 a.m. UTC | #26
On Wed, Sep 07, 2022 at 12:06:26AM +0000, Joel Fernandes wrote:
> > > @@ -326,13 +372,20 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
> > >   * Note that this function always returns true if rhp is NULL.
> > >   */
> > >  static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
> > > -				  unsigned long j)
> > > +				  unsigned long j, unsigned long flush_flags)
> > >  {
> > > +	bool ret;
> > > +
> > >  	if (!rcu_rdp_is_offloaded(rdp))
> > >  		return true;
> > >  	rcu_lockdep_assert_cblist_protected(rdp);
> > >  	rcu_nocb_bypass_lock(rdp);
> > > -	return rcu_nocb_do_flush_bypass(rdp, rhp, j);
> > > +	ret = rcu_nocb_do_flush_bypass(rdp, rhp, j, flush_flags);
> > > +
> > > +	if (flush_flags & FLUSH_BP_WAKE)
> > > +		wake_nocb_gp(rdp, true);
> > 
> > Why the true above?
> > 
> > Also should we check if the wake up is really necessary (otherwise it means we
> > force a wake up for all rdp's from rcu_barrier())?
> > 
> >        was_alldone = rcu_segcblist_pend_cbs(&rdp->cblist);
> >        ret = rcu_nocb_do_flush_bypass(rdp, rhp, j, flush_flags);
> >        if (was_alldone && rcu_segcblist_pend_cbs(&rdp->cblist))
> >        	  wake_nocb_gp(rdp, false);
> 
> You mean something like the following, right? Though I'm wondering if it's
> better to call wake_nocb_gp() from tree.c in entrain() and let that handle
> the wake. That way, we can get rid of the extra FLUSH_BP flags as well and
> let the flush callers deal with the wakeups...

Ah yes that could make sense if only one caller cares.

> 
> Anyway, for testing this should be good...
> 
> ---8<-----------------------
> 
> diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
> index bd8f39ee2cd0..e3344c262672 100644
> --- a/kernel/rcu/tree_nocb.h
> +++ b/kernel/rcu/tree_nocb.h
> @@ -382,15 +382,19 @@ static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>  				  unsigned long j, unsigned long flush_flags)
>  {
>  	bool ret;
> +	bool was_alldone;
>  
>  	if (!rcu_rdp_is_offloaded(rdp))
>  		return true;
>  	rcu_lockdep_assert_cblist_protected(rdp);
>  	rcu_nocb_bypass_lock(rdp);
> +	if (flush_flags & FLUSH_BP_WAKE)
> +		was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
> +

You can check that outside bypass lock (but you still need nocb_lock).

>  	ret = rcu_nocb_do_flush_bypass(rdp, rhp, j, flush_flags);
>  
> -	if (flush_flags & FLUSH_BP_WAKE)
> -		wake_nocb_gp(rdp, true);
> +	if (flush_flags & FLUSH_BP_WAKE && was_alldone)
> +		wake_nocb_gp(rdp, false);

That doesn't check if the bypass list was empty.

Thanks.
Frederic Weisbecker Sept. 7, 2022, 9:56 a.m. UTC | #27
On Tue, Sep 06, 2022 at 10:56:01PM -0400, Joel Fernandes wrote:
> On the issue of regressions with non-lazy things being treated as lazy, I was
> thinking of adding a bounded-time check to:
> 
> [PATCH v5 08/18] rcu: Add per-CB tracing for queuing, flush and invocation.
> 
> Where, if a non-lazy CB takes an abnormally long time to execute (say it was
> subject to a race condition), it would splat. This can be done because I am
> tracking the queue time in the rcu_head in that patch.
> 
> On another note, boot time regressions show up pretty quickly (at least on
> ChromeOS) when non-lazy things become lazy, and so far, with the latest code, it
> has fortunately been pretty well behaved.

Makes sense. We definitely need some sort of detection for delayed non-lazy
callbacks.

Thanks!

> 
> Thanks,
> 
>  - Joel
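
As a rough illustration of the kind of detection being discussed, such a
bounded-time check might look like the sketch below; the helper name, the
was_lazy flag, and the queue_jiffies bookkeeping are assumptions here rather
than code from the posted per-CB tracing patch:

/*
 * Sketch only: warn if a callback that was *not* queued lazily sat
 * around longer than the lazy flush window. Assumes the queueing time
 * was recorded (in jiffies) when call_rcu()/call_rcu_lazy() ran.
 */
static void rcu_check_cb_delay(struct rcu_head *rhp, bool was_lazy,
			       unsigned long queue_jiffies)
{
	unsigned long delay = jiffies - queue_jiffies;

	/* Non-lazy CBs should never have to wait out the full lazy timer. */
	WARN_ONCE(!was_lazy && delay > LAZY_FLUSH_JIFFIES,
		  "non-lazy RCU callback %ps delayed %lu jiffies\n",
		  rhp->func, delay);
}
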
Frederic Weisbecker Sept. 7, 2022, 10:03 a.m. UTC | #28
On Tue, Sep 06, 2022 at 12:15:19PM -0400, Joel Fernandes wrote:
> >> +
> >> +	// We had CBs in the bypass list before. There is nothing else to do if:
> >> +	// There were only non-lazy CBs before, in this case, the bypass timer
> > 
> > Kind of misleading. I would replace "There were only non-lazy CBs before" with
> > "There was at least one non-lazy CBs before".
> 
> I really mean "There were only non-lazy CBs ever queued in the bypass list
> before". That's the bypass_is_lazy variable. So I did not fully understand your
> suggested comment change.

I may well be missing something but to me it seems that:

bypass_is_lazy = all bypass callbacks are lazy
!bypass_is_lazy = there is at least one non-lazy bypass callback

And indeed as long as there is at least one non-lazy callback, we don't
want to rely on the LAZY timer.

Am I overlooking something?

> 
> >> +	// or GP-thread will handle the CBs including any new lazy ones.
> >> +	// Or, the new CB is lazy and the old bypass-CBs were also lazy. In this
> >> +	// case the old lazy timer would have been setup. When that expires,
> >> +	// the new lazy one will be handled.
> >> +	if (ncbs && (!bypass_is_lazy || lazy)) {
> >>  		local_irq_restore(flags);
> >>  	} else {
> >>  		// No-CBs GP kthread might be indefinitely asleep, if so, wake.
Joel Fernandes Sept. 7, 2022, 1:44 p.m. UTC | #29
Hi Frederic,

On 9/7/2022 5:40 AM, Frederic Weisbecker wrote:
>>
>> diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
>> index bd8f39ee2cd0..e3344c262672 100644
>> --- a/kernel/rcu/tree_nocb.h
>> +++ b/kernel/rcu/tree_nocb.h
>> @@ -382,15 +382,19 @@ static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>>  				  unsigned long j, unsigned long flush_flags)
>>  {
>>  	bool ret;
>> +	bool was_alldone;
>>  
>>  	if (!rcu_rdp_is_offloaded(rdp))
>>  		return true;
>>  	rcu_lockdep_assert_cblist_protected(rdp);
>>  	rcu_nocb_bypass_lock(rdp);
>> +	if (flush_flags & FLUSH_BP_WAKE)
>> +		was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
>> +
> You can check that outside bypass lock (but you still need nocb_lock).

Right, ok. I can make it outside the bypass lock, and the nocb_lock is implied
by rcu_lockdep_assert_cblist_protected().

> 
>>  	ret = rcu_nocb_do_flush_bypass(rdp, rhp, j, flush_flags);
>>  
>> -	if (flush_flags & FLUSH_BP_WAKE)
>> -		wake_nocb_gp(rdp, true);
>> +	if (flush_flags & FLUSH_BP_WAKE && was_alldone)
>> +		wake_nocb_gp(rdp, false);
> That doesn't check if the bypass list was empty.

Thanks, I will add your suggested optimization. However, I have a general question:

For the case where the bypass list is empty, where does rcu_barrier() do a wake
up of the nocb GP thread after entrain()?

I don't see a call to __call_rcu_nocb_wake() like rcutree_migrate_callbacks()
does. Is the wake up done in some other path?

thanks,

 - Joel
Joel Fernandes Sept. 7, 2022, 2:01 p.m. UTC | #30
On 9/7/2022 6:03 AM, Frederic Weisbecker wrote:
> On Tue, Sep 06, 2022 at 12:15:19PM -0400, Joel Fernandes wrote:
>>>> +
>>>> +	// We had CBs in the bypass list before. There is nothing else to do if:
>>>> +	// There were only non-lazy CBs before, in this case, the bypass timer
>>>
>>> Kind of misleading. I would replace "There were only non-lazy CBs before" with
>>> "There was at least one non-lazy CBs before".
>>
>> I really mean "There were only non-lazy CBs ever queued in the bypass list
>> before". That's the bypass_is_lazy variable. So I did not fully understand your
>> suggested comment change.
> 
> I may well be missing something but to me it seems that:
> 
> bypass_is_lazy = all bypass callbacks are lazy
> !bypass_is_lazy = there is at least one non-lazy bypass callback
> 
> And indeed as long as there is at least one non-lazy callback, we don't
> want to rely on the LAZY timer.
> 
> Am I overlooking something?

No, you are not overlooking anything, and you are right that I may need to change
the comment.

To clarify my intent, a wake up or timer adjustment needs to be done only if:

1. Bypass list was fully empty before (this is the first bypass list entry).

Or both the below conditions are met:

  a. Bypass list had only lazy CBs before.

  b. The new CB is non-lazy.

Instead of saying, "nothing needs to be done if...", I will change it to:

        // A wake up of the grace period kthread or timer adjustment needs to
        // be done only if:
        // 1. Bypass list was empty before (this is the first bypass queue).
        //      Or, both the below conditions are met:
        //   a. Bypass list had only lazy CBs before.
        //   b. The new CB is non-lazy.

That sounds less confusing...

Or, I can just make the edit you suggested... let me know either way!
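
Expressed as code, the same decision reads roughly as follows (a sketch only;
the helper name is made up, and the series keeps this logic open-coded in
rcu_nocb_try_bypass() using the existing ncbs, bypass_is_lazy and lazy locals):

/* Sketch of the wake/timer-adjustment condition discussed above. */
static bool lazy_bypass_needs_wake(long ncbs, bool bypass_is_lazy, bool lazy)
{
	/* First bypass entry: the GP kthread may be asleep with no timer armed. */
	if (!ncbs)
		return true;

	/*
	 * A non-lazy CB landing on an all-lazy bypass list must not be made
	 * to wait out the (much longer) lazy flush timer.
	 */
	return bypass_is_lazy && !lazy;
}
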

Thanks!

 - Joel
Joel Fernandes Sept. 7, 2022, 3:23 p.m. UTC | #31
Hi Vlad,

On 9/7/2022 4:52 AM, Uladzislau Rezki wrote:
> On Tue, Sep 06, 2022 at 02:21:46PM -0400, Joel Fernandes wrote:
>>
>>
>> On 9/6/2022 2:16 PM, Uladzislau Rezki wrote:
>>>> On Fri, Sep 02, 2022 at 05:21:32PM +0200, Frederic Weisbecker wrote:
>>>>> On Thu, Sep 01, 2022 at 10:17:08PM +0000, Joel Fernandes (Google) wrote:
>>>>>> Implement timer-based RCU lazy callback batching. The batch is flushed
>>>>>> whenever a certain amount of time has passed, or the batch on a
>>>>>> particular CPU grows too big. Also memory pressure will flush it in a
>>>>>> future patch.
>>>>>>
>>>>>> To handle several corner cases automagically (such as rcu_barrier() and
>>>>>> hotplug), we re-use bypass lists to handle lazy CBs. The bypass list
>>>>>> length has the lazy CB length included in it. A separate lazy CB length
>>>>>> counter is also introduced to keep track of the number of lazy CBs.
>>>>>>
>>>>>> Suggested-by: Paul McKenney <paulmck@kernel.org>
>>>>>> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
>>>>>> ---
>>>>
>>>> Here is the updated version of this patch for further testing and review.
>>>> Paul, you could consider updating your test branch. I have tested it in
>>>> ChromeOS as well, and rcuscale. The laziness and boot times are looking good.
>>>> There was at least one bug that I fixed that got introduced with the moving
>>>> of the length field to rcu_data.  Thanks a lot Frederic for the review
>>>> comments.
>>>>
>>>> I will look at the rcu torture issue next... I suspect the length field issue
>>>> may have been causing it.
>>>>
>>>> ---8<-----------------------
>>>>
>>>> From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
>>>> Subject: [PATCH v6] rcu: Introduce call_rcu_lazy() API implementation
>>>>
>>>> Implement timer-based RCU lazy callback batching. The batch is flushed
>>>> whenever a certain amount of time has passed, or the batch on a
>>>> particular CPU grows too big. Also memory pressure will flush it in a
>>>> future patch.
>>>>
>>>> To handle several corner cases automagically (such as rcu_barrier() and
>>>> hotplug), we re-use bypass lists to handle lazy CBs. The bypass list
>>>> length has the lazy CB length included in it. A separate lazy CB length
>>>> counter is also introduced to keep track of the number of lazy CBs.
>>>>
>>>> v5->v6:
>>>>
>>>> [ Frederic Weisbec: Program the lazy timer only if WAKE_NOT, since other
>>>>   deferral levels wake much earlier so for those it is not needed. ]
>>>>
>>>> [ Frederic Weisbec: Use flush flags to keep bypass API code clean. ]
>>>>
>>>> [ Joel: Fix issue where I was not resetting lazy_len after moving it to rdp ]
>>>>
>>>> Suggested-by: Paul McKenney <paulmck@kernel.org>
>>>> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
>>>> ---
>>>>  include/linux/rcupdate.h |   6 ++
>>>>  kernel/rcu/Kconfig       |   8 ++
>>>>  kernel/rcu/rcu.h         |  11 +++
>>>>  kernel/rcu/tree.c        | 133 +++++++++++++++++++----------
>>>>  kernel/rcu/tree.h        |  17 +++-
>>>>  kernel/rcu/tree_nocb.h   | 175 ++++++++++++++++++++++++++++++++-------
>>>>  6 files changed, 269 insertions(+), 81 deletions(-)
>>>>
>>>> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
>>>> index 08605ce7379d..82e8a07e0856 100644
>>>> --- a/include/linux/rcupdate.h
>>>> +++ b/include/linux/rcupdate.h
>>>> @@ -108,6 +108,12 @@ static inline int rcu_preempt_depth(void)
>>>>  
>>>>  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
>>>>  
>>>> +#ifdef CONFIG_RCU_LAZY
>>>> +void call_rcu_lazy(struct rcu_head *head, rcu_callback_t func);
>>>> +#else
>>>> +#define call_rcu_lazy(head, func) call_rcu(head, func)
>>>> +#endif
>>>> +
>>>>  /* Internal to kernel */
>>>>  void rcu_init(void);
>>>>  extern int rcu_scheduler_active;
>>>> diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
>>>> index d471d22a5e21..3128d01427cb 100644
>>>> --- a/kernel/rcu/Kconfig
>>>> +++ b/kernel/rcu/Kconfig
>>>> @@ -311,4 +311,12 @@ config TASKS_TRACE_RCU_READ_MB
>>>>  	  Say N here if you hate read-side memory barriers.
>>>>  	  Take the default if you are unsure.
>>>>  
>>>> +config RCU_LAZY
>>>> +	bool "RCU callback lazy invocation functionality"
>>>> +	depends on RCU_NOCB_CPU
>>>> +	default n
>>>> +	help
>>>> +	  To save power, batch RCU callbacks and flush after delay, memory
>>>> +	  pressure or callback list growing too big.
>>>> +
>>>>  endmenu # "RCU Subsystem"
>>>> diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
>>>> index be5979da07f5..94675f14efe8 100644
>>>> --- a/kernel/rcu/rcu.h
>>>> +++ b/kernel/rcu/rcu.h
>>>> @@ -474,6 +474,14 @@ enum rcutorture_type {
>>>>  	INVALID_RCU_FLAVOR
>>>>  };
>>>>  
>>>> +#if defined(CONFIG_RCU_LAZY)
>>>> +unsigned long rcu_lazy_get_jiffies_till_flush(void);
>>>> +void rcu_lazy_set_jiffies_till_flush(unsigned long j);
>>>> +#else
>>>> +static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; }
>>>> +static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { }
>>>> +#endif
>>>> +
>>>>  #if defined(CONFIG_TREE_RCU)
>>>>  void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
>>>>  			    unsigned long *gp_seq);
>>>> @@ -483,6 +491,8 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
>>>>  			       unsigned long c_old,
>>>>  			       unsigned long c);
>>>>  void rcu_gp_set_torture_wait(int duration);
>>>> +void rcu_force_call_rcu_to_lazy(bool force);
>>>> +
>>>>  #else
>>>>  static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
>>>>  					  int *flags, unsigned long *gp_seq)
>>>> @@ -501,6 +511,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
>>>>  	do { } while (0)
>>>>  #endif
>>>>  static inline void rcu_gp_set_torture_wait(int duration) { }
>>>> +static inline void rcu_force_call_rcu_to_lazy(bool force) { }
>>>>  #endif
>>>>  
>>>>  #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
>>>> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
>>>> index 9fe581be8696..dbd25b8c080e 100644
>>>> --- a/kernel/rcu/tree.c
>>>> +++ b/kernel/rcu/tree.c
>>>> @@ -2728,47 +2728,8 @@ static void check_cb_ovld(struct rcu_data *rdp)
>>>>  	raw_spin_unlock_rcu_node(rnp);
>>>>  }
>>>>  
>>>> -/**
>>>> - * call_rcu() - Queue an RCU callback for invocation after a grace period.
>>>> - * @head: structure to be used for queueing the RCU updates.
>>>> - * @func: actual callback function to be invoked after the grace period
>>>> - *
>>>> - * The callback function will be invoked some time after a full grace
>>>> - * period elapses, in other words after all pre-existing RCU read-side
>>>> - * critical sections have completed.  However, the callback function
>>>> - * might well execute concurrently with RCU read-side critical sections
>>>> - * that started after call_rcu() was invoked.
>>>> - *
>>>> - * RCU read-side critical sections are delimited by rcu_read_lock()
>>>> - * and rcu_read_unlock(), and may be nested.  In addition, but only in
>>>> - * v5.0 and later, regions of code across which interrupts, preemption,
>>>> - * or softirqs have been disabled also serve as RCU read-side critical
>>>> - * sections.  This includes hardware interrupt handlers, softirq handlers,
>>>> - * and NMI handlers.
>>>> - *
>>>> - * Note that all CPUs must agree that the grace period extended beyond
>>>> - * all pre-existing RCU read-side critical section.  On systems with more
>>>> - * than one CPU, this means that when "func()" is invoked, each CPU is
>>>> - * guaranteed to have executed a full memory barrier since the end of its
>>>> - * last RCU read-side critical section whose beginning preceded the call
>>>> - * to call_rcu().  It also means that each CPU executing an RCU read-side
>>>> - * critical section that continues beyond the start of "func()" must have
>>>> - * executed a memory barrier after the call_rcu() but before the beginning
>>>> - * of that RCU read-side critical section.  Note that these guarantees
>>>> - * include CPUs that are offline, idle, or executing in user mode, as
>>>> - * well as CPUs that are executing in the kernel.
>>>> - *
>>>> - * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
>>>> - * resulting RCU callback function "func()", then both CPU A and CPU B are
>>>> - * guaranteed to execute a full memory barrier during the time interval
>>>> - * between the call to call_rcu() and the invocation of "func()" -- even
>>>> - * if CPU A and CPU B are the same CPU (but again only if the system has
>>>> - * more than one CPU).
>>>> - *
>>>> - * Implementation of these memory-ordering guarantees is described here:
>>>> - * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
>>>> - */
>>>> -void call_rcu(struct rcu_head *head, rcu_callback_t func)
>>>> +static void
>>>> +__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy)
>>>>  {
>>>>  	static atomic_t doublefrees;
>>>>  	unsigned long flags;
>>>> @@ -2818,7 +2779,7 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
>>>>  		trace_rcu_callback(rcu_state.name, head,
>>>>  				   rcu_segcblist_n_cbs(&rdp->cblist));
>>>>  
>>>> -	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
>>>> +	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy))
>>>>  		return; // Enqueued onto ->nocb_bypass, so just leave.
>>>>  	// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
>>>>  	rcu_segcblist_enqueue(&rdp->cblist, head);
>>>> @@ -2833,8 +2794,86 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
>>>>  		local_irq_restore(flags);
>>>>  	}
>>>>  }
>>>> -EXPORT_SYMBOL_GPL(call_rcu);
>>>>  
>>>> +#ifdef CONFIG_RCU_LAZY
>>>> +/**
>>>> + * call_rcu_lazy() - Lazily queue RCU callback for invocation after grace period.
>>>> + * @head: structure to be used for queueing the RCU updates.
>>>> + * @func: actual callback function to be invoked after the grace period
>>>> + *
>>>> + * The callback function will be invoked some time after a full grace
>>>> + * period elapses, in other words after all pre-existing RCU read-side
>>>> + * critical sections have completed.
>>>> + *
>>>> + * Use this API instead of call_rcu() if you don't mind the callback being
>>>> + * invoked after very long periods of time on systems without memory pressure
>>>> + * and on systems which are lightly loaded or mostly idle.
>>>> + *
>>>> + * Other than the extra delay in callbacks being invoked, this function is
>>>> + * identical to, and reuses call_rcu()'s logic. Refer to call_rcu() for more
>>>> + * details about memory ordering and other functionality.
>>>> + */
>>>> +void call_rcu_lazy(struct rcu_head *head, rcu_callback_t func)
>>>> +{
>>>> +	return __call_rcu_common(head, func, true);
>>>> +}
>>>> +EXPORT_SYMBOL_GPL(call_rcu_lazy);
>>>> +#endif
>>>> +
>>>> +static bool force_call_rcu_to_lazy;
>>>> +
>>>> +void rcu_force_call_rcu_to_lazy(bool force)
>>>> +{
>>>> +	if (IS_ENABLED(CONFIG_RCU_SCALE_TEST))
>>>> +		WRITE_ONCE(force_call_rcu_to_lazy, force);
>>>> +}
>>>> +EXPORT_SYMBOL_GPL(rcu_force_call_rcu_to_lazy);
>>>> +
>>>> +/**
>>>> + * call_rcu() - Queue an RCU callback for invocation after a grace period.
>>>> + * @head: structure to be used for queueing the RCU updates.
>>>> + * @func: actual callback function to be invoked after the grace period
>>>> + *
>>>> + * The callback function will be invoked some time after a full grace
>>>> + * period elapses, in other words after all pre-existing RCU read-side
>>>> + * critical sections have completed.  However, the callback function
>>>> + * might well execute concurrently with RCU read-side critical sections
>>>> + * that started after call_rcu() was invoked.
>>>> + *
>>>> + * RCU read-side critical sections are delimited by rcu_read_lock()
>>>> + * and rcu_read_unlock(), and may be nested.  In addition, but only in
>>>> + * v5.0 and later, regions of code across which interrupts, preemption,
>>>> + * or softirqs have been disabled also serve as RCU read-side critical
>>>> + * sections.  This includes hardware interrupt handlers, softirq handlers,
>>>> + * and NMI handlers.
>>>> + *
>>>> + * Note that all CPUs must agree that the grace period extended beyond
>>>> + * all pre-existing RCU read-side critical section.  On systems with more
>>>> + * than one CPU, this means that when "func()" is invoked, each CPU is
>>>> + * guaranteed to have executed a full memory barrier since the end of its
>>>> + * last RCU read-side critical section whose beginning preceded the call
>>>> + * to call_rcu().  It also means that each CPU executing an RCU read-side
>>>> + * critical section that continues beyond the start of "func()" must have
>>>> + * executed a memory barrier after the call_rcu() but before the beginning
>>>> + * of that RCU read-side critical section.  Note that these guarantees
>>>> + * include CPUs that are offline, idle, or executing in user mode, as
>>>> + * well as CPUs that are executing in the kernel.
>>>> + *
>>>> + * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
>>>> + * resulting RCU callback function "func()", then both CPU A and CPU B are
>>>> + * guaranteed to execute a full memory barrier during the time interval
>>>> + * between the call to call_rcu() and the invocation of "func()" -- even
>>>> + * if CPU A and CPU B are the same CPU (but again only if the system has
>>>> + * more than one CPU).
>>>> + *
>>>> + * Implementation of these memory-ordering guarantees is described here:
>>>> + * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
>>>> + */
>>>> +void call_rcu(struct rcu_head *head, rcu_callback_t func)
>>>> +{
>>>> +	return __call_rcu_common(head, func, force_call_rcu_to_lazy);
>>>> +}
>>>> +EXPORT_SYMBOL_GPL(call_rcu);
>>>>  
>>>>  /* Maximum number of jiffies to wait before draining a batch. */
>>>>  #define KFREE_DRAIN_JIFFIES (5 * HZ)
>>>> @@ -3904,7 +3943,11 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
>>>>  	rdp->barrier_head.func = rcu_barrier_callback;
>>>>  	debug_rcu_head_queue(&rdp->barrier_head);
>>>>  	rcu_nocb_lock(rdp);
>>>> -	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
>>>> +        /*
>>>> +         * Flush the bypass list, but also wake up the GP thread as otherwise
>>>> +         * bypass/lazy CBs may not be noticed, and can cause really long delays!
>>>> +         */
>>>> +	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, FLUSH_BP_WAKE));
>>>>  	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
>>>>  		atomic_inc(&rcu_state.barrier_cpu_count);
>>>>  	} else {
>>>> @@ -4325,7 +4368,7 @@ void rcutree_migrate_callbacks(int cpu)
>>>>  	my_rdp = this_cpu_ptr(&rcu_data);
>>>>  	my_rnp = my_rdp->mynode;
>>>>  	rcu_nocb_lock(my_rdp); /* irqs already disabled. */
>>>> -	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
>>>> +	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies, FLUSH_BP_NONE));
>>>>  	raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
>>>>  	/* Leverage recent GPs and set GP for new callbacks. */
>>>>  	needwake = rcu_advance_cbs(my_rnp, rdp) ||
>>>> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
>>>> index d4a97e40ea9c..361c41d642c7 100644
>>>> --- a/kernel/rcu/tree.h
>>>> +++ b/kernel/rcu/tree.h
>>>> @@ -263,14 +263,16 @@ struct rcu_data {
>>>>  	unsigned long last_fqs_resched;	/* Time of last rcu_resched(). */
>>>>  	unsigned long last_sched_clock;	/* Jiffies of last rcu_sched_clock_irq(). */
>>>>  
>>>> +	long lazy_len;			/* Length of buffered lazy callbacks. */
>>>>  	int cpu;
>>>>  };
>>>>  
>>>>  /* Values for nocb_defer_wakeup field in struct rcu_data. */
>>>>  #define RCU_NOCB_WAKE_NOT	0
>>>>  #define RCU_NOCB_WAKE_BYPASS	1
>>>> -#define RCU_NOCB_WAKE		2
>>>> -#define RCU_NOCB_WAKE_FORCE	3
>>>> +#define RCU_NOCB_WAKE_LAZY	2
>>>> +#define RCU_NOCB_WAKE		3
>>>> +#define RCU_NOCB_WAKE_FORCE	4
>>>>  
>>>>  #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
>>>>  					/* For jiffies_till_first_fqs and */
>>>> @@ -439,10 +441,17 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
>>>>  static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
>>>>  static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
>>>>  static void rcu_init_one_nocb(struct rcu_node *rnp);
>>>> +
>>>> +#define FLUSH_BP_NONE 0
>>>> +/* Is the CB being enqueued after the flush, a lazy CB? */
>>>> +#define FLUSH_BP_LAZY BIT(0)
>>>> +/* Wake up nocb-GP thread after flush? */
>>>> +#define FLUSH_BP_WAKE BIT(1)
>>>>  static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>>>> -				  unsigned long j);
>>>> +				  unsigned long j, unsigned long flush_flags);
>>>>  static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>>>> -				bool *was_alldone, unsigned long flags);
>>>> +				bool *was_alldone, unsigned long flags,
>>>> +				bool lazy);
>>>>  static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
>>>>  				 unsigned long flags);
>>>>  static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
>>>> diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
>>>> index 4dc86274b3e8..b201606f7c4f 100644
>>>> --- a/kernel/rcu/tree_nocb.h
>>>> +++ b/kernel/rcu/tree_nocb.h
>>>> @@ -256,6 +256,31 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
>>>>  	return __wake_nocb_gp(rdp_gp, rdp, force, flags);
>>>>  }
>>>>  
>>>> +/*
>>>> + * LAZY_FLUSH_JIFFIES decides the maximum amount of time that
>>>> + * can elapse before lazy callbacks are flushed. Lazy callbacks
>>>> + * could be flushed much earlier for a number of other reasons
>>>> + * however, LAZY_FLUSH_JIFFIES will ensure no lazy callbacks are
>>>> + * left unsubmitted to RCU after those many jiffies.
>>>> + */
>>>> +#define LAZY_FLUSH_JIFFIES (10 * HZ)
>>>> +unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES;
>>>> +
>>>> +#ifdef CONFIG_RCU_LAZY
>>>> +// To be called only from test code.
>>>> +void rcu_lazy_set_jiffies_till_flush(unsigned long jif)
>>>> +{
>>>> +	jiffies_till_flush = jif;
>>>> +}
>>>> +EXPORT_SYMBOL(rcu_lazy_set_jiffies_till_flush);
>>>> +
>>>> +unsigned long rcu_lazy_get_jiffies_till_flush(void)
>>>> +{
>>>> +	return jiffies_till_flush;
>>>> +}
>>>> +EXPORT_SYMBOL(rcu_lazy_get_jiffies_till_flush);
>>>> +#endif
>>>> +
>>>>  /*
>>>>   * Arrange to wake the GP kthread for this NOCB group at some future
>>>>   * time when it is safe to do so.
>>>> @@ -269,10 +294,14 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
>>>>  	raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
>>>>  
>>>>  	/*
>>>> -	 * Bypass wakeup overrides previous deferments. In case
>>>> -	 * of callback storm, no need to wake up too early.
>>>> +	 * Bypass wakeup overrides previous deferments. In case of
>>>> +	 * callback storm, no need to wake up too early.
>>>>  	 */
>>>> -	if (waketype == RCU_NOCB_WAKE_BYPASS) {
>>>> +	if (waketype == RCU_NOCB_WAKE_LAZY
>>>> +		&& READ_ONCE(rdp->nocb_defer_wakeup) == RCU_NOCB_WAKE_NOT) {
>>>> +		mod_timer(&rdp_gp->nocb_timer, jiffies + jiffies_till_flush);
>>>> +		WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
>>>> +	} else if (waketype == RCU_NOCB_WAKE_BYPASS) {
>>>>  		mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
>>>>  		WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
>>>>  	} else {
>>>>
>>> Joel, I have a question here. I see that a lazy callback just makes the GP
>>> start later, whereas a regular call_rcu() will instead cut that delay back.
>>> Could you please clarify how it handles a sequence like:
>>>
>>> CPU:
>>>
>>> 1) task A -> call_rcu_lazy();
>>> 2) task B -> call_rcu();
>>>
>>> so the lazy decision to run the GP later gets overridden by task B, which initiates it sooner?
>>
>> Is your question that task B will make task A's CB run sooner? Yes, that's right.
>> The reason is that the point of call_rcu_lazy() is to delay GP start as much as
>> possible to keep the system quiet. However, if a call_rcu() comes in anyway, we
>> have to start a GP soon. So we might as well take advantage of that for the lazy
>> one, as we are paying the full price for the new GP.
>>
>> It is kind of like RCU's amortizing behavior in that sense; however, we put that
>> into overdrive to be greedy for power.
>>
>> Does that answer your question?
>>
> OK. I understood it correctly then :) IMHO, the problem with such an approach is
> that we may end up moving the RCU core forward in a non-lazy way even though one
> user queues lazily and another does not.

Yes, the best we can do seems to be to identify all call_rcu() users that can be
lazified (say, using rcutop) and apply the API change to them. It depends on the
use case, for sure: for example, if some use case is doing frequent
synchronize_rcu() for whatever reason, then that will impede the lazy users. On
the Intel ChromeOS device, we do not see such a problem.

> According to my observations, our setup suffers from many wake-ups across the
> CPUs, and mixing the non-lazy way with the lazy way will just degenerate into
> the regular pattern.
> 
> I think adding a kind of high_water_mark on "entry" would solve it. Any thoughts here?

By high water mark, do you mean for memory? If memory, we have to consider that
we cannot assume that call_rcu() users are using RCU for memory reclaim only.
There could be use cases like per-CPU refcounts which use call_rcu() for
non-memory purposes. How would you use a memory watermark there?
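
For instance, a minimal sketch of such a non-memory user (struct and function
names are made up for illustration):

struct gate {
	struct rcu_head rhp;
	atomic_t drained;
};

/* Nothing is freed here; the CB only publishes "all readers are gone". */
static void gate_rcu_cb(struct rcu_head *rhp)
{
	struct gate *g = container_of(rhp, struct gate, rhp);

	atomic_set(&g->drained, 1);
}

static void gate_close(struct gate *g)
{
	/* No memory is reclaimed, so a memory watermark never sees this CB. */
	call_rcu(&g->rhp, gate_rcu_cb);
}
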


> Joel, do you see that v6 makes the system more idle from an RCU perspective on the Intel SoC?

I did not compare Intel with, say, ARM, so I don't know whether an ARM SoC is less
sensitive to this issue. I think it depends on the firmware as well. On Intel we
see that the firmware is not as aggressive about SoC-wide power management if
there is RCU activity... at least that's the observation.

> From my side, I will port v6 to the 5.10 kernel and run some tests today.

Great, thanks! Looking forward to it.

- Joel
Frederic Weisbecker Sept. 7, 2022, 3:38 p.m. UTC | #32
On Wed, Sep 07, 2022 at 09:44:01AM -0400, Joel Fernandes wrote:
> Hi Frederic,
> 
> On 9/7/2022 5:40 AM, Frederic Weisbecker wrote:
> >>
> >> diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
> >> index bd8f39ee2cd0..e3344c262672 100644
> >> --- a/kernel/rcu/tree_nocb.h
> >> +++ b/kernel/rcu/tree_nocb.h
> >> @@ -382,15 +382,19 @@ static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
> >>  				  unsigned long j, unsigned long flush_flags)
> >>  {
> >>  	bool ret;
> >> +	bool was_alldone;
> >>  
> >>  	if (!rcu_rdp_is_offloaded(rdp))
> >>  		return true;
> >>  	rcu_lockdep_assert_cblist_protected(rdp);
> >>  	rcu_nocb_bypass_lock(rdp);
> >> +	if (flush_flags & FLUSH_BP_WAKE)
> >> +		was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
> >> +
> > You can check that outside bypass lock (but you still need nocb_lock).
> 
> Right, ok. I can make it outside the bypass lock, and the nocb_lock is implied
> by rcu_lockdep_assert_cblist_protected().
> 
> > 
> >>  	ret = rcu_nocb_do_flush_bypass(rdp, rhp, j, flush_flags);
> >>  
> >> -	if (flush_flags & FLUSH_BP_WAKE)
> >> -		wake_nocb_gp(rdp, true);
> >> +	if (flush_flags & FLUSH_BP_WAKE && was_alldone)
> >> +		wake_nocb_gp(rdp, false);
> > That doesn't check if the bypass list was empty.
> 
> thanks, will add your suggested optimization, however I have a general question:
> 
> For the case where the bypass list is empty, where does rcu_barrier() do a wake
> up of the nocb GP thread after entrain()?
> 
> I don't see a call to __call_rcu_nocb_wake() like rcutree_migrate_callbacks()
> does. Is the wake up done in some other path?

Actually I can't find it. You're right, it may be missing. It's not even just
about the bypass list but also the entrained callback. Would you be willing to
cook a fix?

Thanks!

> 
> thanks,
> 
>  - Joel
Joel Fernandes Sept. 7, 2022, 3:39 p.m. UTC | #33
On 9/7/2022 11:38 AM, Frederic Weisbecker wrote:
> On Wed, Sep 07, 2022 at 09:44:01AM -0400, Joel Fernandes wrote:
>> Hi Frederic,
>>
>> On 9/7/2022 5:40 AM, Frederic Weisbecker wrote:
>>>>
>>>> diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
>>>> index bd8f39ee2cd0..e3344c262672 100644
>>>> --- a/kernel/rcu/tree_nocb.h
>>>> +++ b/kernel/rcu/tree_nocb.h
>>>> @@ -382,15 +382,19 @@ static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>>>>  				  unsigned long j, unsigned long flush_flags)
>>>>  {
>>>>  	bool ret;
>>>> +	bool was_alldone;
>>>>  
>>>>  	if (!rcu_rdp_is_offloaded(rdp))
>>>>  		return true;
>>>>  	rcu_lockdep_assert_cblist_protected(rdp);
>>>>  	rcu_nocb_bypass_lock(rdp);
>>>> +	if (flush_flags & FLUSH_BP_WAKE)
>>>> +		was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
>>>> +
>>> You can check that outside bypass lock (but you still need nocb_lock).
>>
>> Right, ok. I can make it outside the bypass lock, and the nocb_lock is implied
>> by rcu_lockdep_assert_cblist_protected().
>>
>>>
>>>>  	ret = rcu_nocb_do_flush_bypass(rdp, rhp, j, flush_flags);
>>>>  
>>>> -	if (flush_flags & FLUSH_BP_WAKE)
>>>> -		wake_nocb_gp(rdp, true);
>>>> +	if (flush_flags & FLUSH_BP_WAKE && was_alldone)
>>>> +		wake_nocb_gp(rdp, false);
>>> That doesn't check if the bypass list was empty.
>>
>> thanks, will add your suggested optimization, however I have a general question:
>>
>> For the case where the bypass list is empty, where does rcu_barrier() do a wake
>> up of the nocb GP thread after entrain()?
>>
>> I don't see a call to __call_rcu_nocb_wake() like rcutree_migrate_callbacks()
>> does. Is the wake up done in some other path?
> 
> Actually I can't find it. You're right, it may be missing. It's not even just
> about the bypass list but also the entrained callback. Would you be willing to
> cook a fix?

Sounds good, yes I am happy to make a fix and include it in the next series
posting. Thanks for confirming the potential issue.

I will be on a plane in a few hours; I can even try to do it on the plane, but
we'll see how it goes ;)

Thanks!

 - Joel
Joel Fernandes Sept. 21, 2022, 11:52 p.m. UTC | #34
On Wed, Sep 07, 2022 at 11:40:14AM +0200, Frederic Weisbecker wrote:
> On Wed, Sep 07, 2022 at 12:06:26AM +0000, Joel Fernandes wrote:
> > > > @@ -326,13 +372,20 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
> > > >   * Note that this function always returns true if rhp is NULL.
> > > >   */
> > > >  static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
> > > > -				  unsigned long j)
> > > > +				  unsigned long j, unsigned long flush_flags)
> > > >  {
> > > > +	bool ret;
> > > > +
> > > >  	if (!rcu_rdp_is_offloaded(rdp))
> > > >  		return true;
> > > >  	rcu_lockdep_assert_cblist_protected(rdp);
> > > >  	rcu_nocb_bypass_lock(rdp);
> > > > -	return rcu_nocb_do_flush_bypass(rdp, rhp, j);
> > > > +	ret = rcu_nocb_do_flush_bypass(rdp, rhp, j, flush_flags);
> > > > +
> > > > +	if (flush_flags & FLUSH_BP_WAKE)
> > > > +		wake_nocb_gp(rdp, true);
> > > 
> > > Why the true above?
> > > 
> > > Also should we check if the wake up is really necessary (otherwise it means we
> > > force a wake up for all rdp's from rcu_barrier())?
> > > 
> > >        was_alldone = rcu_segcblist_pend_cbs(&rdp->cblist);
> > >        ret = rcu_nocb_do_flush_bypass(rdp, rhp, j, flush_flags);
> > >        if (was_alldone && rcu_segcblist_pend_cbs(&rdp->cblist))
> > >        	  wake_nocb_gp(rdp, false);
> > 
> > You mean something like the following, right? Though I'm wondering if it's
> > better to call wake_nocb_gp() from tree.c in entrain() and let that handle
> > the wake. That way, we can get rid of the extra FLUSH_BP flags as well and
> > let the flush callers deal with the wakeups...
> 
> Ah yes that could make sense if only one caller cares.
> 
> > 
> > Anyway, for testing this should be good...
> > 
> > ---8<-----------------------
> > 
> > diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
> > index bd8f39ee2cd0..e3344c262672 100644
> > --- a/kernel/rcu/tree_nocb.h
> > +++ b/kernel/rcu/tree_nocb.h
> > @@ -382,15 +382,19 @@ static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
> >  				  unsigned long j, unsigned long flush_flags)
> >  {
> >  	bool ret;
> > +	bool was_alldone;
> >  
> >  	if (!rcu_rdp_is_offloaded(rdp))
> >  		return true;
> >  	rcu_lockdep_assert_cblist_protected(rdp);
> >  	rcu_nocb_bypass_lock(rdp);
> > +	if (flush_flags & FLUSH_BP_WAKE)
> > +		was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
> > +
> 
> You can check that outside bypass lock (but you still need nocb_lock).
> 
> >  	ret = rcu_nocb_do_flush_bypass(rdp, rhp, j, flush_flags);
> >  
> > -	if (flush_flags & FLUSH_BP_WAKE)
> > -		wake_nocb_gp(rdp, true);
> > +	if (flush_flags & FLUSH_BP_WAKE && was_alldone)
> > +		wake_nocb_gp(rdp, false);
> 
> That doesn't check if the bypass list was empty.

I am ending up with something like the below for v6. After discussing with
Paul on IRC, he pointed out that we only need to do the rcu_barrier()-related
wakeup when all the CBs in the bypass list are lazy; otherwise the bypass
timer goes off soon enough anyway. I think Frederic mentioned something
similar above in different words.

I prefer to keep this logic in tree_nocb.h since rcu_barrier_entrain()
shouldn't have to deal with nocb internals (in theory anyway).

Looks Ok?

thanks,

 - Joel

---8<-----------------------

From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Subject: [PATCH for v6] fixup! rcu: Introduce call_rcu_lazy() API implementation

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
 kernel/rcu/tree_nocb.h | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index c197534d0c99..fd056358f041 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -375,18 +375,26 @@ static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 				  unsigned long j, unsigned long flush_flags)
 {
 	bool ret;
-	bool was_alldone;
+	bool was_alldone = false;
+	bool bypass_all_lazy = false;
 
 	if (!rcu_rdp_is_offloaded(rdp))
 		return true;
 	rcu_lockdep_assert_cblist_protected(rdp);
 	rcu_nocb_bypass_lock(rdp);
-	if (flush_flags & FLUSH_BP_WAKE)
+
+	if (flush_flags & FLUSH_BP_WAKE) {
 		was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+		bypass_all_lazy =
+		  (rcu_cblist_n_cbs(&rdp->nocb_bypass) == rdp->lazy_len);
+	}
 
 	ret = rcu_nocb_do_flush_bypass(rdp, rhp, j, flush_flags);
 
-	if (flush_flags & FLUSH_BP_WAKE && was_alldone)
+	// Wake up the nocb GP thread if needed. GP thread could be sleeping
+	// while waiting for lazy timer to expire (otherwise rcu_barrier may
+	// end up waiting for the duration of the lazy timer).
+	if (flush_flags & FLUSH_BP_WAKE && was_alldone && bypass_all_lazy)
 		wake_nocb_gp(rdp, false);
 
 	return ret;
Joel Fernandes Sept. 22, 2022, 2:39 p.m. UTC | #35
On Wed, Sep 21, 2022 at 11:52:45PM +0000, Joel Fernandes wrote:
[..]
> > > Anyway, for testing this should be good...
> > > 
> > > ---8<-----------------------
> > > 
> > > diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
> > > index bd8f39ee2cd0..e3344c262672 100644
> > > --- a/kernel/rcu/tree_nocb.h
> > > +++ b/kernel/rcu/tree_nocb.h
> > > @@ -382,15 +382,19 @@ static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
> > >  				  unsigned long j, unsigned long flush_flags)
> > >  {
> > >  	bool ret;
> > > +	bool was_alldone;
> > >  
> > >  	if (!rcu_rdp_is_offloaded(rdp))
> > >  		return true;
> > >  	rcu_lockdep_assert_cblist_protected(rdp);
> > >  	rcu_nocb_bypass_lock(rdp);
> > > +	if (flush_flags & FLUSH_BP_WAKE)
> > > +		was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
> > > +
> > 
> > You can check that outside bypass lock (but you still need nocb_lock).
> > 
> > >  	ret = rcu_nocb_do_flush_bypass(rdp, rhp, j, flush_flags);
> > >  
> > > -	if (flush_flags & FLUSH_BP_WAKE)
> > > -		wake_nocb_gp(rdp, true);
> > > +	if (flush_flags & FLUSH_BP_WAKE && was_alldone)
> > > +		wake_nocb_gp(rdp, false);
> > 
> > That doesn't check if the bypass list was empty.
> 
> I am ending up with something like the below for v6. After discussing with
> Paul on IRC, he pointed out that we only need to do the rcu_barrier()-related
> wakeup when all the CBs in the bypass list are lazy; otherwise the bypass
> timer goes off soon enough anyway. I think Frederic mentioned something
> similar above in different words.
> 
> I prefer to keep this logic in tree_nocb.h since rcu_barrier_entrain()
> shouldn't have to deal with nocb internals (in theory anyway).
> 
> Looks Ok?

And I did miss something, so here is the updated version; I am testing it further
now. I think this is the last 'concern' I had before making the other API
changes we discussed, so it should hopefully be smooth sailing from here. Thank you!

---8<-----------------------

Subject: [PATCH for v6] fixup! rcu: Introduce call_rcu_lazy() API implementation

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
 kernel/rcu/tree_nocb.h | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index c197534d0c99..472ceb8af6d3 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -375,18 +375,27 @@ static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 				  unsigned long j, unsigned long flush_flags)
 {
 	bool ret;
-	bool was_alldone;
+	bool was_alldone = false;
+	bool bypass_all_lazy = false;
+	long bypass_ncbs;
 
 	if (!rcu_rdp_is_offloaded(rdp))
 		return true;
 	rcu_lockdep_assert_cblist_protected(rdp);
 	rcu_nocb_bypass_lock(rdp);
-	if (flush_flags & FLUSH_BP_WAKE)
+
+	if (flush_flags & FLUSH_BP_WAKE) {
 		was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+		bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+		bypass_all_lazy = bypass_ncbs && (bypass_ncbs == rdp->lazy_len);
+	}
 
 	ret = rcu_nocb_do_flush_bypass(rdp, rhp, j, flush_flags);
 
-	if (flush_flags & FLUSH_BP_WAKE && was_alldone)
+	// Wake up the nocb GP thread if needed. GP thread could be sleeping
+	// while waiting for lazy timer to expire (otherwise rcu_barrier may
+	// end up waiting for the duration of the lazy timer).
+	if (flush_flags & FLUSH_BP_WAKE && was_alldone && bypass_all_lazy)
 		wake_nocb_gp(rdp, false);
 
 	return ret;
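
For reference before the full patch below, a minimal hypothetical example of a
caller opting in to the new API (the struct and function names are made up):

struct foo {
	struct rcu_head rhp;
	int data;
};

static void foo_free_rcu(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct foo, rhp));
}

static void foo_release(struct foo *fp)
{
	/*
	 * Power-insensitive freeing: the callback may be batched for up to
	 * jiffies_till_flush (10 seconds by default) unless something else
	 * flushes the bypass list first (rcu_barrier(), the list growing too
	 * big, or, in a later patch, memory pressure).
	 */
	call_rcu_lazy(&fp->rhp, foo_free_rcu);
}
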
diff mbox series

Patch

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 08605ce7379d..82e8a07e0856 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -108,6 +108,12 @@  static inline int rcu_preempt_depth(void)
 
 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
 
+#ifdef CONFIG_RCU_LAZY
+void call_rcu_lazy(struct rcu_head *head, rcu_callback_t func);
+#else
+#define call_rcu_lazy(head, func) call_rcu(head, func)
+#endif
+
 /* Internal to kernel */
 void rcu_init(void);
 extern int rcu_scheduler_active;
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index d471d22a5e21..3128d01427cb 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -311,4 +311,12 @@  config TASKS_TRACE_RCU_READ_MB
 	  Say N here if you hate read-side memory barriers.
 	  Take the default if you are unsure.
 
+config RCU_LAZY
+	bool "RCU callback lazy invocation functionality"
+	depends on RCU_NOCB_CPU
+	default n
+	help
+	  To save power, batch RCU callbacks and flush after delay, memory
+	  pressure or callback list growing too big.
+
 endmenu # "RCU Subsystem"
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index be5979da07f5..94675f14efe8 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -474,6 +474,14 @@  enum rcutorture_type {
 	INVALID_RCU_FLAVOR
 };
 
+#if defined(CONFIG_RCU_LAZY)
+unsigned long rcu_lazy_get_jiffies_till_flush(void);
+void rcu_lazy_set_jiffies_till_flush(unsigned long j);
+#else
+static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; }
+static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { }
+#endif
+
 #if defined(CONFIG_TREE_RCU)
 void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
 			    unsigned long *gp_seq);
@@ -483,6 +491,8 @@  void do_trace_rcu_torture_read(const char *rcutorturename,
 			       unsigned long c_old,
 			       unsigned long c);
 void rcu_gp_set_torture_wait(int duration);
+void rcu_force_call_rcu_to_lazy(bool force);
+
 #else
 static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
 					  int *flags, unsigned long *gp_seq)
@@ -501,6 +511,7 @@  void do_trace_rcu_torture_read(const char *rcutorturename,
 	do { } while (0)
 #endif
 static inline void rcu_gp_set_torture_wait(int duration) { }
+static inline void rcu_force_call_rcu_to_lazy(bool force) { }
 #endif
 
 #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index c54ea2b6a36b..55b50e592986 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -38,7 +38,7 @@  void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp)
  * element of the second rcu_cblist structure, but ensuring that the second
  * rcu_cblist structure, if initially non-empty, always appears non-empty
  * throughout the process.  If rdp is NULL, the second rcu_cblist structure
- * is instead initialized to empty.
+ * is instead initialized to empty. Also account for lazy_len for lazy CBs.
  */
 void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
 			      struct rcu_cblist *srclp,
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9fe581be8696..aaced29a0a71 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2728,47 +2728,8 @@  static void check_cb_ovld(struct rcu_data *rdp)
 	raw_spin_unlock_rcu_node(rnp);
 }
 
-/**
- * call_rcu() - Queue an RCU callback for invocation after a grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual callback function to be invoked after the grace period
- *
- * The callback function will be invoked some time after a full grace
- * period elapses, in other words after all pre-existing RCU read-side
- * critical sections have completed.  However, the callback function
- * might well execute concurrently with RCU read-side critical sections
- * that started after call_rcu() was invoked.
- *
- * RCU read-side critical sections are delimited by rcu_read_lock()
- * and rcu_read_unlock(), and may be nested.  In addition, but only in
- * v5.0 and later, regions of code across which interrupts, preemption,
- * or softirqs have been disabled also serve as RCU read-side critical
- * sections.  This includes hardware interrupt handlers, softirq handlers,
- * and NMI handlers.
- *
- * Note that all CPUs must agree that the grace period extended beyond
- * all pre-existing RCU read-side critical section.  On systems with more
- * than one CPU, this means that when "func()" is invoked, each CPU is
- * guaranteed to have executed a full memory barrier since the end of its
- * last RCU read-side critical section whose beginning preceded the call
- * to call_rcu().  It also means that each CPU executing an RCU read-side
- * critical section that continues beyond the start of "func()" must have
- * executed a memory barrier after the call_rcu() but before the beginning
- * of that RCU read-side critical section.  Note that these guarantees
- * include CPUs that are offline, idle, or executing in user mode, as
- * well as CPUs that are executing in the kernel.
- *
- * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
- * resulting RCU callback function "func()", then both CPU A and CPU B are
- * guaranteed to execute a full memory barrier during the time interval
- * between the call to call_rcu() and the invocation of "func()" -- even
- * if CPU A and CPU B are the same CPU (but again only if the system has
- * more than one CPU).
- *
- * Implementation of these memory-ordering guarantees is described here:
- * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
- */
-void call_rcu(struct rcu_head *head, rcu_callback_t func)
+static void
+__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy)
 {
 	static atomic_t doublefrees;
 	unsigned long flags;
@@ -2818,7 +2779,7 @@  void call_rcu(struct rcu_head *head, rcu_callback_t func)
 		trace_rcu_callback(rcu_state.name, head,
 				   rcu_segcblist_n_cbs(&rdp->cblist));
 
-	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
+	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy))
 		return; // Enqueued onto ->nocb_bypass, so just leave.
 	// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
 	rcu_segcblist_enqueue(&rdp->cblist, head);
@@ -2833,8 +2794,86 @@  void call_rcu(struct rcu_head *head, rcu_callback_t func)
 		local_irq_restore(flags);
 	}
 }
-EXPORT_SYMBOL_GPL(call_rcu);
 
+#ifdef CONFIG_RCU_LAZY
+/**
+ * call_rcu_lazy() - Lazily queue RCU callback for invocation after grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all pre-existing RCU read-side
+ * critical sections have completed.
+ *
+ * Use this API instead of call_rcu() if you don't mind the callback being
+ * invoked after very long periods of time on systems without memory pressure
+ * and on systems which are lightly loaded or mostly idle.
+ *
+ * Other than the extra delay in callbacks being invoked, this function is
+ * identical to, and reuses call_rcu()'s logic. Refer to call_rcu() for more
+ * details about memory ordering and other functionality.
+ */
+void call_rcu_lazy(struct rcu_head *head, rcu_callback_t func)
+{
+	return __call_rcu_common(head, func, true);
+}
+EXPORT_SYMBOL_GPL(call_rcu_lazy);
+#endif
+
+static bool force_call_rcu_to_lazy;
+
+void rcu_force_call_rcu_to_lazy(bool force)
+{
+	if (IS_ENABLED(CONFIG_RCU_SCALE_TEST))
+		WRITE_ONCE(force_call_rcu_to_lazy, force);
+}
+EXPORT_SYMBOL_GPL(rcu_force_call_rcu_to_lazy);
+
+/**
+ * call_rcu() - Queue an RCU callback for invocation after a grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all pre-existing RCU read-side
+ * critical sections have completed.  However, the callback function
+ * might well execute concurrently with RCU read-side critical sections
+ * that started after call_rcu() was invoked.
+ *
+ * RCU read-side critical sections are delimited by rcu_read_lock()
+ * and rcu_read_unlock(), and may be nested.  In addition, but only in
+ * v5.0 and later, regions of code across which interrupts, preemption,
+ * or softirqs have been disabled also serve as RCU read-side critical
+ * sections.  This includes hardware interrupt handlers, softirq handlers,
+ * and NMI handlers.
+ *
+ * Note that all CPUs must agree that the grace period extended beyond
+ * all pre-existing RCU read-side critical section.  On systems with more
+ * than one CPU, this means that when "func()" is invoked, each CPU is
+ * guaranteed to have executed a full memory barrier since the end of its
+ * last RCU read-side critical section whose beginning preceded the call
+ * to call_rcu().  It also means that each CPU executing an RCU read-side
+ * critical section that continues beyond the start of "func()" must have
+ * executed a memory barrier after the call_rcu() but before the beginning
+ * of that RCU read-side critical section.  Note that these guarantees
+ * include CPUs that are offline, idle, or executing in user mode, as
+ * well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
+ * resulting RCU callback function "func()", then both CPU A and CPU B are
+ * guaranteed to execute a full memory barrier during the time interval
+ * between the call to call_rcu() and the invocation of "func()" -- even
+ * if CPU A and CPU B are the same CPU (but again only if the system has
+ * more than one CPU).
+ *
+ * Implementation of these memory-ordering guarantees is described here:
+ * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
+ */
+void call_rcu(struct rcu_head *head, rcu_callback_t func)
+{
+	return __call_rcu_common(head, func, force_call_rcu_to_lazy);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
 
 /* Maximum number of jiffies to wait before draining a batch. */
 #define KFREE_DRAIN_JIFFIES (5 * HZ)
@@ -3904,7 +3943,8 @@  static void rcu_barrier_entrain(struct rcu_data *rdp)
 	rdp->barrier_head.func = rcu_barrier_callback;
 	debug_rcu_head_queue(&rdp->barrier_head);
 	rcu_nocb_lock(rdp);
-	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
+	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false,
+		     /* wake gp thread */ true));
 	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
 		atomic_inc(&rcu_state.barrier_cpu_count);
 	} else {
@@ -4325,7 +4365,7 @@  void rcutree_migrate_callbacks(int cpu)
 	my_rdp = this_cpu_ptr(&rcu_data);
 	my_rnp = my_rdp->mynode;
 	rcu_nocb_lock(my_rdp); /* irqs already disabled. */
-	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
+	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies, false, false));
 	raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
 	/* Leverage recent GPs and set GP for new callbacks. */
 	needwake = rcu_advance_cbs(my_rnp, rdp) ||
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index d4a97e40ea9c..946d819b23fc 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -263,14 +263,18 @@  struct rcu_data {
 	unsigned long last_fqs_resched;	/* Time of last rcu_resched(). */
 	unsigned long last_sched_clock;	/* Jiffies of last rcu_sched_clock_irq(). */
 
+#ifdef CONFIG_RCU_LAZY
+	long lazy_len;			/* Length of buffered lazy callbacks. */
+#endif
 	int cpu;
 };
 
 /* Values for nocb_defer_wakeup field in struct rcu_data. */
 #define RCU_NOCB_WAKE_NOT	0
 #define RCU_NOCB_WAKE_BYPASS	1
-#define RCU_NOCB_WAKE		2
-#define RCU_NOCB_WAKE_FORCE	3
+#define RCU_NOCB_WAKE_LAZY	2
+#define RCU_NOCB_WAKE		3
+#define RCU_NOCB_WAKE_FORCE	4
 
 #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
 					/* For jiffies_till_first_fqs and */
@@ -440,9 +444,10 @@  static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
 static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
 static void rcu_init_one_nocb(struct rcu_node *rnp);
 static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-				  unsigned long j);
+				  unsigned long j, bool lazy, bool wakegp);
 static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-				bool *was_alldone, unsigned long flags);
+				bool *was_alldone, unsigned long flags,
+				bool lazy);
 static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
 				 unsigned long flags);
 static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 31068dd31315..7e97a7b6e046 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -256,6 +256,31 @@  static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
 	return __wake_nocb_gp(rdp_gp, rdp, force, flags);
 }
 
+/*
+ * LAZY_FLUSH_JIFFIES decides the maximum amount of time that
+ * can elapse before lazy callbacks are flushed. Lazy callbacks
+ * could be flushed much earlier for a number of other reasons
+ * however, LAZY_FLUSH_JIFFIES will ensure no lazy callbacks are
+ * left unsubmitted to RCU after those many jiffies.
+ */
+#define LAZY_FLUSH_JIFFIES (10 * HZ)
+unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES;
+
+#ifdef CONFIG_RCU_LAZY
+// To be called only from test code.
+void rcu_lazy_set_jiffies_till_flush(unsigned long jif)
+{
+	jiffies_till_flush = jif;
+}
+EXPORT_SYMBOL(rcu_lazy_set_jiffies_till_flush);
+
+unsigned long rcu_lazy_get_jiffies_till_flush(void)
+{
+	return jiffies_till_flush;
+}
+EXPORT_SYMBOL(rcu_lazy_get_jiffies_till_flush);
+#endif
+
 /*
  * Arrange to wake the GP kthread for this NOCB group at some future
  * time when it is safe to do so.
@@ -265,23 +290,39 @@  static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
 {
 	unsigned long flags;
 	struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+	unsigned long mod_jif = 0;
 
 	raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
 
 	/*
-	 * Bypass wakeup overrides previous deferments. In case
-	 * of callback storm, no need to wake up too early.
+	 * Bypass and lazy wakeup overrides previous deferments. In case of
+	 * callback storm, no need to wake up too early.
 	 */
-	if (waketype == RCU_NOCB_WAKE_BYPASS) {
-		mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
-		WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
-	} else {
+	switch (waketype) {
+	case RCU_NOCB_WAKE_LAZY:
+		if (rdp->nocb_defer_wakeup != RCU_NOCB_WAKE_LAZY)
+			mod_jif = jiffies_till_flush;
+		break;
+
+	case RCU_NOCB_WAKE_BYPASS:
+		mod_jif = 2;
+		break;
+
+	case RCU_NOCB_WAKE:
+	case RCU_NOCB_WAKE_FORCE:
+		// For these, make it wake up the soonest if we
+		// were in a bypass or lazy sleep before.
 		if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE)
-			mod_timer(&rdp_gp->nocb_timer, jiffies + 1);
-		if (rdp_gp->nocb_defer_wakeup < waketype)
-			WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+			mod_jif = 1;
+		break;
 	}
 
+	if (mod_jif)
+		mod_timer(&rdp_gp->nocb_timer, jiffies + mod_jif);
+
+	if (rdp_gp->nocb_defer_wakeup < waketype)
+		WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+
 	raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
 
 	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
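To summarize the timer policy above, here is a standalone sketch (not kernel
code; the enum mirrors the RCU_NOCB_WAKE_* values introduced by this patch) of
the delay, in jiffies, that each wake type requests, where 0 means the timer
is left untouched:

enum { WAKE_NOT, WAKE_BYPASS, WAKE_LAZY, WAKE, WAKE_FORCE };

static unsigned long nocb_defer_delay(int waketype, int cur_defer,
				      unsigned long flush_after)
{
	switch (waketype) {
	case WAKE_LAZY:
		/* Don't keep pushing an already-armed lazy timer further out. */
		return cur_defer != WAKE_LAZY ? flush_after : 0;
	case WAKE_BYPASS:
		return 2;
	default: /* WAKE and WAKE_FORCE */
		/* Wake soonest unless an equally-soon wake is already deferred. */
		return cur_defer < WAKE ? 1 : 0;
	}
}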
@@ -293,10 +334,13 @@  static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
  * proves to be initially empty, just return false because the no-CB GP
  * kthread may need to be awakened in this case.
  *
+ * Return true if there was something to flush and the flush succeeded,
+ * otherwise false.
+ *
  * Note that this function always returns true if rhp is NULL.
  */
 static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-				     unsigned long j)
+				     unsigned long j, bool lazy)
 {
 	struct rcu_cblist rcl;
 
@@ -310,7 +354,18 @@  static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 	/* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
 	if (rhp)
 		rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
-	rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
+
+	/*
+	 * If the new CB being queued is a lazy one, queue it onto the main
+	 * ->cblist so that it can take advantage of a sooner grace period.
+	 */
+	if (lazy && rhp) {
+		rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, NULL);
+		rcu_cblist_enqueue(&rcl, rhp);
+	} else {
+		rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
+	}
+
 	rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
 	WRITE_ONCE(rdp->nocb_bypass_first, j);
 	rcu_nocb_bypass_unlock(rdp);
@@ -326,13 +381,20 @@  static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
  * Note that this function always returns true if rhp is NULL.
  */
 static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-				  unsigned long j)
+				  unsigned long j, bool lazy, bool wake_gp)
 {
+	bool ret;
+
 	if (!rcu_rdp_is_offloaded(rdp))
 		return true;
 	rcu_lockdep_assert_cblist_protected(rdp);
 	rcu_nocb_bypass_lock(rdp);
-	return rcu_nocb_do_flush_bypass(rdp, rhp, j);
+	ret = rcu_nocb_do_flush_bypass(rdp, rhp, j, lazy);
+
+	if (wake_gp)
+		wake_nocb_gp(rdp, true);
+
+	return ret;
 }
 
 /*
@@ -345,7 +407,7 @@  static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
 	if (!rcu_rdp_is_offloaded(rdp) ||
 	    !rcu_nocb_bypass_trylock(rdp))
 		return;
-	WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j));
+	WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j, false));
 }
 
 /*
@@ -367,12 +429,14 @@  static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
  * there is only one CPU in operation.
  */
 static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-				bool *was_alldone, unsigned long flags)
+				bool *was_alldone, unsigned long flags,
+				bool lazy)
 {
 	unsigned long c;
 	unsigned long cur_gp_seq;
 	unsigned long j = jiffies;
 	long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+	bool bypass_is_lazy = (ncbs == READ_ONCE(rdp->lazy_len));
 
 	lockdep_assert_irqs_disabled();
 
@@ -417,23 +481,29 @@  static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 	// If there hasn't yet been all that many ->cblist enqueues
 	// this jiffy, tell the caller to enqueue onto ->cblist.  But flush
 	// ->nocb_bypass first.
-	if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) {
+	// Lazy CBs skip this check and are queued directly onto the bypass list.
+	if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy && !lazy) {
 		rcu_nocb_lock(rdp);
 		*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
 		if (*was_alldone)
 			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
 					    TPS("FirstQ"));
-		WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j));
+
+		WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j, false, false));
+
 		WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
 		return false; // Caller must enqueue the callback.
 	}
 
 	// If ->nocb_bypass has been used too long or is too full,
 	// flush ->nocb_bypass to ->cblist.
-	if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) ||
+	if ((ncbs && !bypass_is_lazy && j != READ_ONCE(rdp->nocb_bypass_first)) ||
+	    (ncbs &&  bypass_is_lazy &&
+		(time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush))) ||
 	    ncbs >= qhimark) {
 		rcu_nocb_lock(rdp);
-		if (!rcu_nocb_flush_bypass(rdp, rhp, j)) {
+
+		if (!rcu_nocb_flush_bypass(rdp, rhp, j, lazy, false)) {
 			*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
 			if (*was_alldone)
 				trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
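Restated outside the kernel for clarity, the flush-or-not decision above boils
down to the following sketch (assuming time_after()-style wraparound-safe jiffy
arithmetic; qhimark and jiffies_till_flush are passed in explicitly):

#include <stdbool.h>

static bool should_flush_bypass(long ncbs, bool bypass_is_lazy,
				unsigned long j, unsigned long bypass_first,
				unsigned long flush_after, long qhimark)
{
	if (!ncbs)
		return false;
	if (ncbs >= qhimark)		/* too full, lazy or not */
		return true;
	if (bypass_is_lazy)		/* all-lazy: wait up to flush_after */
		return (long)(j - bypass_first) > (long)flush_after;
	return j != bypass_first;	/* non-lazy: flush on the next jiffy */
}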
@@ -460,16 +530,29 @@  static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 	// We need to use the bypass.
 	rcu_nocb_wait_contended(rdp);
 	rcu_nocb_bypass_lock(rdp);
+
 	ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
 	rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
 	rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
+
+	if (IS_ENABLED(CONFIG_RCU_LAZY) && lazy)
+		WRITE_ONCE(rdp->lazy_len, rdp->lazy_len + 1);
+
 	if (!ncbs) {
 		WRITE_ONCE(rdp->nocb_bypass_first, j);
 		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
 	}
+
 	rcu_nocb_bypass_unlock(rdp);
 	smp_mb(); /* Order enqueue before wake. */
-	if (ncbs) {
+
+	// We had CBs in the bypass list before. There is nothing else to do if
+	// either:
+	// - there were only non-lazy CBs before; in that case the bypass timer
+	//   or the GP kthread will handle all CBs, including any new lazy ones.
+	// - the new CB is lazy and the old bypass CBs were also lazy; in that
+	//   case the lazy timer has already been set up and will handle the new
+	//   lazy CB when it expires.
+	if (ncbs && (!bypass_is_lazy || lazy)) {
 		local_irq_restore(flags);
 	} else {
 		// No-CBs GP kthread might be indefinitely asleep, if so, wake.
@@ -478,6 +561,10 @@  static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
 					    TPS("FirstBQwake"));
 			__call_rcu_nocb_wake(rdp, true, flags);
+		} else if (bypass_is_lazy && !lazy) {
+			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+					    TPS("FirstBQwakeLazy2Non"));
+			__call_rcu_nocb_wake(rdp, true, flags);
 		} else {
 			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
 					    TPS("FirstBQnoWake"));
@@ -499,7 +586,7 @@  static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
 {
 	unsigned long cur_gp_seq;
 	unsigned long j;
-	long len;
+	long len, lazy_len, bypass_len;
 	struct task_struct *t;
 
 	// If we are being polled or there is no kthread, just leave.
@@ -512,9 +599,16 @@  static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
 	}
 	// Need to actually do a wakeup.
 	len = rcu_segcblist_n_cbs(&rdp->cblist);
+	bypass_len = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+	lazy_len = READ_ONCE(rdp->lazy_len);
 	if (was_alldone) {
 		rdp->qlen_last_fqs_check = len;
-		if (!irqs_disabled_flags(flags)) {
+		// Only lazy CBs in bypass list
+		if (lazy_len && bypass_len == lazy_len) {
+			rcu_nocb_unlock_irqrestore(rdp, flags);
+			wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY,
+					   TPS("WakeLazy"));
+		} else if (!irqs_disabled_flags(flags)) {
 			/* ... if queue was empty ... */
 			rcu_nocb_unlock_irqrestore(rdp, flags);
 			wake_nocb_gp(rdp, false);
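The "bypass length equals ->lazy_len" comparison used here (and again in
nocb_gp_wait() below) encodes an invariant worth spelling out: ->lazy_len
counts only the lazy CBs sitting in ->nocb_bypass, so equality means the
bypass holds nothing but lazy CBs. A hypothetical helper (not part of this
patch) capturing that:

static bool nocb_bypass_is_lazy_only(struct rcu_data *rdp)
{
	long bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);

	/* ->lazy_len can never exceed the total bypass length. */
	return bypass_ncbs && bypass_ncbs == READ_ONCE(rdp->lazy_len);
}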
@@ -604,8 +698,8 @@  static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu)
  */
 static void nocb_gp_wait(struct rcu_data *my_rdp)
 {
-	bool bypass = false;
-	long bypass_ncbs;
+	bool bypass = false, lazy = false;
+	long bypass_ncbs, lazy_ncbs;
 	int __maybe_unused cpu = my_rdp->cpu;
 	unsigned long cur_gp_seq;
 	unsigned long flags;
@@ -640,24 +734,41 @@  static void nocb_gp_wait(struct rcu_data *my_rdp)
 	 * won't be ignored for long.
 	 */
 	list_for_each_entry(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp) {
+		bool flush_bypass = false;
+
 		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
 		rcu_nocb_lock_irqsave(rdp, flags);
 		lockdep_assert_held(&rdp->nocb_lock);
 		bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
-		if (bypass_ncbs &&
+		lazy_ncbs = READ_ONCE(rdp->lazy_len);
+
+		if (bypass_ncbs && (lazy_ncbs == bypass_ncbs) &&
+		    (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush) ||
+		     bypass_ncbs > 2 * qhimark)) {
+			flush_bypass = true;
+		} else if (bypass_ncbs && (lazy_ncbs != bypass_ncbs) &&
 		    (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
 		     bypass_ncbs > 2 * qhimark)) {
-			// Bypass full or old, so flush it.
-			(void)rcu_nocb_try_flush_bypass(rdp, j);
-			bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+			flush_bypass = true;
 		} else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
 			rcu_nocb_unlock_irqrestore(rdp, flags);
 			continue; /* No callbacks here, try next. */
 		}
+
+		if (flush_bypass) {
+			// Bypass full or old, so flush it.
+			(void)rcu_nocb_try_flush_bypass(rdp, j);
+			bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+			lazy_ncbs = READ_ONCE(rdp->lazy_len);
+		}
+
 		if (bypass_ncbs) {
 			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
-					    TPS("Bypass"));
-			bypass = true;
+				    bypass_ncbs == lazy_ncbs ? TPS("Lazy") : TPS("Bypass"));
+			if (bypass_ncbs == lazy_ncbs)
+				lazy = true;
+			else
+				bypass = true;
 		}
 		rnp = rdp->mynode;
 
@@ -705,12 +816,21 @@  static void nocb_gp_wait(struct rcu_data *my_rdp)
 	my_rdp->nocb_gp_gp = needwait_gp;
 	my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
 
-	if (bypass && !rcu_nocb_poll) {
-		// At least one child with non-empty ->nocb_bypass, so set
-		// timer in order to avoid stranding its callbacks.
-		wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
-				   TPS("WakeBypassIsDeferred"));
+	// At least one child with non-empty ->nocb_bypass, so set
+	// timer in order to avoid stranding its callbacks.
+	if (!rcu_nocb_poll) {
+		// If the bypass list has only lazy CBs, add a deferred
+		// lazy wakeup.
+		if (lazy && !bypass) {
+			wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_LAZY,
+					TPS("WakeLazyIsDeferred"));
+		// Otherwise, add a deferred bypass wakeup.
+		} else if (bypass) {
+			wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
+					TPS("WakeBypassIsDeferred"));
+		}
 	}
+
 	if (rcu_nocb_poll) {
 		/* Polling, so trace if first poll in the series. */
 		if (gotcbs)
@@ -1036,7 +1156,7 @@  static long rcu_nocb_rdp_deoffload(void *arg)
 	 * return false, which means that future calls to rcu_nocb_try_bypass()
 	 * will refuse to put anything into the bypass.
 	 */
-	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
+	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false, false));
 	/*
 	 * Start with invoking rcu_core() early. This way if the current thread
 	 * happens to preempt an ongoing call to rcu_core() in the middle,
@@ -1290,6 +1410,7 @@  static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
 	raw_spin_lock_init(&rdp->nocb_gp_lock);
 	timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
 	rcu_cblist_init(&rdp->nocb_bypass);
+	WRITE_ONCE(rdp->lazy_len, 0);
 	mutex_init(&rdp->nocb_gp_kthread_mutex);
 }
 
@@ -1571,13 +1692,14 @@  static void rcu_init_one_nocb(struct rcu_node *rnp)
 }
 
 static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-				  unsigned long j)
+				  unsigned long j, bool lazy, bool wakegp)
 {
 	return true;
 }
 
 static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
-				bool *was_alldone, unsigned long flags)
+				bool *was_alldone, unsigned long flags,
+				bool lazy)
 {
 	return false;
 }