
[3/3] rcu/exp: Remove needless CPU up quiescent state report

Message ID 20250213232559.34163-4-frederic@kernel.org (mailing list archive)
State New
Series rcu/exp updates

Commit Message

Frederic Weisbecker Feb. 13, 2025, 11:25 p.m. UTC
A CPU coming online checks for an ongoing expedited grace period and
reports a quiescent state accordingly if needed. This special treatment,
which shortcuts the expedited IPI, was introduced as an optimization by
the following commit:

	338b0f760e84 ("rcu: Better hotplug handling for synchronize_sched_expedited()")

The point is to avoid an IPI while waiting for a CPU to become online,
or while a CPU is failing to become offline.

However this is pointless and even error prone for several reasons:

* If the CPU was seen offline during the first round of scanning for
  offline and idle CPUs, no IPI is even attempted and the quiescent
  state is reported on behalf of the CPU.

* Consequently, if the IPI fails, the CPU has just gone offline and is
  therefore unlikely to come back online right away, unless the CPU
  hotplug operation failed and rolled back. That is a rare event, and
  it can simply wait a jiffy for a new IPI to be issued.

* Besides, the "optimization" covering a failed CPU-offlining operation
  only applies to !PREEMPT_RCU.

* This shortcut forcibly reports a quiescent state even if
  ->cpu_no_qs.b.exp is not set. As a result it can race with remote QS
  reports on the same rdp. This happens to be harmless today, but it is
  an accident waiting to happen.

For all those reasons, remove this optimization, which does not look
worth keeping around.

Reported-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/tree.c     |  2 --
 kernel/rcu/tree_exp.h | 45 ++-----------------------------------------
 2 files changed, 2 insertions(+), 45 deletions(-)

Comments

Paul E. McKenney Feb. 14, 2025, 9:01 a.m. UTC | #1
On Fri, Feb 14, 2025 at 12:25:59AM +0100, Frederic Weisbecker wrote:
> A CPU coming online checks for an ongoing grace period and reports
> a quiescent state accordingly if needed. This special treatment that
> shortcuts the expedited IPI finds its origin as an optimization purpose
> on the following commit:
> 
> 	338b0f760e84 (rcu: Better hotplug handling for synchronize_sched_expedited()
> 
> The point is to avoid an IPI while waiting for a CPU to become online
> or failing to become offline.
> 
> However this is pointless and even error prone for several reasons:
> 
> * If the CPU has been seen offline in the first round scanning offline
>   and idle CPUs, no IPI is even tried and the quiescent state is
>   reported on behalf of the CPU.
> 
> * This means that if the IPI fails, the CPU just became offline. So
>   it's unlikely to become online right away, unless the cpu hotplug
>   operation failed and rolled back, which is a rare event that can
>   wait a jiffy for a new IPI to be issued.
> 
> * But then the "optimization" applying on failing CPU hotplug down only
>   applies to !PREEMPT_RCU.
> 
> * This force reports a quiescent state even if ->cpu_no_qs.b.exp is not
>   set. As a result it can race with remote QS reports on the same rdp.
>   Fortunately it happens to be OK but an accident is waiting to happen.
> 
> For all those reasons, remove this optimization that doesn't look worthy
> to keep around.

Thank you for digging into this!

When I ran tests that removed the call to sync_sched_exp_online_cleanup()
a few months ago, I got grace-period hangs [1].  Has something changed
to make this safe?

							Thanx, Paul

[1] https://docs.google.com/document/d/1-JQ4QYF1qid0TWSLa76O1kusdhER2wgm0dYdwFRUzU8/edit?usp=sharing

> Reported-by: Paul E. McKenney <paulmck@kernel.org>
> Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
> ---
>  kernel/rcu/tree.c     |  2 --
>  kernel/rcu/tree_exp.h | 45 ++-----------------------------------------
>  2 files changed, 2 insertions(+), 45 deletions(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 8625f616c65a..86935fe00397 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -151,7 +151,6 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
>  			      unsigned long gps, unsigned long flags);
>  static void invoke_rcu_core(void);
>  static void rcu_report_exp_rdp(struct rcu_data *rdp);
> -static void sync_sched_exp_online_cleanup(int cpu);
>  static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
>  static bool rcu_rdp_is_offloaded(struct rcu_data *rdp);
>  static bool rcu_rdp_cpu_online(struct rcu_data *rdp);
> @@ -4259,7 +4258,6 @@ int rcutree_online_cpu(unsigned int cpu)
>  	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
>  	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
>  		return 0; /* Too early in boot for scheduler work. */
> -	sync_sched_exp_online_cleanup(cpu);
>  
>  	// Stop-machine done, so allow nohz_full to disable tick.
>  	tick_dep_clear(TICK_DEP_BIT_RCU);
> diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
> index caff16e441d1..a3f962eb7057 100644
> --- a/kernel/rcu/tree_exp.h
> +++ b/kernel/rcu/tree_exp.h
> @@ -751,12 +751,8 @@ static void rcu_exp_handler(void *unused)
>  	struct task_struct *t = current;
>  
>  	/*
> -	 * First, is there no need for a quiescent state from this CPU,
> -	 * or is this CPU already looking for a quiescent state for the
> -	 * current grace period?  If either is the case, just leave.
> -	 * However, this should not happen due to the preemptible
> -	 * sync_sched_exp_online_cleanup() implementation being a no-op,
> -	 * so warn if this does happen.
> +	 * WARN if the CPU is unexpectedly already looking for a
> +	 * QS or has already reported one.
>  	 */
>  	ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp);
>  	if (WARN_ON_ONCE(!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
> @@ -803,11 +799,6 @@ static void rcu_exp_handler(void *unused)
>  	WARN_ON_ONCE(1);
>  }
>  
> -/* PREEMPTION=y, so no PREEMPTION=n expedited grace period to clean up after. */
> -static void sync_sched_exp_online_cleanup(int cpu)
> -{
> -}
> -
>  /*
>   * Scan the current list of tasks blocked within RCU read-side critical
>   * sections, printing out the tid of each that is blocking the current
> @@ -885,38 +876,6 @@ static void rcu_exp_handler(void *unused)
>  	rcu_exp_need_qs();
>  }
>  
> -/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
> -static void sync_sched_exp_online_cleanup(int cpu)
> -{
> -	unsigned long flags;
> -	int my_cpu;
> -	struct rcu_data *rdp;
> -	int ret;
> -	struct rcu_node *rnp;
> -
> -	rdp = per_cpu_ptr(&rcu_data, cpu);
> -	rnp = rdp->mynode;
> -	my_cpu = get_cpu();
> -	/* Quiescent state either not needed or already requested, leave. */
> -	if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
> -	    READ_ONCE(rdp->cpu_no_qs.b.exp)) {
> -		put_cpu();
> -		return;
> -	}
> -	/* Quiescent state needed on current CPU, so set it up locally. */
> -	if (my_cpu == cpu) {
> -		local_irq_save(flags);
> -		rcu_exp_need_qs();
> -		local_irq_restore(flags);
> -		put_cpu();
> -		return;
> -	}
> -	/* Quiescent state needed on some other CPU, send IPI. */
> -	ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);
> -	put_cpu();
> -	WARN_ON_ONCE(ret);
> -}
> -
>  /*
>   * Because preemptible RCU does not exist, we never have to check for
>   * tasks blocked within RCU read-side critical sections that are
> -- 
> 2.46.0
>
Frederic Weisbecker Feb. 14, 2025, 12:10 p.m. UTC | #2
On Fri, Feb 14, 2025 at 01:01:56AM -0800, Paul E. McKenney wrote:
> On Fri, Feb 14, 2025 at 12:25:59AM +0100, Frederic Weisbecker wrote:
> > A CPU coming online checks for an ongoing grace period and reports
> > a quiescent state accordingly if needed. This special treatment that
> > shortcuts the expedited IPI finds its origin as an optimization purpose
> > on the following commit:
> > 
> > 	338b0f760e84 (rcu: Better hotplug handling for synchronize_sched_expedited()
> > 
> > The point is to avoid an IPI while waiting for a CPU to become online
> > or failing to become offline.
> > 
> > However this is pointless and even error prone for several reasons:
> > 
> > * If the CPU has been seen offline in the first round scanning offline
> >   and idle CPUs, no IPI is even tried and the quiescent state is
> >   reported on behalf of the CPU.
> > 
> > * This means that if the IPI fails, the CPU just became offline. So
> >   it's unlikely to become online right away, unless the cpu hotplug
> >   operation failed and rolled back, which is a rare event that can
> >   wait a jiffy for a new IPI to be issued.
> > 
> > * But then the "optimization" applying on failing CPU hotplug down only
> >   applies to !PREEMPT_RCU.
> > 
> > * This force reports a quiescent state even if ->cpu_no_qs.b.exp is not
> >   set. As a result it can race with remote QS reports on the same rdp.
> >   Fortunately it happens to be OK but an accident is waiting to happen.
> > 
> > For all those reasons, remove this optimization that doesn't look worthy
> > to keep around.
> 
> Thank you for digging into this!
> 
> When I ran tests that removed the call to sync_sched_exp_online_cleanup()
> a few months ago, I got grace-period hangs [1].  Has something changed
> to make this safe?

Hmm, but was it before or after "rcu: Fix get_state_synchronize_rcu_full()
GP-start detection"?

And if it was after, do we know why?

Thanks.
Paul E. McKenney Feb. 15, 2025, 10:38 a.m. UTC | #3
On Fri, Feb 14, 2025 at 01:10:52PM +0100, Frederic Weisbecker wrote:
> Le Fri, Feb 14, 2025 at 01:01:56AM -0800, Paul E. McKenney a écrit :
> > On Fri, Feb 14, 2025 at 12:25:59AM +0100, Frederic Weisbecker wrote:
> > > A CPU coming online checks for an ongoing grace period and reports
> > > a quiescent state accordingly if needed. This special treatment that
> > > shortcuts the expedited IPI finds its origin as an optimization purpose
> > > on the following commit:
> > > 
> > > 	338b0f760e84 (rcu: Better hotplug handling for synchronize_sched_expedited()
> > > 
> > > The point is to avoid an IPI while waiting for a CPU to become online
> > > or failing to become offline.
> > > 
> > > However this is pointless and even error prone for several reasons:
> > > 
> > > * If the CPU has been seen offline in the first round scanning offline
> > >   and idle CPUs, no IPI is even tried and the quiescent state is
> > >   reported on behalf of the CPU.
> > > 
> > > * This means that if the IPI fails, the CPU just became offline. So
> > >   it's unlikely to become online right away, unless the cpu hotplug
> > >   operation failed and rolled back, which is a rare event that can
> > >   wait a jiffy for a new IPI to be issued.

But the expedited grace period might be preempted for an arbitrarily
long period, especially if a hypervisor is in play.  And we do drop
that lock midway through...

> > > * But then the "optimization" applying on failing CPU hotplug down only
> > >   applies to !PREEMPT_RCU.

Yes, definitely only non-preemptible RCU.

> > > * This force reports a quiescent state even if ->cpu_no_qs.b.exp is not
> > >   set. As a result it can race with remote QS reports on the same rdp.
> > >   Fortunately it happens to be OK but an accident is waiting to happen.

To your point, I did in fact incorrectly decide that this was a bug.  ;-)

> > > For all those reasons, remove this optimization that doesn't look worthy
> > > to keep around.
> > 
> > Thank you for digging into this!
> > 
> > When I ran tests that removed the call to sync_sched_exp_online_cleanup()
> > a few months ago, I got grace-period hangs [1].  Has something changed
> > to make this safe?
> 
> Hmm, but was it before or after "rcu: Fix get_state_synchronize_rcu_full()
> GP-start detection" ?

Before.  There was also some buggy debug code in play.  Also, to get the
failure, it was necessary to make TREE03 disable preemption, as stock
TREE03 has an empty sync_sched_exp_online_cleanup() function.

I am rerunning the test with a WARN_ON_ONCE() after the early exit from
the sync_sched_exp_online_cleanup().  Of course, lack of a failure does
not necessarily indicate the absence of a problem.
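
For concreteness, here is a sketch of one possible placement of that debug
check, reconstructed from the sync_sched_exp_online_cleanup() body removed
by this patch; the actual test patch may have placed the check differently:

/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
static void sync_sched_exp_online_cleanup(int cpu)
{
	unsigned long flags;
	int my_cpu;
	struct rcu_data *rdp;
	int ret;
	struct rcu_node *rnp;

	rdp = per_cpu_ptr(&rcu_data, cpu);
	rnp = rdp->mynode;
	my_cpu = get_cpu();
	/* Quiescent state either not needed or already requested, leave. */
	if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
	    READ_ONCE(rdp->cpu_no_qs.b.exp)) {
		put_cpu();
		return;
	}
	/*
	 * Hypothetical debug check placed after the early exit: if this
	 * never fires, the function never has any real cleanup work to do.
	 */
	WARN_ON_ONCE(1);
	/* Quiescent state needed on current CPU, so set it up locally. */
	if (my_cpu == cpu) {
		local_irq_save(flags);
		rcu_exp_need_qs();
		local_irq_restore(flags);
		put_cpu();
		return;
	}
	/* Quiescent state needed on some other CPU, send IPI. */
	ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);
	put_cpu();
	WARN_ON_ONCE(ret);
}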

> And if after do we know why?

Here are some (possibly bogus) possibilities that came to mind:

1.	There is some coming-online race that deprives the incoming
	CPU of an IPI, but nevertheless marks that CPU as blocking the
	current grace period.

2.	Some strange scenario involves the CPU going offline for just a
	little bit, so that the IPI gets wasted on the outgoing due to
	neither of the "if" conditions in rcu_exp_handler() being true.
	The outgoing CPU just says "I need a QS", then leaves and
	comes back.  (The expedited grace period doesn't retry because
	it believes that it already sent that IPI.)

3.	Your ideas here!  ;-)

							Thanx, Paul
Frederic Weisbecker Feb. 15, 2025, 10:23 p.m. UTC | #4
On Sat, Feb 15, 2025 at 02:38:04AM -0800, Paul E. McKenney wrote:
> On Fri, Feb 14, 2025 at 01:10:52PM +0100, Frederic Weisbecker wrote:
> > Le Fri, Feb 14, 2025 at 01:01:56AM -0800, Paul E. McKenney a écrit :
> > > On Fri, Feb 14, 2025 at 12:25:59AM +0100, Frederic Weisbecker wrote:
> > > > A CPU coming online checks for an ongoing grace period and reports
> > > > a quiescent state accordingly if needed. This special treatment that
> > > > shortcuts the expedited IPI finds its origin as an optimization purpose
> > > > on the following commit:
> > > > 
> > > > 	338b0f760e84 (rcu: Better hotplug handling for synchronize_sched_expedited()
> > > > 
> > > > The point is to avoid an IPI while waiting for a CPU to become online
> > > > or failing to become offline.
> > > > 
> > > > However this is pointless and even error prone for several reasons:
> > > > 
> > > > * If the CPU has been seen offline in the first round scanning offline
> > > >   and idle CPUs, no IPI is even tried and the quiescent state is
> > > >   reported on behalf of the CPU.
> > > > 
> > > > * This means that if the IPI fails, the CPU just became offline. So
> > > >   it's unlikely to become online right away, unless the cpu hotplug
> > > >   operation failed and rolled back, which is a rare event that can
> > > >   wait a jiffy for a new IPI to be issued.
> 
> But the expedited grace period might be preempted for an arbitrarily
> long period, especially if a hypervisor is in play.  And we do drop
> that lock midway through...

Well, then that delays the expedited grace period as a whole anyway...

> > > > For all those reasons, remove this optimization that doesn't look worthy
> > > > to keep around.
> > > 
> > > Thank you for digging into this!
> > > 
> > > When I ran tests that removed the call to sync_sched_exp_online_cleanup()
> > > a few months ago, I got grace-period hangs [1].  Has something changed
> > > to make this safe?
> > 
> > Hmm, but was it before or after "rcu: Fix get_state_synchronize_rcu_full()
> > GP-start detection" ?
> 
> Before.  There was also some buggy debug code in play.  Also, to get the
> failure, it was necessary to make TREE03 disable preemption, as stock
> TREE03 has an empty sync_sched_exp_online_cleanup() function.
> 
> I am rerunning the test with a WARN_ON_ONCE() after the early exit from
> the sync_sched_exp_online_cleanup().  Of course, lack of a failure does
> not necessairly indicate

Cool, thanks!

> 
> > And if after do we know why?
> 
> Here are some (possibly bogus) possibilities that came to mind:
> 
> 1.	There is some coming-online race that deprives the incoming
> 	CPU of an IPI, but nevertheless marks that CPU as blocking the
> 	current grace period.

Arguably there is a tiny window between rcutree_report_cpu_starting()
and set_cpu_online() that could make ->qsmaskinitnext visible before
cpu_online() and therefore delay the IPI a bit. But I don't expect that
gap to last more than a jiffy. And if that's relevant, note that
only !PREEMPT_RCU is then "fixed" by sync_sched_exp_online_cleanup() here.
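
To illustrate, a hypothetical sketch of that ordering (the wrapper function
below is made up; only rcutree_report_cpu_starting() and set_cpu_online()
are the real steps):

/*
 * Hypothetical sketch of the incoming-CPU ordering described above:
 * RCU may see the CPU's ->qsmaskinitnext bit before cpu_online()
 * returns true, which can delay the expedited IPI a bit (not expected
 * to exceed a jiffy).
 */
static void sketch_secondary_cpu_start(unsigned int cpu)
{
	/* Sets the CPU's bit in its rcu_node's ->qsmaskinitnext. */
	rcutree_report_cpu_starting(cpu);

	/* Tiny window: visible to expedited GP scans, but !cpu_online(cpu). */

	/* Marks the CPU online; IPIs to it can now be delivered. */
	set_cpu_online(cpu, true);
}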

> 
> 2.	Some strange scenario involves the CPU going offline for just a
> 	little bit, so that the IPI gets wasted on the outgoing due to
> 	neither of the "if" conditions in rcu_exp_handler() being true.
> 	The outgoing CPU just says "I need a QS", then leaves and
> 	comes back.  (The expedited grace period doesn't retry because
> 	it believes that it already sent that IPI.)

I don't think this is possible. Once the CPU enters CPUHP_TEARDOWN_CPU with
stop_machine, no more IPIs can be issued. The remaining ones are executed
at CPUHP_AP_SMPCFD_DYING, still in stop_machine. So this is the last call
for rcu_exp_handler() execution. And this last call has to be followed
by rcu_note_context_switch() between stop_machine and the final switch to
idle, and that rcu_note_context_switch() must report the rdp's pending
expedited QS.

One easy way to assert that is:

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 86935fe00397..40d6090a33f5 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4347,6 +4347,12 @@ void rcutree_report_cpu_dead(void)
 	 * may introduce a new READ-side while it is actually off the QS masks.
 	 */
 	lockdep_assert_irqs_disabled();
+	/*
+	 * CPUHP_AP_SMPCFD_DYING was the last call for rcu_exp_handler() execution.
+	 * The requested QS must have been reported on the last context switch
+	 * from stop machine to idle.
+	 */
+	WARN_ON_ONCE(rdp->cpu_no_qs.b.exp);
 	// Do any dangling deferred wakeups.
 	do_nocb_deferred_wakeup(rdp);
 
> 
> 3.	Your ideas here!  ;-)

:-)

> 
> 							Thanx, Paul
Paul E. McKenney Feb. 19, 2025, 2:58 p.m. UTC | #5
On Sat, Feb 15, 2025 at 11:23:45PM +0100, Frederic Weisbecker wrote:
> Le Sat, Feb 15, 2025 at 02:38:04AM -0800, Paul E. McKenney a écrit :
> > On Fri, Feb 14, 2025 at 01:10:52PM +0100, Frederic Weisbecker wrote:
> > > Le Fri, Feb 14, 2025 at 01:01:56AM -0800, Paul E. McKenney a écrit :
> > > > On Fri, Feb 14, 2025 at 12:25:59AM +0100, Frederic Weisbecker wrote:
> > > > > A CPU coming online checks for an ongoing grace period and reports
> > > > > a quiescent state accordingly if needed. This special treatment that
> > > > > shortcuts the expedited IPI finds its origin as an optimization purpose
> > > > > on the following commit:
> > > > > 
> > > > > 	338b0f760e84 (rcu: Better hotplug handling for synchronize_sched_expedited()
> > > > > 
> > > > > The point is to avoid an IPI while waiting for a CPU to become online
> > > > > or failing to become offline.
> > > > > 
> > > > > However this is pointless and even error prone for several reasons:
> > > > > 
> > > > > * If the CPU has been seen offline in the first round scanning offline
> > > > >   and idle CPUs, no IPI is even tried and the quiescent state is
> > > > >   reported on behalf of the CPU.
> > > > > 
> > > > > * This means that if the IPI fails, the CPU just became offline. So
> > > > >   it's unlikely to become online right away, unless the cpu hotplug
> > > > >   operation failed and rolled back, which is a rare event that can
> > > > >   wait a jiffy for a new IPI to be issued.
> > 
> > But the expedited grace period might be preempted for an arbitrarily
> > long period, especially if a hypervisor is in play.  And we do drop
> > that lock midway through...
> 
> Well, then that delays the expedited grace period as a whole anyway...

Fair enough.  Part of this is the paranoia that has served me so well,
but which can also cause the occasional problem.  On the other hand,
we really do occasionally lose things during CPU hotplug operations...

> > > > > For all those reasons, remove this optimization that doesn't look worthy
> > > > > to keep around.
> > > > 
> > > > Thank you for digging into this!
> > > > 
> > > > When I ran tests that removed the call to sync_sched_exp_online_cleanup()
> > > > a few months ago, I got grace-period hangs [1].  Has something changed
> > > > to make this safe?
> > > 
> > > Hmm, but was it before or after "rcu: Fix get_state_synchronize_rcu_full()
> > > GP-start detection" ?
> > 
> > Before.  There was also some buggy debug code in play.  Also, to get the
> > failure, it was necessary to make TREE03 disable preemption, as stock
> > TREE03 has an empty sync_sched_exp_online_cleanup() function.
> > 
> > I am rerunning the test with a WARN_ON_ONCE() after the early exit from
> > the sync_sched_exp_online_cleanup().  Of course, lack of a failure does
> > not necessairly indicate
> 
> Cool, thanks!

No failures.  But might it be wise to put this WARN_ON_ONCE() in,
let things go for a year or two, and complete the removal if it never
triggers?  Or is the lack of forward-progress warning enough?

> > > And if after do we know why?
> > 
> > Here are some (possibly bogus) possibilities that came to mind:
> > 
> > 1.	There is some coming-online race that deprives the incoming
> > 	CPU of an IPI, but nevertheless marks that CPU as blocking the
> > 	current grace period.
> 
> Arguably there is a tiny window between rcutree_report_cpu_starting()
> and set_cpu_online() that could make ->qsmaskinitnext visible before
> cpu_online() and therefore delay the IPI a bit. But I don't expect
> more than a jiffy to fill up the gap. And if that's relevant, note that
> only !PREEMPT_RCU is then "fixed" by sync_sched_exp_online_cleanup() here.

Agreed.  And I vaguely recall that there was some difference due to
preemptible RCU's ability to clean up at the next rcu_read_unlock(),
though more recently, possibly deferred.

> > 2.	Some strange scenario involves the CPU going offline for just a
> > 	little bit, so that the IPI gets wasted on the outgoing due to
> > 	neither of the "if" conditions in rcu_exp_handler() being true.
> > 	The outgoing CPU just says "I need a QS", then leaves and
> > 	comes back.  (The expedited grace period doesn't retry because
> > 	it believes that it already sent that IPI.)
> 
> I don't think this is possible. Once the CPU enters CPUHP_TEARDOWN_CPU with
> stop_machine, no more IPIs can be issued. The remaining ones are executed
> at CPUHP_AP_SMPCFD_DYING, still in stop_machine. So this is the last call
> for rcu_exp_handler() execution. And this last call has to be followed
> by rcu_note_context_switch() between stop_machine and the final schedule to
> idle. And that rcu_note_context_switch() must report the rdp exp context
> switch.

Makes sense to me.

> One easy way to assert that is:
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 86935fe00397..40d6090a33f5 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -4347,6 +4347,12 @@ void rcutree_report_cpu_dead(void)
>  	 * may introduce a new READ-side while it is actually off the QS masks.
>  	 */
>  	lockdep_assert_irqs_disabled();
> +	/*
> +	 * CPUHP_AP_SMPCFD_DYING was the last call for rcu_exp_handler() execution.
> +	 * The requested QS must have been reported on the last context switch
> +	 * from stop machine to idle.
> +	 */
> +	WARN_ON_ONCE(rdp->cpu_no_qs.b.exp);
>  	// Do any dangling deferred wakeups.
>  	do_nocb_deferred_wakeup(rdp);

I fired off a 30-minute run of 100*TREE03 with this change also.

> > 3.	Your ideas here!  ;-)
> 
> :-)

							Thanx, Paul
Paul E. McKenney Feb. 19, 2025, 3:55 p.m. UTC | #6
On Wed, Feb 19, 2025 at 06:58:36AM -0800, Paul E. McKenney wrote:
> On Sat, Feb 15, 2025 at 11:23:45PM +0100, Frederic Weisbecker wrote:
> > Le Sat, Feb 15, 2025 at 02:38:04AM -0800, Paul E. McKenney a écrit :
> > > On Fri, Feb 14, 2025 at 01:10:52PM +0100, Frederic Weisbecker wrote:
> > > > Le Fri, Feb 14, 2025 at 01:01:56AM -0800, Paul E. McKenney a écrit :
> > > > > On Fri, Feb 14, 2025 at 12:25:59AM +0100, Frederic Weisbecker wrote:
> > > > > > A CPU coming online checks for an ongoing grace period and reports
> > > > > > a quiescent state accordingly if needed. This special treatment that
> > > > > > shortcuts the expedited IPI finds its origin as an optimization purpose
> > > > > > on the following commit:
> > > > > > 
> > > > > > 	338b0f760e84 (rcu: Better hotplug handling for synchronize_sched_expedited()
> > > > > > 
> > > > > > The point is to avoid an IPI while waiting for a CPU to become online
> > > > > > or failing to become offline.
> > > > > > 
> > > > > > However this is pointless and even error prone for several reasons:
> > > > > > 
> > > > > > * If the CPU has been seen offline in the first round scanning offline
> > > > > >   and idle CPUs, no IPI is even tried and the quiescent state is
> > > > > >   reported on behalf of the CPU.
> > > > > > 
> > > > > > * This means that if the IPI fails, the CPU just became offline. So
> > > > > >   it's unlikely to become online right away, unless the cpu hotplug
> > > > > >   operation failed and rolled back, which is a rare event that can
> > > > > >   wait a jiffy for a new IPI to be issued.
> > > 
> > > But the expedited grace period might be preempted for an arbitrarily
> > > long period, especially if a hypervisor is in play.  And we do drop
> > > that lock midway through...
> > 
> > Well, then that delays the expedited grace period as a whole anyway...
> 
> Fair enough.  Part of this is the paranoia that has served me so well.
> But which can also cause the occasional problem.  On the other hand,
> we really do occasionally lose things during CPU hotplug operations...
> 
> > > > > > For all those reasons, remove this optimization that doesn't look worthy
> > > > > > to keep around.
> > > > > 
> > > > > Thank you for digging into this!
> > > > > 
> > > > > When I ran tests that removed the call to sync_sched_exp_online_cleanup()
> > > > > a few months ago, I got grace-period hangs [1].  Has something changed
> > > > > to make this safe?
> > > > 
> > > > Hmm, but was it before or after "rcu: Fix get_state_synchronize_rcu_full()
> > > > GP-start detection" ?
> > > 
> > > Before.  There was also some buggy debug code in play.  Also, to get the
> > > failure, it was necessary to make TREE03 disable preemption, as stock
> > > TREE03 has an empty sync_sched_exp_online_cleanup() function.
> > > 
> > > I am rerunning the test with a WARN_ON_ONCE() after the early exit from
> > > the sync_sched_exp_online_cleanup().  Of course, lack of a failure does
> > > not necessairly indicate
> > 
> > Cool, thanks!
> 
> No failures.  But might it be wise to put this WARN_ON_ONCE() in,
> let things go for a year or two, and complete the removal if it never
> triggers?  Or is the lack of forward progress warning enough?
> 
> > > > And if after do we know why?
> > > 
> > > Here are some (possibly bogus) possibilities that came to mind:
> > > 
> > > 1.	There is some coming-online race that deprives the incoming
> > > 	CPU of an IPI, but nevertheless marks that CPU as blocking the
> > > 	current grace period.
> > 
> > Arguably there is a tiny window between rcutree_report_cpu_starting()
> > and set_cpu_online() that could make ->qsmaskinitnext visible before
> > cpu_online() and therefore delay the IPI a bit. But I don't expect
> > more than a jiffy to fill up the gap. And if that's relevant, note that
> > only !PREEMPT_RCU is then "fixed" by sync_sched_exp_online_cleanup() here.
> 
> Agreed.  And I vaguely recall that there was some difference due to
> preemptible RCU's ability to clean up at the next rcu_read_unlock(),
> though more recently, possibly deferred.
> 
> > > 2.	Some strange scenario involves the CPU going offline for just a
> > > 	little bit, so that the IPI gets wasted on the outgoing due to
> > > 	neither of the "if" conditions in rcu_exp_handler() being true.
> > > 	The outgoing CPU just says "I need a QS", then leaves and
> > > 	comes back.  (The expedited grace period doesn't retry because
> > > 	it believes that it already sent that IPI.)
> > 
> > I don't think this is possible. Once the CPU enters CPUHP_TEARDOWN_CPU with
> > stop_machine, no more IPIs can be issued. The remaining ones are executed
> > at CPUHP_AP_SMPCFD_DYING, still in stop_machine. So this is the last call
> > for rcu_exp_handler() execution. And this last call has to be followed
> > by rcu_note_context_switch() between stop_machine and the final schedule to
> > idle. And that rcu_note_context_switch() must report the rdp exp context
> > switch.
> 
> Makes sense to me.
> 
> > One easy way to assert that is:
> > 
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index 86935fe00397..40d6090a33f5 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -4347,6 +4347,12 @@ void rcutree_report_cpu_dead(void)
> >  	 * may introduce a new READ-side while it is actually off the QS masks.
> >  	 */
> >  	lockdep_assert_irqs_disabled();
> > +	/*
> > +	 * CPUHP_AP_SMPCFD_DYING was the last call for rcu_exp_handler() execution.
> > +	 * The requested QS must have been reported on the last context switch
> > +	 * from stop machine to idle.
> > +	 */
> > +	WARN_ON_ONCE(rdp->cpu_no_qs.b.exp);
> >  	// Do any dangling deferred wakeups.
> >  	do_nocb_deferred_wakeup(rdp);
> 
> I fired off a 30-minute run of 100*TREE03 with this change also.

And no failures!

							Thanx, Paul

> > > 3.	Your ideas here!  ;-)
> > 
> > :-)
> 
> 							Thanx, Paul

Patch

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8625f616c65a..86935fe00397 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -151,7 +151,6 @@  static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
 			      unsigned long gps, unsigned long flags);
 static void invoke_rcu_core(void);
 static void rcu_report_exp_rdp(struct rcu_data *rdp);
-static void sync_sched_exp_online_cleanup(int cpu);
 static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
 static bool rcu_rdp_is_offloaded(struct rcu_data *rdp);
 static bool rcu_rdp_cpu_online(struct rcu_data *rdp);
@@ -4259,7 +4258,6 @@  int rcutree_online_cpu(unsigned int cpu)
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
 		return 0; /* Too early in boot for scheduler work. */
-	sync_sched_exp_online_cleanup(cpu);
 
 	// Stop-machine done, so allow nohz_full to disable tick.
 	tick_dep_clear(TICK_DEP_BIT_RCU);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index caff16e441d1..a3f962eb7057 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -751,12 +751,8 @@  static void rcu_exp_handler(void *unused)
 	struct task_struct *t = current;
 
 	/*
-	 * First, is there no need for a quiescent state from this CPU,
-	 * or is this CPU already looking for a quiescent state for the
-	 * current grace period?  If either is the case, just leave.
-	 * However, this should not happen due to the preemptible
-	 * sync_sched_exp_online_cleanup() implementation being a no-op,
-	 * so warn if this does happen.
+	 * WARN if the CPU is unexpectedly already looking for a
+	 * QS or has already reported one.
 	 */
 	ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp);
 	if (WARN_ON_ONCE(!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
@@ -803,11 +799,6 @@  static void rcu_exp_handler(void *unused)
 	WARN_ON_ONCE(1);
 }
 
-/* PREEMPTION=y, so no PREEMPTION=n expedited grace period to clean up after. */
-static void sync_sched_exp_online_cleanup(int cpu)
-{
-}
-
 /*
  * Scan the current list of tasks blocked within RCU read-side critical
  * sections, printing out the tid of each that is blocking the current
@@ -885,38 +876,6 @@  static void rcu_exp_handler(void *unused)
 	rcu_exp_need_qs();
 }
 
-/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
-static void sync_sched_exp_online_cleanup(int cpu)
-{
-	unsigned long flags;
-	int my_cpu;
-	struct rcu_data *rdp;
-	int ret;
-	struct rcu_node *rnp;
-
-	rdp = per_cpu_ptr(&rcu_data, cpu);
-	rnp = rdp->mynode;
-	my_cpu = get_cpu();
-	/* Quiescent state either not needed or already requested, leave. */
-	if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
-	    READ_ONCE(rdp->cpu_no_qs.b.exp)) {
-		put_cpu();
-		return;
-	}
-	/* Quiescent state needed on current CPU, so set it up locally. */
-	if (my_cpu == cpu) {
-		local_irq_save(flags);
-		rcu_exp_need_qs();
-		local_irq_restore(flags);
-		put_cpu();
-		return;
-	}
-	/* Quiescent state needed on some other CPU, send IPI. */
-	ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);
-	put_cpu();
-	WARN_ON_ONCE(ret);
-}
-
 /*
  * Because preemptible RCU does not exist, we never have to check for
  * tasks blocked within RCU read-side critical sections that are