[v7,01/11] rcu: Wake up nocb gp thread on rcu_barrier_entrain()

Message ID 20221004024157.2470238-2-joel@joelfernandes.org (mailing list archive)
State Superseded
Series rcu: call_rcu() power improvements

Commit Message

Joel Fernandes Oct. 4, 2022, 2:41 a.m. UTC
From: Frederic Weisbecker <frederic@kernel.org>

In preparation of RCU lazy changes, wake up the RCU nocb gp thread if
needed after an entrain. Otherwise, the RCU barrier callback can wait in
the queue for several seconds before the lazy callbacks in front of it
are serviced.

Reported-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/tree.c      | 11 +++++++++++
 kernel/rcu/tree_nocb.h |  4 ++++
 2 files changed, 15 insertions(+)

Comments

Frederic Weisbecker Oct. 4, 2022, 10:28 p.m. UTC | #1
On Tue, Oct 04, 2022 at 02:41:47AM +0000, Joel Fernandes (Google) wrote:
> From: Frederic Weisbecker <frederic@kernel.org>
> 
> In preparation of RCU lazy changes, wake up the RCU nocb gp thread if

It's more than just prep work for a new feature, it's a regression fix.

> needed after an entrain. Otherwise, the RCU barrier callback can wait in
> the queue for several seconds before the lazy callbacks in front of it
> are serviced.

It's not about lazy callbacks here (but you can mention the fact that
waking nocb_gp if necessary after flushing bypass is a beneficial side
effect for further lazy implementation).

So here is the possible bad scenario:

1) CPU 0 is nocb, it queues a callback
2) CPU 0 goes idle (or userspace with nohz_full) forever
3) The grace period related to that callback elapses
4) The callback is moved to the done list (but is not invoked yet), there are no more pending callbacks for CPU 0
5) CPU 1 calls rcu_barrier() and entrains to CPU 0 cblist
6) CPU 1 waits forever
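To make the hang concrete, here is a toy user-space model of the above
(plain pthreads; all names are invented for illustration, this is not
kernel code): the "gp thread" only scans the list when signalled, so if
the entrain path forgets the signal, the barrier waiter blocks forever.

/* Toy model of the missed nocb_gp wake-up; not kernel code. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t gp_wake = PTHREAD_COND_INITIALIZER;      /* "nocb_gp" sleeps here */
static pthread_cond_t barrier_done = PTHREAD_COND_INITIALIZER;
static int pending_cbs;
static bool gp_signalled, barrier_complete;

static void *toy_nocb_gp(void *arg)     /* stands in for the nocb_gp kthread */
{
	pthread_mutex_lock(&lock);
	while (!gp_signalled)           /* sleeps until explicitly woken */
		pthread_cond_wait(&gp_wake, &lock);
	pending_cbs = 0;                /* "invoke" every pending callback */
	barrier_complete = true;
	pthread_cond_signal(&barrier_done);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t gp;

	pthread_create(&gp, NULL, toy_nocb_gp, NULL);
	pthread_mutex_lock(&lock);
	bool was_alldone = (pending_cbs == 0);  /* snapshot before entraining */
	pending_cbs++;                          /* entrain the barrier callback */
	if (was_alldone && pending_cbs > 0) {   /* the fix: wake when we were first */
		gp_signalled = true;            /* drop this branch and main() hangs */
		pthread_cond_signal(&gp_wake);
	}
	while (!barrier_complete)               /* rcu_barrier() waiting for the CB */
		pthread_cond_wait(&barrier_done, &lock);
	pthread_mutex_unlock(&lock);
	puts("barrier completed");
	return pthread_join(gp, NULL);
}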

> 
> Reported-by: Joel Fernandes (Google) <joel@joelfernandes.org>

Fixes: 5d6742b37727 ("rcu/nocb: Use rcu_segcblist for no-CBs CPUs")

Thanks.

> Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
> ---
>  kernel/rcu/tree.c      | 11 +++++++++++
>  kernel/rcu/tree_nocb.h |  4 ++++
>  2 files changed, 15 insertions(+)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 5ec97e3f7468..04f33191e5ed 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -3894,6 +3894,8 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
>  {
>  	unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence);
>  	unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap);
> +	bool wake_nocb = false;
> +	bool was_done = false;
>  
>  	lockdep_assert_held(&rcu_state.barrier_lock);
>  	if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq))
> @@ -3902,6 +3904,7 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
>  	rdp->barrier_head.func = rcu_barrier_callback;
>  	debug_rcu_head_queue(&rdp->barrier_head);
>  	rcu_nocb_lock(rdp);
> +	was_done = rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_pend_cbs(&rdp->cblist);
>  	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
>  	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
>  		atomic_inc(&rcu_state.barrier_cpu_count);
> @@ -3909,7 +3912,15 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
>  		debug_rcu_head_unqueue(&rdp->barrier_head);
>  		rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence);
>  	}
> +
> +	/*
> +	 * If bypass list was non-empty, wake up the nocb GP thread otherwise
> +	 * bypass/lazy CBs may not be noticed, and can cause real long delays!
> +	 */
> +	wake_nocb = was_done && rcu_segcblist_pend_cbs(&rdp->cblist);
>  	rcu_nocb_unlock(rdp);
> +	if (wake_nocb)
> +		wake_nocb_gp(rdp, false);
>  	smp_store_release(&rdp->barrier_seq_snap, gseq);
>  }
>  
> diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
> index f77a6d7e1356..6caade0683dd 100644
> --- a/kernel/rcu/tree_nocb.h
> +++ b/kernel/rcu/tree_nocb.h
> @@ -1558,6 +1558,10 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
>  {
>  }
>  
> +static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
> +{
> +}
> +
>  static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>  				  unsigned long j)
>  {
> -- 
> 2.38.0.rc1.362.ged0d419d3c-goog
>
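For reference, the check the patch adds reduces to a small predicate
around the entrain; here is a standalone toy sketch of just that logic
(the struct and helpers below are illustrative stand-ins, not the
kernel's rcu_segcblist API):

#include <stdbool.h>
#include <stdio.h>

struct toy_cblist { int pending; };    /* stand-in for rcu_segcblist */

static bool toy_pend_cbs(const struct toy_cblist *cl)
{
	return cl->pending > 0;
}

/*
 * Snapshot "all done" before entraining, then wake only if the barrier
 * callback became the first pending one: if other callbacks were already
 * pending, a wake-up is (or will be) owed through the normal paths.
 */
static bool toy_entrain_needs_wake(struct toy_cblist *cl, bool offloaded)
{
	bool was_done = offloaded && !toy_pend_cbs(cl);

	cl->pending++;                           /* rcu_segcblist_entrain() */
	return was_done && toy_pend_cbs(cl);     /* wake_nocb */
}

int main(void)
{
	struct toy_cblist idle = { 0 }, busy = { 3 }, nonoff = { 0 };

	printf("idle offloaded: wake=%d\n", toy_entrain_needs_wake(&idle, true));    /* 1 */
	printf("busy offloaded: wake=%d\n", toy_entrain_needs_wake(&busy, true));    /* 0 */
	printf("not offloaded:  wake=%d\n", toy_entrain_needs_wake(&nonoff, false)); /* 0 */
	return 0;
}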
Joel Fernandes Oct. 4, 2022, 10:57 p.m. UTC | #2
Hi Frederic,

On 10/4/2022 6:28 PM, Frederic Weisbecker wrote:
> On Tue, Oct 04, 2022 at 02:41:47AM +0000, Joel Fernandes (Google) wrote:
>> From: Frederic Weisbecker <frederic@kernel.org>
>>
>> In preparation of RCU lazy changes, wake up the RCU nocb gp thread if
> 
> It's more than just prep work for a new feature, it's a regression fix.

Oh ok, both our fixes are equivalent but I chose yours since it's cleaner. I was
fixing Lazy CBs since I can actually trigger this issue.

>> needed after an entrain. Otherwise, the RCU barrier callback can wait in
>> the queue for several seconds before the lazy callbacks in front of it
>> are serviced.
> 
> It's not about lazy callbacks here (but you can mention the fact that
> waking nocb_gp if necessary after flushing bypass is a beneficial side
> effect for further lazy implementation).
> 
> So here is the possible bad scenario:
> 
> 1) CPU 0 is nocb, it queues a callback
> 2) CPU 0 goes idle (or userspace with nohz_full) forever
> 3) The grace period related to that callback elapses
> 4) The callback is moved to the done list (but is not invoked yet), there are no more pending callbacks for CPU 0
> 5) CPU 1 calls rcu_barrier() and entrains to CPU 0 cblist

CPU 1 can only entrain into CPU 0 if the CPU is offline:

		if (!rcu_rdp_cpu_online(rdp)) {
			rcu_barrier_entrain(rdp);
			WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
			raw_spin_unlock_irqrestore(&rcu_state.barrier_lock,
			...
			continue;
		}

Otherwise an IPI does the entraining. So I do not see how CPU 0 being idle
causes the cross-CPU entraining.
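For illustration, the dispatch shape I mean is roughly this (a
compilable toy with stubbed helpers, not the actual rcu_barrier() code):

#include <stdbool.h>
#include <stdio.h>

#define TOY_NR_CPUS 4

static bool toy_online[TOY_NR_CPUS] = { false, true, true, true };

static void toy_entrain(int cpu)   /* direct cross-CPU entrain (offline path) */
{
	printf("CPU %d: entrained directly under barrier_lock\n", cpu);
}

static void toy_ipi(int cpu)       /* smp_call_function_single() path */
{
	printf("CPU %d: IPI, handler entrains on the CPU itself\n", cpu);
}

int main(void)
{
	/* Offline CPUs are entrained remotely; online CPUs entrain
	 * themselves from the IPI handler. In both cases nothing wakes
	 * nocb_gp afterwards, hence the fix. */
	for (int cpu = 0; cpu < TOY_NR_CPUS; cpu++) {
		if (!toy_online[cpu])
			toy_entrain(cpu);
		else
			toy_ipi(cpu);
	}
	return 0;
}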

> 6) CPU 1 waits forever

But, I agree it can still wait forever, once the IPI handler does the
entraining, since nothing will do the GP thread wakeup.

>>
>> Reported-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> 
> Fixes: 5d6742b37727 ("rcu/nocb: Use rcu_segcblist for no-CBs CPUs")

So, do you mind writing a proper patch with a proper commit message and Fixes
tag then? It can be independent of this series and add my Reported-by tag, thanks!

Thanks!

 - Joel
Frederic Weisbecker Oct. 5, 2022, 10:39 a.m. UTC | #3
On Tue, Oct 04, 2022 at 06:57:59PM -0400, Joel Fernandes wrote:
> >> needed after an entrain. Otherwise, the RCU barrier callback can wait in
> >> the queue for several seconds before the lazy callbacks in front of it
> >> are serviced.
> > 
> > It's not about lazy callbacks here (but you can mention the fact that
> > waking nocb_gp if necessary after flushing bypass is a beneficial side
> > effect for further lazy implementation).
> > 
> > So here is the possible bad scenario:
> > 
> > 1) CPU 0 is nocb, it queues a callback
> > 2) CPU 0 goes idle (or userspace with nohz_full) forever
> > 3) The grace period related to that callback elapses
> > 4) The callback is moved to the done list (but is not invoked yet), there are no more pending callbacks for CPU 0
> > 5) CPU 1 calls rcu_barrier() and entrains to CPU 0 cblist
> 
> CPU 1 can only entrain into CPU 0 if the CPU is offline:
> 
> 		if (!rcu_rdp_cpu_online(rdp)) {
> 			rcu_barrier_entrain(rdp);
> 			WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
> 			raw_spin_unlock_irqrestore(&rcu_state.barrier_lock,
> 			...
> 			continue;
> 		}

Ah good point. So CPU 1 sends an IPI to CPU 0 which entrains itself.
And then it looks like the result is the same.


> 
> Otherwise an IPI does the entraining. So I do not see how CPU 0 being idle
> causes the cross-CPU entraining.

It doesn't, but it shows that the CPU isn't going to enqueue any further
callback for a while. Even if it did, that may not solve the
situation, not until an RCU_NOCB_WAKE_FORCE is issued...
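
As a toy illustration of that last point (the threshold and names are
invented, only loosely modeled on the qhimark-based forced wake):

#include <stdbool.h>
#include <stdio.h>

#define TOY_FORCE_THRESHOLD 10000   /* invented stand-in for the force trigger */

/*
 * Once something is already pending, a new enqueue sees
 * was_alldone == false and arms no immediate wake; only piling up
 * past a high threshold finally forces one.
 */
static bool toy_enqueue_wakes(int *pending)
{
	bool was_alldone = (*pending == 0);

	(*pending)++;
	return was_alldone || *pending > TOY_FORCE_THRESHOLD;
}

int main(void)
{
	int pending = 1;   /* the stranded barrier callback */
	int wakes = 0;

	for (int i = 0; i < TOY_FORCE_THRESHOLD; i++)
		wakes += toy_enqueue_wakes(&pending);

	/* Only the last enqueue, past the threshold, triggers a wake. */
	printf("extra enqueues: %d, wakes: %d\n", TOY_FORCE_THRESHOLD, wakes);
	return 0;
}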

> 
> > 6) CPU 1 waits forever
> 
> But, I agree it can still wait forever, once the IPI handler does the
> entraining, since nothing will do the GP thread wakeup.
> 
> >>
> >> Reported-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > 
> > Fixes: 5d6742b37727 ("rcu/nocb: Use rcu_segcblist for no-CBs CPUs")
> 
> So, do you mind writing a proper patch with a proper commit message and Fixes
> tag then? It can be independent of this series and add my Reported-by tag,
> thanks!

Ok will do.

Thanks!

> 
> Thanks!
> 
>  - Joel
>
Joel Fernandes Oct. 7, 2022, 2:47 a.m. UTC | #4
On Tue, Oct 04, 2022 at 06:57:59PM -0400, Joel Fernandes wrote:
> Hi Frederic,
> 
> On 10/4/2022 6:28 PM, Frederic Weisbecker wrote:
> > On Tue, Oct 04, 2022 at 02:41:47AM +0000, Joel Fernandes (Google) wrote:
> >> From: Frederic Weisbecker <frederic@kernel.org>
> >>
> >> In preparation of RCU lazy changes, wake up the RCU nocb gp thread if
> > 
> > It's more than just prep work for a new feature, it's a regression fix.
> 
> Oh ok, both our fixes are equivalent but I chose yours since it's cleaner. I was
> fixing Lazy CBs since I can actually trigger this issue.
> 
> >> needed after an entrain. Otherwise, the RCU barrier callback can wait in
> >> the queue for several seconds before the lazy callbacks in front of it
> >> are serviced.
> > 
> > It's not about lazy callbacks here (but you can mention the fact that
> > waking nocb_gp if necessary after flushing bypass is a beneficial side
> > effect for further lazy implementation).
> > 
> > So here is the possible bad scenario:
> > 
> > 1) CPU 0 is nocb, it queues a callback
> > 2) CPU 0 goes idle (or userspace with nohz_full) forever
> > 3) The grace period related to that callback elapses
> > 4) The callback is moved to the done list (but is not invoked yet), there are no more pending callbacks for CPU 0
> > 5) CPU 1 calls rcu_barrier() and entrains to CPU 0 cblist
> 
> CPU 1 can only entrain into CPU 0 if the CPU is offline:
> 
> 		if (!rcu_rdp_cpu_online(rdp)) {
> 			rcu_barrier_entrain(rdp);
> 			WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
> 			raw_spin_unlock_irqrestore(&rcu_state.barrier_lock,
> 			...
> 			continue;
> 		}
> 
> Otherwise an IPI does the entraining. So I do not see how CPU 0 being idle
> causes the cross-CPU entraining.
> 
> > 6) CPU 1 waits forever
> 
> But, I agree it can still wait forever, once the IPI handler does the
> entraining, since nothing will do the GP thread wakeup.
> 
> >>
> >> Reported-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> > 
> > Fixes: 5d6742b37727 ("rcu/nocb: Use rcu_segcblist for no-CBs CPUs")
> 
> So, do you mind writing a proper patch with a proper commit message and Fixes
> tag then? It can be independent of this series and add my Reported-by tag, thanks!

And a small hunk went into the wrong patch, so I pulled it back into this one;
here is the updated version of this patch:

(Don't pull this as Frederic will post this one separately, just posting here
 for reference).
---8<-----------------------

From: Frederic Weisbecker <frederic@kernel.org>
Subject: [PATCH] rcu: Wake up nocb gp thread on rcu_barrier_entrain()

In preparation of RCU lazy changes, wake up the RCU nocb gp thread if
needed after an entrain. Otherwise, the RCU barrier callback can wait in
the queue for several seconds before the lazy callbacks in front of it
are serviced.

Reported-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/tree.c      | 11 +++++++++++
 kernel/rcu/tree.h      |  1 +
 kernel/rcu/tree_nocb.h |  4 ++++
 3 files changed, 16 insertions(+)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 5ec97e3f7468..04f33191e5ed 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3894,6 +3894,8 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
 {
 	unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence);
 	unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap);
+	bool wake_nocb = false;
+	bool was_done = false;
 
 	lockdep_assert_held(&rcu_state.barrier_lock);
 	if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq))
@@ -3902,6 +3904,7 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
 	rdp->barrier_head.func = rcu_barrier_callback;
 	debug_rcu_head_queue(&rdp->barrier_head);
 	rcu_nocb_lock(rdp);
+	was_done = rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_pend_cbs(&rdp->cblist);
 	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
 	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
 		atomic_inc(&rcu_state.barrier_cpu_count);
@@ -3909,7 +3912,15 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
 		debug_rcu_head_unqueue(&rdp->barrier_head);
 		rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence);
 	}
+
+	/*
+	 * If bypass list was non-empty, wake up the nocb GP thread otherwise
+	 * bypass/lazy CBs may not be noticed, and can cause real long delays!
+	 */
+	wake_nocb = was_done && rcu_segcblist_pend_cbs(&rdp->cblist);
 	rcu_nocb_unlock(rdp);
+	if (wake_nocb)
+		wake_nocb_gp(rdp, false);
 	smp_store_release(&rdp->barrier_seq_snap, gseq);
 }
 
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index d4a97e40ea9c..41faaf8eb872 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -443,6 +443,7 @@ static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 				  unsigned long j);
 static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 				bool *was_alldone, unsigned long flags);
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force);
 static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
 				 unsigned long flags);
 static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index f77a6d7e1356..6caade0683dd 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1558,6 +1558,10 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
 {
 }
 
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
+{
+}
+
 static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 				  unsigned long j)
 {
Frederic Weisbecker Oct. 7, 2022, 11:26 a.m. UTC | #5
On Fri, Oct 07, 2022 at 02:47:52AM +0000, Joel Fernandes wrote:
> On Tue, Oct 04, 2022 at 06:57:59PM -0400, Joel Fernandes wrote:
> 
> From: Frederic Weisbecker <frederic@kernel.org>
> Subject: [PATCH] rcu: Wake up nocb gp thread on rcu_barrier_entrain()
> 
> In preparation of RCU lazy changes, wake up the RCU nocb gp thread if
> needed after an entrain. Otherwise, the RCU barrier callback can wait in
> the queue for several seconds before the lazy callbacks in front of it
> are serviced.
> 
> Reported-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
> ---
>  kernel/rcu/tree.c      | 11 +++++++++++
>  kernel/rcu/tree.h      |  1 +
>  kernel/rcu/tree_nocb.h |  4 ++++
>  3 files changed, 16 insertions(+)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 5ec97e3f7468..04f33191e5ed 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -3894,6 +3894,8 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
>  {
>  	unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence);
>  	unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap);
> +	bool wake_nocb = false;
> +	bool was_done = false;
>  
>  	lockdep_assert_held(&rcu_state.barrier_lock);
>  	if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq))
> @@ -3902,6 +3904,7 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
>  	rdp->barrier_head.func = rcu_barrier_callback;
>  	debug_rcu_head_queue(&rdp->barrier_head);
>  	rcu_nocb_lock(rdp);
> +	was_done = rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_pend_cbs(&rdp->cblist);
>  	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
>  	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
>  		atomic_inc(&rcu_state.barrier_cpu_count);
> @@ -3909,7 +3912,15 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
>  		debug_rcu_head_unqueue(&rdp->barrier_head);
>  		rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence);
>  	}
> +
> +	/*
> +	 * If bypass list was non-empty, wake up the nocb GP thread otherwise
> +	 * bypass/lazy CBs may not be noticed, and can cause real long delays!
> +	 */
> +	wake_nocb = was_done && rcu_segcblist_pend_cbs(&rdp->cblist);
>  	rcu_nocb_unlock(rdp);
> +	if (wake_nocb)
> +		wake_nocb_gp(rdp, false);
>  	smp_store_release(&rdp->barrier_seq_snap, gseq);
>  }
>  
> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
> index d4a97e40ea9c..41faaf8eb872 100644
> --- a/kernel/rcu/tree.h
> +++ b/kernel/rcu/tree.h
> @@ -443,6 +443,7 @@ static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>  				  unsigned long j);
>  static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>  				bool *was_alldone, unsigned long flags);
> +static bool wake_nocb_gp(struct rcu_data *rdp, bool force);
>  static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
>  				 unsigned long flags);
>  static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
> diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
> index f77a6d7e1356..6caade0683dd 100644
> --- a/kernel/rcu/tree_nocb.h
> +++ b/kernel/rcu/tree_nocb.h
> @@ -1558,6 +1558,10 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
>  {
>  }
>  
> +static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
> +{

It's missing a return value...

> +}

So I planned to post an updated version (please find below) but I triggered
a rare TREE01 stall related to rcu_barrier(). I just can't remember if it was
before or after the patch :-s

Anyway, I'm holding off on it until I get more clues.

---
From: Frederic Weisbecker <frederic@kernel.org>
Date: Tue, 4 Oct 2022 02:41:47 +0000
Subject: [PATCH] rcu: Fix missing nocb gp wake on rcu_barrier()

Upon entraining a callback to a NOCB CPU, no further wake up is
performed on the corresponding nocb_gp kthread. As a result, the callback
and all the subsequent ones on that CPU may be ignored, at least until
an RCU_NOCB_WAKE_FORCE timer is ever armed.

Here is a possible bad scenario:

1) CPU 0 is NOCB unlike all other CPUs.
2) CPU 0 queues a callback
3) The grace period related to that callback elapses
4) The callback is moved to the done list (but is not invoked yet),
   there are no more pending callbacks for CPU 0
5) CPU 1 calls rcu_barrier() and sends an IPI to CPU 0
6) CPU 0 entrains the callback but doesn't wake up nocb_gp
7) CPU 1 blocks forever, unless CPU 0 ever queues enough further
   callbacks to arm an RCU_NOCB_WAKE_FORCE timer.

Make sure the necessary wake up is issued.

Reported-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Fixes: 5d6742b37727 ("rcu/nocb: Use rcu_segcblist for no-CBs CPUs")
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/tree.c      | 6 ++++++
 kernel/rcu/tree.h      | 1 +
 kernel/rcu/tree_nocb.h | 5 +++++
 3 files changed, 12 insertions(+)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 96d678c9cfb6..025f59f6f97f 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3914,6 +3914,8 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
 {
 	unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence);
 	unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap);
+	bool wake_nocb = false;
+	bool was_alldone = false;
 
 	lockdep_assert_held(&rcu_state.barrier_lock);
 	if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq))
@@ -3922,6 +3924,7 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
 	rdp->barrier_head.func = rcu_barrier_callback;
 	debug_rcu_head_queue(&rdp->barrier_head);
 	rcu_nocb_lock(rdp);
+	was_alldone = rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_pend_cbs(&rdp->cblist);
 	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
 	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
 		atomic_inc(&rcu_state.barrier_cpu_count);
@@ -3929,7 +3932,10 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
 		debug_rcu_head_unqueue(&rdp->barrier_head);
 		rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence);
 	}
+	wake_nocb = was_alldone && rcu_segcblist_pend_cbs(&rdp->cblist);
 	rcu_nocb_unlock(rdp);
+	if (wake_nocb)
+		wake_nocb_gp(rdp, false);
 	smp_store_release(&rdp->barrier_seq_snap, gseq);
 }
 
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index d4a97e40ea9c..925dd98f8b23 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -439,6 +439,7 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
 static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
 static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
 static void rcu_init_one_nocb(struct rcu_node *rnp);
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force);
 static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 				  unsigned long j);
 static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index f77a6d7e1356..094fd454b6c3 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1558,6 +1558,11 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
 {
 }
 
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
+{
+	return false;
+}
+
 static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 				  unsigned long j)
 {
Joel Fernandes Oct. 7, 2022, 12:46 p.m. UTC | #6
> On Oct 7, 2022, at 7:26 AM, Frederic Weisbecker <frederic@kernel.org> wrote:
> 
> On Fri, Oct 07, 2022 at 02:47:52AM +0000, Joel Fernandes wrote:
>> On Tue, Oct 04, 2022 at 06:57:59PM -0400, Joel Fernandes wrote:
>> 
>> From: Frederic Weisbecker <frederic@kernel.org>
>> Subject: [PATCH] rcu: Wake up nocb gp thread on rcu_barrier_entrain()
>> 
>> In preparation of RCU lazy changes, wake up the RCU nocb gp thread if
>> needed after an entrain. Otherwise, the RCU barrier callback can wait in
>> the queue for several seconds before the lazy callbacks in front of it
>> are serviced.
>> 
>> Reported-by: Joel Fernandes (Google) <joel@joelfernandes.org>
>> Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
>> ---
>> kernel/rcu/tree.c      | 11 +++++++++++
>> kernel/rcu/tree.h      |  1 +
>> kernel/rcu/tree_nocb.h |  4 ++++
>> 3 files changed, 16 insertions(+)
>> 
>> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
>> index 5ec97e3f7468..04f33191e5ed 100644
>> --- a/kernel/rcu/tree.c
>> +++ b/kernel/rcu/tree.c
>> @@ -3894,6 +3894,8 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
>> {
>>    unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence);
>>    unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap);
>> +    bool wake_nocb = false;
>> +    bool was_done = false;
>> 
>>    lockdep_assert_held(&rcu_state.barrier_lock);
>>    if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq))
>> @@ -3902,6 +3904,7 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
>>    rdp->barrier_head.func = rcu_barrier_callback;
>>    debug_rcu_head_queue(&rdp->barrier_head);
>>    rcu_nocb_lock(rdp);
>> +    was_done = rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_pend_cbs(&rdp->cblist);
>>    WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
>>    if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
>>        atomic_inc(&rcu_state.barrier_cpu_count);
>> @@ -3909,7 +3912,15 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
>>        debug_rcu_head_unqueue(&rdp->barrier_head);
>>        rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence);
>>    }
>> +
>> +    /*
>> +     * If bypass list was non-empty, wake up the nocb GP thread otherwise
>> +     * bypass/lazy CBs may not be noticed, and can cause real long delays!
>> +     */
>> +    wake_nocb = was_done && rcu_segcblist_pend_cbs(&rdp->cblist);
>>    rcu_nocb_unlock(rdp);
>> +    if (wake_nocb)
>> +        wake_nocb_gp(rdp, false);
>>    smp_store_release(&rdp->barrier_seq_snap, gseq);
>> }
>> 
>> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
>> index d4a97e40ea9c..41faaf8eb872 100644
>> --- a/kernel/rcu/tree.h
>> +++ b/kernel/rcu/tree.h
>> @@ -443,6 +443,7 @@ static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>>                  unsigned long j);
>> static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>>                bool *was_alldone, unsigned long flags);
>> +static bool wake_nocb_gp(struct rcu_data *rdp, bool force);
>> static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
>>                 unsigned long flags);
>> static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
>> diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
>> index f77a6d7e1356..6caade0683dd 100644
>> --- a/kernel/rcu/tree_nocb.h
>> +++ b/kernel/rcu/tree_nocb.h
>> @@ -1558,6 +1558,10 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
>> {
>> }
>> 
>> +static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
>> +{
> 
> It's missing a return value...

Ah, true. I had the return in my tree but not in the posting, sigh.

>> +}
> 
> So I planned to post an updated version (please find below) but I triggered
> a rare TREE01 stall related to rcu_barrier(). I just can't remember if it was
> before or after the patch :-s

Weird.

Thanks,

- Joel




> Anyway I'm holding it off until I get more clues.
> 
> ---
> From: Frederic Weisbecker <frederic@kernel.org>
> Date: Tue, 4 Oct 2022 02:41:47 +0000
> Subject: [PATCH] rcu: Fix missing nocb gp wake on rcu_barrier()
> 
> Upon entraining a callback to a NOCB CPU, no further wake up is
> performed on the corresponding nocb_gp kthread. As a result, the callback
> and all the subsequent ones on that CPU may be ignored, at least until
> an RCU_NOCB_WAKE_FORCE timer is ever armed.
> 
> Here is a possible bad scenario:
> 
> 1) CPU 0 is NOCB unlike all other CPUs.
> 2) CPU 0 queues a callback
> 3) The grace period related to that callback elapses
> 4) The callback is moved to the done list (but is not invoked yet),
>   there are no more pending callbacks for CPU 0
> 5) CPU 1 calls rcu_barrier() and sends an IPI to CPU 0
> 6) CPU 0 entrains the callback but doesn't wake up nocb_gp
> 7) CPU 1 blocks forever, unless CPU 0 ever queues enough further
>   callbacks to arm an RCU_NOCB_WAKE_FORCE timer.
> 
> Make sure the necessary wake up is issued.
> 
> Reported-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> Fixes: 5d6742b37727 ("rcu/nocb: Use rcu_segcblist for no-CBs CPUs")
> Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
> ---
> kernel/rcu/tree.c      | 6 ++++++
> kernel/rcu/tree.h      | 1 +
> kernel/rcu/tree_nocb.h | 5 +++++
> 3 files changed, 12 insertions(+)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 96d678c9cfb6..025f59f6f97f 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -3914,6 +3914,8 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
> {
>    unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence);
>    unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap);
> +    bool wake_nocb = false;
> +    bool was_alldone = false;
> 
>    lockdep_assert_held(&rcu_state.barrier_lock);
>    if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq))
> @@ -3922,6 +3924,7 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
>    rdp->barrier_head.func = rcu_barrier_callback;
>    debug_rcu_head_queue(&rdp->barrier_head);
>    rcu_nocb_lock(rdp);
> +    was_alldone = rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_pend_cbs(&rdp->cblist);
>    WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
>    if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
>        atomic_inc(&rcu_state.barrier_cpu_count);
> @@ -3929,7 +3932,10 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
>        debug_rcu_head_unqueue(&rdp->barrier_head);
>        rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence);
>    }
> +    wake_nocb = was_alldone && rcu_segcblist_pend_cbs(&rdp->cblist);
>    rcu_nocb_unlock(rdp);
> +    if (wake_nocb)
> +        wake_nocb_gp(rdp, false);
>    smp_store_release(&rdp->barrier_seq_snap, gseq);
> }
> 
> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
> index d4a97e40ea9c..925dd98f8b23 100644
> --- a/kernel/rcu/tree.h
> +++ b/kernel/rcu/tree.h
> @@ -439,6 +439,7 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
> static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
> static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
> static void rcu_init_one_nocb(struct rcu_node *rnp);
> +static bool wake_nocb_gp(struct rcu_data *rdp, bool force);
> static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>                  unsigned long j);
> static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
> diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
> index f77a6d7e1356..094fd454b6c3 100644
> --- a/kernel/rcu/tree_nocb.h
> +++ b/kernel/rcu/tree_nocb.h
> @@ -1558,6 +1558,11 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
> {
> }
> 
> +static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
> +{
> +    return false;
> +}
> +
> static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
>                  unsigned long j)
> {
> -- 
> 2.25.1
> 
> 
>