diff mbox series

[v2] rcu: Rework tick dependency setting into rcu_exp_handler()

Message ID 20230105034000.971748-1-qiang1.zhang@intel.com (mailing list archive)
State New, archived
Headers show
Series [v2] rcu: Rework tick dependency setting into rcu_exp_handler() | expand

Commit Message

Zqiang Jan. 5, 2023, 3:40 a.m. UTC
Currently, when we first detect that the expedited grace period has not
ended and a timeout has occurred, we set the tick dependency for CPUs
that have not yet reported a quiescent state in the rcu_node structure's
->expmask. However, to eliminate races between setting and clearing the
tick dependency, setting a CPU's tick dependency requires holding the
rcu_node structure's ->lock.

This commit moves tick dependency setting into rcu_exp_handler(): the tick
dependency is set when we fail to report a quiescent state and cleared
in rcu_report_exp_rdp(). [Based on a suggestion from Frederic Weisbecker.]

Signed-off-by: Zqiang <qiang1.zhang@intel.com>
---
 kernel/rcu/tree_exp.h | 43 ++++++++++++++++++++-----------------------
 1 file changed, 20 insertions(+), 23 deletions(-)

Comments

Paul E. McKenney Jan. 5, 2023, 5:11 a.m. UTC | #1
On Thu, Jan 05, 2023 at 11:40:00AM +0800, Zqiang wrote:
> Currently, when first find out the expedited grace period is not end
> and timeout occurred, we set tick dependency for CPUs that have not
> yet reported the quiescent state in the rcu_node structure's->expmask
> but need to eliminate races between set and clear tick dependency, setting
> CPU tick dependency need to hold rcu_node structure's->lock.
> 
> This commit move tick dependency setting into rcu_exp_handler(), set tick
> dependency when fail to report a quiescent state and clear tick dependency
> in rcu_report_exp_rdp(). [from Frederic Weisbecker suggestion]
> 
> Signed-off-by: Zqiang <qiang1.zhang@intel.com>

First, a big "thank you" to you and Frederic for investigating this
approach!

So which is better, this patch or the one that I already have queued?

The advantage of the patch of yours that I already have queued is that
CPUs that respond in some other way within a millisecond do not get hit
with an additional scheduling-clock interrupt.

On the other hand, if the CPU goes through a quiescent state before the
next scheduling-clock interrupt arrives, rcu_report_exp_cpu_mult() will
shut down the tick before it happens.  Plus if the CPU waits a full tick
before reaching a quiescent state, then the tick_dep_set_cpu() called from
synchronize_rcu_expedited_wait() is going to send along an IPI anyway.

Except that invoking tick_dep_set_cpu() on the CPU itself will also
do an IPI from tick_dep_set_cpu() because of IRQ_WORK_INIT_HARD(), right?
Which means that the patch below gets us an extra self-IPI, right?
Or am I misreading the code?

In addition, doesn't tick_dep_clear_cpu() just clear a bit?  Won't that
mean that the next scheduling-clock interrupt will happen, just that the
one after that won't?  (Give or take kernel-to-user or kernel-to-idle
transitions that might happen in the meantime.)

Am I missing something in my attempts to model the costs?

							Thanx, Paul

> ---
>  kernel/rcu/tree_exp.h | 43 ++++++++++++++++++++-----------------------
>  1 file changed, 20 insertions(+), 23 deletions(-)
> 
> diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
> index 7cc4856da081..f1e947675727 100644
> --- a/kernel/rcu/tree_exp.h
> +++ b/kernel/rcu/tree_exp.h
> @@ -586,39 +586,16 @@ static bool synchronize_rcu_expedited_wait_once(long tlimit)
>  static void synchronize_rcu_expedited_wait(void)
>  {
>  	int cpu;
> -	unsigned long j;
>  	unsigned long jiffies_stall;
>  	unsigned long jiffies_start;
>  	unsigned long mask;
>  	int ndetected;
> -	struct rcu_data *rdp;
>  	struct rcu_node *rnp;
>  	struct rcu_node *rnp_root = rcu_get_root();
> -	unsigned long flags;
>  
>  	trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait"));
>  	jiffies_stall = rcu_exp_jiffies_till_stall_check();
>  	jiffies_start = jiffies;
> -	if (tick_nohz_full_enabled() && rcu_inkernel_boot_has_ended()) {
> -		if (synchronize_rcu_expedited_wait_once(1))
> -			return;
> -		rcu_for_each_leaf_node(rnp) {
> -			raw_spin_lock_irqsave_rcu_node(rnp, flags);
> -			mask = READ_ONCE(rnp->expmask);
> -			for_each_leaf_node_cpu_mask(rnp, cpu, mask) {
> -				rdp = per_cpu_ptr(&rcu_data, cpu);
> -				if (rdp->rcu_forced_tick_exp)
> -					continue;
> -				rdp->rcu_forced_tick_exp = true;
> -				if (cpu_online(cpu))
> -					tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
> -			}
> -			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
> -		}
> -		j = READ_ONCE(jiffies_till_first_fqs);
> -		if (synchronize_rcu_expedited_wait_once(j + HZ))
> -			return;
> -	}
>  
>  	for (;;) {
>  		if (synchronize_rcu_expedited_wait_once(jiffies_stall))
> @@ -729,6 +706,24 @@ static void rcu_exp_sel_wait_wake(unsigned long s)
>  
>  #ifdef CONFIG_PREEMPT_RCU
>  
> +static void rcu_exp_set_tickdep(struct rcu_data *rdp)
> +{
> +	int cpu = rdp->cpu;
> +
> +	if (tick_nohz_full_cpu(cpu) && rcu_inkernel_boot_has_ended() &&
> +						cpu_online(cpu)) {
> +		/*
> +		 * The rcu_exp_handler() can be invoked from stop machine,
> +		 * at this time CPU has disabled all interrupts and offline,
> +		 * in which case, we don't need requeue IPI or irq work.
> +		 * which is not a problem since rcu_report_dead() does the
> +		 * QS report.
> +		 */
> +		rdp->rcu_forced_tick_exp = true;
> +		tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
> +	}
> +}
> +
>  /*
>   * Remote handler for smp_call_function_single().  If there is an
>   * RCU read-side critical section in effect, request that the
> @@ -757,6 +752,7 @@ static void rcu_exp_handler(void *unused)
>  			WRITE_ONCE(rdp->cpu_no_qs.b.exp, true);
>  			set_tsk_need_resched(t);
>  			set_preempt_need_resched();
> +			rcu_exp_set_tickdep(rdp);
>  		}
>  		return;
>  	}
> @@ -778,6 +774,7 @@ static void rcu_exp_handler(void *unused)
>  		if (rnp->expmask & rdp->grpmask) {
>  			WRITE_ONCE(rdp->cpu_no_qs.b.exp, true);
>  			t->rcu_read_unlock_special.b.exp_hint = true;
> +			rcu_exp_set_tickdep(rdp);
>  		}
>  		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
>  		return;
> -- 
> 2.25.1
>
Zqiang Jan. 6, 2023, 2:42 a.m. UTC | #2
On Thu, Jan 05, 2023 at 11:40:00AM +0800, Zqiang wrote:
> Currently, when first find out the expedited grace period is not end 
> and timeout occurred, we set tick dependency for CPUs that have not 
> yet reported the quiescent state in the rcu_node structure's->expmask 
> but need to eliminate races between set and clear tick dependency, 
> setting CPU tick dependency need to hold rcu_node structure's->lock.
> 
> This commit move tick dependency setting into rcu_exp_handler(), set 
> tick dependency when fail to report a quiescent state and clear tick 
> dependency in rcu_report_exp_rdp(). [from Frederic Weisbecker 
> suggestion]
> 
> Signed-off-by: Zqiang <qiang1.zhang@intel.com>
>
>First, a big "thank you" to you an Frederic for investigating this approach!
>
>So which is better, this patch or the one that I already have queued?
>
>The advantage of the patch of yours that I already have queued is that CPUs that respond in some other way within a millisecond do not get hit with an additional scheduling-clock interrupt.
>
>On the other hand, if the CPU goes through a quiescent state before the next scheduling-clock interrupt arrives, rcu_report_exp_cpu_mult() will shut down the tick before it happens.  Plus if the CPU waits a full tick before reaching a quiescent state, then the tick_dep_set_cpu() called from
>synchronize_rcu_expedited_wait() is going to send along an IPI anyway.

Agreed, this new patch sets the tick dependency immediately when we can't report a quiescent state
in rcu_exp_handler(), which seems a little too aggressive.


>
>Except that invoking tick_dep_set_cpu() on the CPU itself will also do an IPI from tick_dep_set_cpu() because of IRQ_WORK_INIT_HARD(), right?
>Which means that the patch below gets us an extra self-IPI, right?
>Or am I misreading the code?

Yes, this looks like it will trigger an additional IPI.

>
>In addition, doesn't tick_dep_clear_cpu() just clear a bit?  Won't that mean that the next scheduling-clock interrupt will happen, just that the one after that won't?  (Give or take kernel-to-user or kernel-to-idle transitions that might happen in the meantime.)

Yes, tick_dep_clear_cpu() only clears a bit; the next scheduling-clock interrupt will still happen.

So I also want to know which one is better.
Frederic Weisbecker Jan. 6, 2023, 10:45 p.m. UTC | #3
On Fri, Jan 06, 2023 at 02:42:59AM +0000, Zhang, Qiang1 wrote:
> On Thu, Jan 05, 2023 at 11:40:00AM +0800, Zqiang wrote:
> > Currently, when first find out the expedited grace period is not end 
> > and timeout occurred, we set tick dependency for CPUs that have not 
> > yet reported the quiescent state in the rcu_node structure's->expmask 
> > but need to eliminate races between set and clear tick dependency, 
> > setting CPU tick dependency need to hold rcu_node structure's->lock.
> > 
> > This commit move tick dependency setting into rcu_exp_handler(), set 
> > tick dependency when fail to report a quiescent state and clear tick 
> > dependency in rcu_report_exp_rdp(). [from Frederic Weisbecker 
> > suggestion]
> > 
> > Signed-off-by: Zqiang <qiang1.zhang@intel.com>
> >
> >First, a big "thank you" to you an Frederic for investigating this approach!
> >
> >So which is better, this patch or the one that I already have queued?
> >
> >The advantage of the patch of yours that I already have queued is that CPUs that respond in some other way within a millisecond do not get hit with an additional scheduling-clock interrupt.
> >
> >On the other hand, if the CPU goes through a quiescent state before the next scheduling-clock interrupt arrives, rcu_report_exp_cpu_mult() will shut down the tick before it happens.  Plus if the CPU waits a full tick before reaching a quiescent state, then the tick_dep_set_cpu() called from
> >synchronize_rcu_expedited_wait() is going to send along an IPI anyway.
> 
> Agreed, this new patch is set tick dependency immediately when we can't report a quiescent state
> in rcu_exp_handler(), this seems a little too aggressive.
> 
> 
> >
> >Except that invoking tick_dep_set_cpu() on the CPU itself will also do an IPI from tick_dep_set_cpu() because of IRQ_WORK_INIT_HARD(), right?
> >Which means that the patch below gets us an extra self-IPI, right?
> >Or am I misreading the code?
> 
> Yes, This looks like it will trigger an additional IPI interrupt.
> 
> >
> >In addition, doesn't tick_dep_clear_cpu() just clear a bit?  Won't that mean that the next scheduling-clock interrupt will happen, just that the one after that won't?  (Give or take kernel-to-user or kernel-to-idle transitions that might happen in the meantime.)
> 
> Yes, tick_dep_clear_cpu() just only clear a bit. next scheduling-clock interrupt will happen.
> 
> So I also want to know which one is better 
Paul E. McKenney Jan. 7, 2023, 4:31 p.m. UTC | #4
On Fri, Jan 06, 2023 at 11:45:46PM +0100, Frederic Weisbecker wrote:
> On Fri, Jan 06, 2023 at 02:42:59AM +0000, Zhang, Qiang1 wrote:
> > On Thu, Jan 05, 2023 at 11:40:00AM +0800, Zqiang wrote:
> > > Currently, when first find out the expedited grace period is not end 
> > > and timeout occurred, we set tick dependency for CPUs that have not 
> > > yet reported the quiescent state in the rcu_node structure's->expmask 
> > > but need to eliminate races between set and clear tick dependency, 
> > > setting CPU tick dependency need to hold rcu_node structure's->lock.
> > > 
> > > This commit move tick dependency setting into rcu_exp_handler(), set 
> > > tick dependency when fail to report a quiescent state and clear tick 
> > > dependency in rcu_report_exp_rdp(). [from Frederic Weisbecker 
> > > suggestion]
> > > 
> > > Signed-off-by: Zqiang <qiang1.zhang@intel.com>
> > >
> > >First, a big "thank you" to you an Frederic for investigating this approach!
> > >
> > >So which is better, this patch or the one that I already have queued?
> > >
> > >The advantage of the patch of yours that I already have queued is that CPUs that respond in some other way within a millisecond do not get hit with an additional scheduling-clock interrupt.
> > >
> > >On the other hand, if the CPU goes through a quiescent state before the next scheduling-clock interrupt arrives, rcu_report_exp_cpu_mult() will shut down the tick before it happens.  Plus if the CPU waits a full tick before reaching a quiescent state, then the tick_dep_set_cpu() called from
> > >synchronize_rcu_expedited_wait() is going to send along an IPI anyway.
> > 
> > Agreed, this new patch is set tick dependency immediately when we can't report a quiescent state
> > in rcu_exp_handler(), this seems a little too aggressive.
> > 
> > 
> > >
> > >Except that invoking tick_dep_set_cpu() on the CPU itself will also do an IPI from tick_dep_set_cpu() because of IRQ_WORK_INIT_HARD(), right?
> > >Which means that the patch below gets us an extra self-IPI, right?
> > >Or am I misreading the code?
> > 
> > Yes, This looks like it will trigger an additional IPI interrupt.
> > 
> > >
> > >In addition, doesn't tick_dep_clear_cpu() just clear a bit?  Won't that mean that the next scheduling-clock interrupt will happen, just that the one after that won't?  (Give or take kernel-to-user or kernel-to-idle transitions that might happen in the meantime.)
> > 
> > Yes, tick_dep_clear_cpu() just only clear a bit. next scheduling-clock interrupt will happen.
> > 
> > So I also want to know which one is better 
diff mbox series

Patch

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 7cc4856da081..f1e947675727 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -586,39 +586,16 @@  static bool synchronize_rcu_expedited_wait_once(long tlimit)
 static void synchronize_rcu_expedited_wait(void)
 {
 	int cpu;
-	unsigned long j;
 	unsigned long jiffies_stall;
 	unsigned long jiffies_start;
 	unsigned long mask;
 	int ndetected;
-	struct rcu_data *rdp;
 	struct rcu_node *rnp;
 	struct rcu_node *rnp_root = rcu_get_root();
-	unsigned long flags;
 
 	trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait"));
 	jiffies_stall = rcu_exp_jiffies_till_stall_check();
 	jiffies_start = jiffies;
-	if (tick_nohz_full_enabled() && rcu_inkernel_boot_has_ended()) {
-		if (synchronize_rcu_expedited_wait_once(1))
-			return;
-		rcu_for_each_leaf_node(rnp) {
-			raw_spin_lock_irqsave_rcu_node(rnp, flags);
-			mask = READ_ONCE(rnp->expmask);
-			for_each_leaf_node_cpu_mask(rnp, cpu, mask) {
-				rdp = per_cpu_ptr(&rcu_data, cpu);
-				if (rdp->rcu_forced_tick_exp)
-					continue;
-				rdp->rcu_forced_tick_exp = true;
-				if (cpu_online(cpu))
-					tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
-			}
-			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-		}
-		j = READ_ONCE(jiffies_till_first_fqs);
-		if (synchronize_rcu_expedited_wait_once(j + HZ))
-			return;
-	}
 
 	for (;;) {
 		if (synchronize_rcu_expedited_wait_once(jiffies_stall))
@@ -729,6 +706,24 @@  static void rcu_exp_sel_wait_wake(unsigned long s)
 
 #ifdef CONFIG_PREEMPT_RCU
 
+static void rcu_exp_set_tickdep(struct rcu_data *rdp)
+{
+	int cpu = rdp->cpu;
+
+	if (tick_nohz_full_cpu(cpu) && rcu_inkernel_boot_has_ended() &&
+						cpu_online(cpu)) {
+		/*
+		 * rcu_exp_handler() can be invoked from stop-machine, at
+		 * which point the CPU may be offline with all interrupts
+		 * disabled.  In that case there is no need to requeue the
+		 * IPI or irq work, which is not a problem because
+		 * rcu_report_dead() does the QS report.
+		 */
+		rdp->rcu_forced_tick_exp = true;
+		tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
+	}
+}
+
 /*
  * Remote handler for smp_call_function_single().  If there is an
  * RCU read-side critical section in effect, request that the
@@ -757,6 +752,7 @@  static void rcu_exp_handler(void *unused)
 			WRITE_ONCE(rdp->cpu_no_qs.b.exp, true);
 			set_tsk_need_resched(t);
 			set_preempt_need_resched();
+			rcu_exp_set_tickdep(rdp);
 		}
 		return;
 	}
@@ -778,6 +774,7 @@  static void rcu_exp_handler(void *unused)
 		if (rnp->expmask & rdp->grpmask) {
 			WRITE_ONCE(rdp->cpu_no_qs.b.exp, true);
 			t->rcu_read_unlock_special.b.exp_hint = true;
+			rcu_exp_set_tickdep(rdp);
 		}
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 		return;