Message ID | 20220620225128.3842050-4-paulmck@kernel.org (mailing list archive) |
---|---|
State | New, archived |
Series | Polled grace-period updates for v5.20 |
Hi Paul, On Mon, Jun 20, 2022 at 03:51:20PM -0700, Paul E. McKenney wrote: > This commit switches the existing polled grace-period APIs to use a > new ->gp_seq_polled counter in the rcu_state structure. An additional > ->gp_seq_polled_snap counter in that same structure allows the normal > grace period kthread to interact properly with the !SMP !PREEMPT fastpath > through synchronize_rcu(). The first of the two to note the end of a > given grace period will make knowledge of this transition available to > the polled API. > > This commit is in preparation for polled expedited grace periods. > > Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ > Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing > Cc: Brian Foster <bfoster@redhat.com> > Cc: Dave Chinner <david@fromorbit.com> > Cc: Al Viro <viro@zeniv.linux.org.uk> > Cc: Ian Kent <raven@themaw.net> > Signed-off-by: Paul E. McKenney <paulmck@kernel.org> > --- > kernel/rcu/tree.c | 90 +++++++++++++++++++++++++++++++++++++++++++++-- > kernel/rcu/tree.h | 2 ++ > 2 files changed, 89 insertions(+), 3 deletions(-) > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c > index 46cfceea87847..637e8f9454573 100644 > --- a/kernel/rcu/tree.c > +++ b/kernel/rcu/tree.c > @@ -1775,6 +1775,78 @@ static void rcu_strict_gp_boundary(void *unused) > invoke_rcu_core(); > } > > +// Has rcu_init() been invoked? This is used (for example) to determine > +// whether spinlocks may be acquired safely. > +static bool rcu_init_invoked(void) > +{ > + return !!rcu_state.n_online_cpus; > +} > + > +// Make the polled API aware of the beginning of a grace period. > +static void rcu_poll_gp_seq_start(unsigned long *snap) > +{ > + struct rcu_node *rnp = rcu_get_root(); > + > + if (rcu_init_invoked()) > + raw_lockdep_assert_held_rcu_node(rnp); > + > + // If RCU was idle, note beginning of GP. > + if (!rcu_seq_state(rcu_state.gp_seq_polled)) > + rcu_seq_start(&rcu_state.gp_seq_polled); > + > + // Either way, record current state. > + *snap = rcu_state.gp_seq_polled; > +} > + > +// Make the polled API aware of the end of a grace period. > +static void rcu_poll_gp_seq_end(unsigned long *snap) > +{ > + struct rcu_node *rnp = rcu_get_root(); > + > + if (rcu_init_invoked()) > + raw_lockdep_assert_held_rcu_node(rnp); > + > + // If the the previously noted GP is still in effect, record the > + // end of that GP. Either way, zero counter to avoid counter-wrap > + // problems. > + if (*snap && *snap == rcu_state.gp_seq_polled) { > + rcu_seq_end(&rcu_state.gp_seq_polled); > + rcu_state.gp_seq_polled_snap = 0; > + } else { > + *snap = 0; > + } > +} > + > +// Make the polled API aware of the beginning of a grace period, but > +// where caller does not hold the root rcu_node structure's lock. > +static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap) > +{ > + struct rcu_node *rnp = rcu_get_root(); > + > + if (rcu_init_invoked()) { > + lockdep_assert_irqs_enabled(); > + raw_spin_lock_irq_rcu_node(rnp); > + } > + rcu_poll_gp_seq_start(snap); > + if (rcu_init_invoked()) > + raw_spin_unlock_irq_rcu_node(rnp); > +} > + > +// Make the polled API aware of the end of a grace period, but where > +// caller does not hold the root rcu_node structure's lock. 
> +static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap) > +{ > + struct rcu_node *rnp = rcu_get_root(); > + > + if (rcu_init_invoked()) { > + lockdep_assert_irqs_enabled(); > + raw_spin_lock_irq_rcu_node(rnp); > + } > + rcu_poll_gp_seq_end(snap); > + if (rcu_init_invoked()) > + raw_spin_unlock_irq_rcu_node(rnp); > +} > + > /* > * Initialize a new grace period. Return false if no grace period required. > */ > @@ -1810,6 +1882,7 @@ static noinline_for_stack bool rcu_gp_init(void) > rcu_seq_start(&rcu_state.gp_seq); > ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); > trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); > + rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap); > raw_spin_unlock_irq_rcu_node(rnp); > > /* > @@ -2069,6 +2142,7 @@ static noinline void rcu_gp_cleanup(void) > * safe for us to drop the lock in order to mark the grace > * period as completed in all of the rcu_node structures. > */ > + rcu_poll_gp_seq_end(&rcu_state.gp_seq_polled_snap); > raw_spin_unlock_irq_rcu_node(rnp); > > /* > @@ -3837,8 +3911,18 @@ void synchronize_rcu(void) > lock_is_held(&rcu_lock_map) || > lock_is_held(&rcu_sched_lock_map), > "Illegal synchronize_rcu() in RCU read-side critical section"); > - if (rcu_blocking_is_gp()) > + if (rcu_blocking_is_gp()) { > + // Note well that this code runs with !PREEMPT && !SMP. > + // In addition, all code that advances grace periods runs > + // at process level. Therefore, this GP overlaps with other > + // GPs only by being fully nested within them, which allows > + // reuse of ->gp_seq_polled_snap. > + rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap); > + rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap); > + if (rcu_init_invoked()) > + cond_resched_tasks_rcu_qs(); > return; // Context allows vacuous grace periods. > + } > if (rcu_gp_is_expedited()) > synchronize_rcu_expedited(); > else > @@ -3860,7 +3944,7 @@ unsigned long get_state_synchronize_rcu(void) > * before the load from ->gp_seq. > */ > smp_mb(); /* ^^^ */ > - return rcu_seq_snap(&rcu_state.gp_seq); > + return rcu_seq_snap(&rcu_state.gp_seq_polled); I happened to run into this. There is one usage of get_state_synchronize_rcu() in start_poll_synchronize_rcu(), in which the return value of get_state_synchronize_rcu() ("gp_seq") will be used for rcu_start_this_gp(). I don't think this is quite right, because after this change, rcu_state.gp_seq and rcu_state.gp_seq_polled are different values, in fact ->gp_seq_polled is greater than ->gp_seq by how many synchronize_rcu() is called in early boot. Am I missing something here? Regards, Boqun > } > EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); > > @@ -3925,7 +4009,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); > bool poll_state_synchronize_rcu(unsigned long oldstate) > { > if (oldstate == RCU_GET_STATE_COMPLETED || > - rcu_seq_done_exact(&rcu_state.gp_seq, oldstate)) { > + rcu_seq_done_exact(&rcu_state.gp_seq_polled, oldstate)) { > smp_mb(); /* Ensure GP ends before subsequent accesses. */ > return true; > } > diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h > index 2ccf5845957df..9c853033f159d 100644 > --- a/kernel/rcu/tree.h > +++ b/kernel/rcu/tree.h > @@ -323,6 +323,8 @@ struct rcu_state { > short gp_state; /* GP kthread sleep state. */ > unsigned long gp_wake_time; /* Last GP kthread wake. */ > unsigned long gp_wake_seq; /* ->gp_seq at ^^^. */ > + unsigned long gp_seq_polled; /* GP seq for polled API. */ > + unsigned long gp_seq_polled_snap; /* ->gp_seq_polled at normal GP start. 
*/ > > /* End of fields guarded by root rcu_node's lock. */ > > -- > 2.31.1.189.g2e36527f23 >
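The divergence described above follows from the rcu_seq_*() counter format, in which the low two bits carry grace-period state and the remaining bits count grace periods. The standalone C program below is a simplified model of that arithmetic (the constants and helpers are patterned on kernel/rcu/rcu.h, but this is an illustration rather than the kernel code): each early-boot synchronize_rcu() advances the modeled ->gp_seq_polled by one full grace period while ->gp_seq stays put.

#include <stdio.h>

/* Simplified model of the kernel's rcu_seq_*() helpers: the low two bits
 * hold grace-period state, the upper bits count completed grace periods. */
#define RCU_SEQ_CTR_SHIFT	2
#define RCU_SEQ_STATE_MASK	((1UL << RCU_SEQ_CTR_SHIFT) - 1)

static unsigned long rcu_seq_state(unsigned long s)
{
	return s & RCU_SEQ_STATE_MASK;
}

static void rcu_seq_start(unsigned long *sp)	/* mark a GP as in progress */
{
	*sp += 1;
}

static void rcu_seq_end(unsigned long *sp)	/* mark that GP as complete */
{
	*sp = (*sp | RCU_SEQ_STATE_MASK) + 1;
}

int main(void)
{
	unsigned long gp_seq = 0;		/* models rcu_state.gp_seq */
	unsigned long gp_seq_polled = 0;	/* models rcu_state.gp_seq_polled */
	int i;

	/* Three early-boot synchronize_rcu() calls: with the patch applied,
	 * each one records a vacuous GP in ->gp_seq_polled only. */
	for (i = 0; i < 3; i++) {
		if (!rcu_seq_state(gp_seq_polled))
			rcu_seq_start(&gp_seq_polled);
		rcu_seq_end(&gp_seq_polled);
	}

	/* Prints "gp_seq = 0, gp_seq_polled = 12": the two counters no longer
	 * agree, so a ->gp_seq_polled-based snapshot is the wrong value to
	 * hand to rcu_start_this_gp(), which works in ->gp_seq units. */
	printf("gp_seq = %lu, gp_seq_polled = %lu\n", gp_seq, gp_seq_polled);
	return 0;
}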
On Wed, Jul 20, 2022 at 05:53:38PM -0700, Boqun Feng wrote: > Hi Paul, > > On Mon, Jun 20, 2022 at 03:51:20PM -0700, Paul E. McKenney wrote: > > This commit switches the existing polled grace-period APIs to use a > > new ->gp_seq_polled counter in the rcu_state structure. An additional > > ->gp_seq_polled_snap counter in that same structure allows the normal > > grace period kthread to interact properly with the !SMP !PREEMPT fastpath > > through synchronize_rcu(). The first of the two to note the end of a > > given grace period will make knowledge of this transition available to > > the polled API. > > > > This commit is in preparation for polled expedited grace periods. > > > > Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ > > Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing > > Cc: Brian Foster <bfoster@redhat.com> > > Cc: Dave Chinner <david@fromorbit.com> > > Cc: Al Viro <viro@zeniv.linux.org.uk> > > Cc: Ian Kent <raven@themaw.net> > > Signed-off-by: Paul E. McKenney <paulmck@kernel.org> > > --- > > kernel/rcu/tree.c | 90 +++++++++++++++++++++++++++++++++++++++++++++-- > > kernel/rcu/tree.h | 2 ++ > > 2 files changed, 89 insertions(+), 3 deletions(-) > > > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c > > index 46cfceea87847..637e8f9454573 100644 > > --- a/kernel/rcu/tree.c > > +++ b/kernel/rcu/tree.c > > @@ -1775,6 +1775,78 @@ static void rcu_strict_gp_boundary(void *unused) > > invoke_rcu_core(); > > } > > > > +// Has rcu_init() been invoked? This is used (for example) to determine > > +// whether spinlocks may be acquired safely. > > +static bool rcu_init_invoked(void) > > +{ > > + return !!rcu_state.n_online_cpus; > > +} > > + > > +// Make the polled API aware of the beginning of a grace period. > > +static void rcu_poll_gp_seq_start(unsigned long *snap) > > +{ > > + struct rcu_node *rnp = rcu_get_root(); > > + > > + if (rcu_init_invoked()) > > + raw_lockdep_assert_held_rcu_node(rnp); > > + > > + // If RCU was idle, note beginning of GP. > > + if (!rcu_seq_state(rcu_state.gp_seq_polled)) > > + rcu_seq_start(&rcu_state.gp_seq_polled); > > + > > + // Either way, record current state. > > + *snap = rcu_state.gp_seq_polled; > > +} > > + > > +// Make the polled API aware of the end of a grace period. > > +static void rcu_poll_gp_seq_end(unsigned long *snap) > > +{ > > + struct rcu_node *rnp = rcu_get_root(); > > + > > + if (rcu_init_invoked()) > > + raw_lockdep_assert_held_rcu_node(rnp); > > + > > + // If the the previously noted GP is still in effect, record the > > + // end of that GP. Either way, zero counter to avoid counter-wrap > > + // problems. > > + if (*snap && *snap == rcu_state.gp_seq_polled) { > > + rcu_seq_end(&rcu_state.gp_seq_polled); > > + rcu_state.gp_seq_polled_snap = 0; > > + } else { > > + *snap = 0; > > + } > > +} > > + > > +// Make the polled API aware of the beginning of a grace period, but > > +// where caller does not hold the root rcu_node structure's lock. > > +static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap) > > +{ > > + struct rcu_node *rnp = rcu_get_root(); > > + > > + if (rcu_init_invoked()) { > > + lockdep_assert_irqs_enabled(); > > + raw_spin_lock_irq_rcu_node(rnp); > > + } > > + rcu_poll_gp_seq_start(snap); > > + if (rcu_init_invoked()) > > + raw_spin_unlock_irq_rcu_node(rnp); > > +} > > + > > +// Make the polled API aware of the end of a grace period, but where > > +// caller does not hold the root rcu_node structure's lock. 
> > +static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap) > > +{ > > + struct rcu_node *rnp = rcu_get_root(); > > + > > + if (rcu_init_invoked()) { > > + lockdep_assert_irqs_enabled(); > > + raw_spin_lock_irq_rcu_node(rnp); > > + } > > + rcu_poll_gp_seq_end(snap); > > + if (rcu_init_invoked()) > > + raw_spin_unlock_irq_rcu_node(rnp); > > +} > > + > > /* > > * Initialize a new grace period. Return false if no grace period required. > > */ > > @@ -1810,6 +1882,7 @@ static noinline_for_stack bool rcu_gp_init(void) > > rcu_seq_start(&rcu_state.gp_seq); > > ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); > > trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); > > + rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap); > > raw_spin_unlock_irq_rcu_node(rnp); > > > > /* > > @@ -2069,6 +2142,7 @@ static noinline void rcu_gp_cleanup(void) > > * safe for us to drop the lock in order to mark the grace > > * period as completed in all of the rcu_node structures. > > */ > > + rcu_poll_gp_seq_end(&rcu_state.gp_seq_polled_snap); > > raw_spin_unlock_irq_rcu_node(rnp); > > > > /* > > @@ -3837,8 +3911,18 @@ void synchronize_rcu(void) > > lock_is_held(&rcu_lock_map) || > > lock_is_held(&rcu_sched_lock_map), > > "Illegal synchronize_rcu() in RCU read-side critical section"); > > - if (rcu_blocking_is_gp()) > > + if (rcu_blocking_is_gp()) { > > + // Note well that this code runs with !PREEMPT && !SMP. > > + // In addition, all code that advances grace periods runs > > + // at process level. Therefore, this GP overlaps with other > > + // GPs only by being fully nested within them, which allows > > + // reuse of ->gp_seq_polled_snap. > > + rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap); > > + rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap); > > + if (rcu_init_invoked()) > > + cond_resched_tasks_rcu_qs(); > > return; // Context allows vacuous grace periods. > > + } > > if (rcu_gp_is_expedited()) > > synchronize_rcu_expedited(); > > else > > @@ -3860,7 +3944,7 @@ unsigned long get_state_synchronize_rcu(void) > > * before the load from ->gp_seq. > > */ > > smp_mb(); /* ^^^ */ > > - return rcu_seq_snap(&rcu_state.gp_seq); > > + return rcu_seq_snap(&rcu_state.gp_seq_polled); > > I happened to run into this. There is one usage of > get_state_synchronize_rcu() in start_poll_synchronize_rcu(), in which > the return value of get_state_synchronize_rcu() ("gp_seq") will be used > for rcu_start_this_gp(). I don't think this is quite right, because > after this change, rcu_state.gp_seq and rcu_state.gp_seq_polled are > different values, in fact ->gp_seq_polled is greater than ->gp_seq > by how many synchronize_rcu() is called in early boot. > > Am I missing something here? It does not appear that your are missing anything, sad to say! Does the following make it work better? Thanx, Paul ------------------------------------------------------------------------ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2122359f0c862..cf2fd58a93a41 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3571,7 +3571,7 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); unsigned long start_poll_synchronize_rcu(void) { unsigned long flags; - unsigned long gp_seq = get_state_synchronize_rcu(); + unsigned long gp_seq = rcu_seq_snap(&rcu_state.gp_seq); bool needwake; struct rcu_data *rdp; struct rcu_node *rnp;
On Wed, Jul 20, 2022 at 06:04:55PM -0700, Paul E. McKenney wrote: [...] > > > @@ -3860,7 +3944,7 @@ unsigned long get_state_synchronize_rcu(void) > > > * before the load from ->gp_seq. > > > */ > > > smp_mb(); /* ^^^ */ > > > - return rcu_seq_snap(&rcu_state.gp_seq); > > > + return rcu_seq_snap(&rcu_state.gp_seq_polled); > > > > I happened to run into this. There is one usage of > > get_state_synchronize_rcu() in start_poll_synchronize_rcu(), in which > > the return value of get_state_synchronize_rcu() ("gp_seq") will be used > > for rcu_start_this_gp(). I don't think this is quite right, because > > after this change, rcu_state.gp_seq and rcu_state.gp_seq_polled are > > different values, in fact ->gp_seq_polled is greater than ->gp_seq > > by how many synchronize_rcu() is called in early boot. > > > > Am I missing something here? > > It does not appear that your are missing anything, sad to say! > > Does the following make it work better? > > Thanx, Paul > > ------------------------------------------------------------------------ > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c > index 2122359f0c862..cf2fd58a93a41 100644 > --- a/kernel/rcu/tree.c > +++ b/kernel/rcu/tree.c > @@ -3571,7 +3571,7 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); > unsigned long start_poll_synchronize_rcu(void) > { > unsigned long flags; > - unsigned long gp_seq = get_state_synchronize_rcu(); > + unsigned long gp_seq = rcu_seq_snap(&rcu_state.gp_seq); get_state_synchronize_rcu() is still needed, because we need to return a cookie for polling for this function. Something like below maybe? Hope I didn't mess up the ordering ;-) Regards, Boqun --------------- diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 84d281776688..0f9134871289 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3571,11 +3583,39 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); unsigned long start_poll_synchronize_rcu(void) { unsigned long flags; - unsigned long gp_seq = get_state_synchronize_rcu(); + unsigned long gp_seq_poll = get_state_synchronize_rcu(); + unsigned long gp_seq; bool needwake; struct rcu_data *rdp; struct rcu_node *rnp; + /* + * Need to start a gp if no gp has been started yet. + * + * Note that we need to snapshot gp_seq after gp_seq_poll, otherwise + * consider the follow case: + * + * <no gp in progress> // gp# is 0 + * snapshot gp_seq // gp #2 will be set as needed + * <a gp passed> + * // gp# is 1 + * snapshot gp_seq_poll // polling gets ready until gp #3 + * + * then the following rcu_start_this_gp() won't mark gp #3 as needed, + * and polling won't become ready if others don't start a gp. + * + * And the following case is fine: + * + * <no gp in progress> // gp# is 0 + * snapshot gp_seq_poll // polling gets ready until gp #2 + * <a gp passed> + * // gp# is 1 + * snapshot gp_seq // gp #3 will be set as needed + * + * Also note, we rely on the smp_mb() in get_state_synchronize_rcu() + * to order the two snapshots. + */ + gp_seq = rcu_seq_snap(&rcu_state.gp_seq); lockdep_assert_irqs_enabled(); local_irq_save(flags); rdp = this_cpu_ptr(&rcu_data); @@ -3585,7 +3625,7 @@ unsigned long start_poll_synchronize_rcu(void) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(); - return gp_seq; + return gp_seq_poll; } EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
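The snapshot-ordering argument in the comment above can be checked with a small userspace model of the two counters. This is a sketch only: seq_snap() and seq_done() are simplified stand-ins for rcu_seq_snap() and rcu_seq_done_exact(), and wraparound is ignored. It plays out the "bad" order: the grace period named by the early ->gp_seq snapshot has already ended by the time rcu_start_this_gp() would see it, yet the cookie handed back to the caller still needs a further grace period that nothing has requested.

#include <stdbool.h>
#include <stdio.h>

#define MASK 3UL	/* two low-order state bits, as in the kernel */

static unsigned long seq_snap(unsigned long s)	/* ~ rcu_seq_snap() */
{
	return (s + 2 * MASK + 1) & ~MASK;
}

static bool seq_done(unsigned long s, unsigned long cookie)	/* ~ rcu_seq_done_exact(), no wrap handling */
{
	return s >= cookie;
}

/* After rcu_init(), one normal grace period advances both counters:
 * rcu_gp_init() starts them, rcu_gp_cleanup() ends them. */
static void one_gp(unsigned long *gp_seq, unsigned long *gp_seq_polled)
{
	*gp_seq += 1;
	*gp_seq_polled += 1;
	*gp_seq = (*gp_seq | MASK) + 1;
	*gp_seq_polled = (*gp_seq_polled | MASK) + 1;
}

int main(void)
{
	unsigned long gp_seq = 0, gp_seq_polled = 0;
	unsigned long gp_to_start, cookie;

	/* Bad order: name the GP to start before taking the polled cookie. */
	gp_to_start = seq_snap(gp_seq);		/* satisfied once gp_seq reaches 4 */
	one_gp(&gp_seq, &gp_seq_polled);	/* ...and exactly that GP elapses here */
	cookie = seq_snap(gp_seq_polled);	/* needs gp_seq_polled to reach 8 */

	/* rcu_start_this_gp(..., gp_to_start) would now be a no-op... */
	printf("requested GP already done: %d\n", seq_done(gp_seq, gp_to_start));	/* 1 */
	/* ...yet the cookie returned to the caller is not ready, and no further
	 * GP has been requested on its behalf. */
	printf("cookie ready: %d\n", seq_done(gp_seq_polled, cookie));			/* 0 */
	return 0;
}

With the order the patch settles on (polled cookie first, ->gp_seq snapshot second), the grace period handed to rcu_start_this_gp() is always one whose completion also pushes ->gp_seq_polled past the cookie.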
On Wed, Jul 20, 2022 at 06:04:55PM -0700, Paul E. McKenney wrote: > On Wed, Jul 20, 2022 at 05:53:38PM -0700, Boqun Feng wrote: > > Hi Paul, > > > > On Mon, Jun 20, 2022 at 03:51:20PM -0700, Paul E. McKenney wrote: > > > This commit switches the existing polled grace-period APIs to use a > > > new ->gp_seq_polled counter in the rcu_state structure. An additional > > > ->gp_seq_polled_snap counter in that same structure allows the normal > > > grace period kthread to interact properly with the !SMP !PREEMPT fastpath > > > through synchronize_rcu(). The first of the two to note the end of a > > > given grace period will make knowledge of this transition available to > > > the polled API. > > > > > > This commit is in preparation for polled expedited grace periods. > > > > > > Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ > > > Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing > > > Cc: Brian Foster <bfoster@redhat.com> > > > Cc: Dave Chinner <david@fromorbit.com> > > > Cc: Al Viro <viro@zeniv.linux.org.uk> > > > Cc: Ian Kent <raven@themaw.net> > > > Signed-off-by: Paul E. McKenney <paulmck@kernel.org> > > > --- > > > kernel/rcu/tree.c | 90 +++++++++++++++++++++++++++++++++++++++++++++-- > > > kernel/rcu/tree.h | 2 ++ > > > 2 files changed, 89 insertions(+), 3 deletions(-) > > > > > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c > > > index 46cfceea87847..637e8f9454573 100644 > > > --- a/kernel/rcu/tree.c > > > +++ b/kernel/rcu/tree.c > > > @@ -1775,6 +1775,78 @@ static void rcu_strict_gp_boundary(void *unused) > > > invoke_rcu_core(); > > > } > > > > > > +// Has rcu_init() been invoked? This is used (for example) to determine > > > +// whether spinlocks may be acquired safely. > > > +static bool rcu_init_invoked(void) > > > +{ > > > + return !!rcu_state.n_online_cpus; > > > +} > > > + > > > +// Make the polled API aware of the beginning of a grace period. > > > +static void rcu_poll_gp_seq_start(unsigned long *snap) > > > +{ > > > + struct rcu_node *rnp = rcu_get_root(); > > > + > > > + if (rcu_init_invoked()) > > > + raw_lockdep_assert_held_rcu_node(rnp); > > > + > > > + // If RCU was idle, note beginning of GP. > > > + if (!rcu_seq_state(rcu_state.gp_seq_polled)) > > > + rcu_seq_start(&rcu_state.gp_seq_polled); > > > + > > > + // Either way, record current state. > > > + *snap = rcu_state.gp_seq_polled; > > > +} > > > + > > > +// Make the polled API aware of the end of a grace period. > > > +static void rcu_poll_gp_seq_end(unsigned long *snap) > > > +{ > > > + struct rcu_node *rnp = rcu_get_root(); > > > + > > > + if (rcu_init_invoked()) > > > + raw_lockdep_assert_held_rcu_node(rnp); > > > + > > > + // If the the previously noted GP is still in effect, record the > > > + // end of that GP. Either way, zero counter to avoid counter-wrap > > > + // problems. > > > + if (*snap && *snap == rcu_state.gp_seq_polled) { > > > + rcu_seq_end(&rcu_state.gp_seq_polled); > > > + rcu_state.gp_seq_polled_snap = 0; > > > + } else { > > > + *snap = 0; > > > + } > > > +} > > > + > > > +// Make the polled API aware of the beginning of a grace period, but > > > +// where caller does not hold the root rcu_node structure's lock. 
> > > +static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap) > > > +{ > > > + struct rcu_node *rnp = rcu_get_root(); > > > + > > > + if (rcu_init_invoked()) { > > > + lockdep_assert_irqs_enabled(); > > > + raw_spin_lock_irq_rcu_node(rnp); > > > + } > > > + rcu_poll_gp_seq_start(snap); > > > + if (rcu_init_invoked()) > > > + raw_spin_unlock_irq_rcu_node(rnp); > > > +} > > > + > > > +// Make the polled API aware of the end of a grace period, but where > > > +// caller does not hold the root rcu_node structure's lock. > > > +static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap) > > > +{ > > > + struct rcu_node *rnp = rcu_get_root(); > > > + > > > + if (rcu_init_invoked()) { > > > + lockdep_assert_irqs_enabled(); > > > + raw_spin_lock_irq_rcu_node(rnp); > > > + } > > > + rcu_poll_gp_seq_end(snap); > > > + if (rcu_init_invoked()) > > > + raw_spin_unlock_irq_rcu_node(rnp); > > > +} > > > + > > > /* > > > * Initialize a new grace period. Return false if no grace period required. > > > */ > > > @@ -1810,6 +1882,7 @@ static noinline_for_stack bool rcu_gp_init(void) > > > rcu_seq_start(&rcu_state.gp_seq); > > > ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); > > > trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); > > > + rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap); > > > raw_spin_unlock_irq_rcu_node(rnp); > > > > > > /* > > > @@ -2069,6 +2142,7 @@ static noinline void rcu_gp_cleanup(void) > > > * safe for us to drop the lock in order to mark the grace > > > * period as completed in all of the rcu_node structures. > > > */ > > > + rcu_poll_gp_seq_end(&rcu_state.gp_seq_polled_snap); > > > raw_spin_unlock_irq_rcu_node(rnp); > > > > > > /* > > > @@ -3837,8 +3911,18 @@ void synchronize_rcu(void) > > > lock_is_held(&rcu_lock_map) || > > > lock_is_held(&rcu_sched_lock_map), > > > "Illegal synchronize_rcu() in RCU read-side critical section"); > > > - if (rcu_blocking_is_gp()) > > > + if (rcu_blocking_is_gp()) { > > > + // Note well that this code runs with !PREEMPT && !SMP. > > > + // In addition, all code that advances grace periods runs > > > + // at process level. Therefore, this GP overlaps with other > > > + // GPs only by being fully nested within them, which allows > > > + // reuse of ->gp_seq_polled_snap. > > > + rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap); > > > + rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap); > > > + if (rcu_init_invoked()) > > > + cond_resched_tasks_rcu_qs(); > > > return; // Context allows vacuous grace periods. > > > + } > > > if (rcu_gp_is_expedited()) > > > synchronize_rcu_expedited(); > > > else > > > @@ -3860,7 +3944,7 @@ unsigned long get_state_synchronize_rcu(void) > > > * before the load from ->gp_seq. > > > */ > > > smp_mb(); /* ^^^ */ > > > - return rcu_seq_snap(&rcu_state.gp_seq); > > > + return rcu_seq_snap(&rcu_state.gp_seq_polled); > > > > I happened to run into this. There is one usage of > > get_state_synchronize_rcu() in start_poll_synchronize_rcu(), in which > > the return value of get_state_synchronize_rcu() ("gp_seq") will be used > > for rcu_start_this_gp(). I don't think this is quite right, because > > after this change, rcu_state.gp_seq and rcu_state.gp_seq_polled are > > different values, in fact ->gp_seq_polled is greater than ->gp_seq > > by how many synchronize_rcu() is called in early boot. > > > > Am I missing something here? > > It does not appear that your are missing anything, sad to say! > > Does the following make it work better? 
Well, rcutorture doesn't like this change much.  ;-)

No surprise, given that it is only the value feeding into
rcu_start_this_gp() that needs to change, not the value returned
from start_poll_synchronize_rcu().

Take 2, still untested.

							Thanx, Paul

------------------------------------------------------------------------

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 2122359f0c862..061c1f6737ddc 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3581,7 +3581,7 @@ unsigned long start_poll_synchronize_rcu(void)
 	rdp = this_cpu_ptr(&rcu_data);
 	rnp = rdp->mynode;
 	raw_spin_lock_rcu_node(rnp); // irqs already disabled.
-	needwake = rcu_start_this_gp(rnp, rdp, gp_seq);
+	needwake = rcu_start_this_gp(rnp, rdp, rcu_seq_snap(&rcu_state.gp_seq));
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	if (needwake)
 		rcu_gp_kthread_wake();
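Putting the take-2 hunk together with the surrounding lines quoted earlier in the thread, start_poll_synchronize_rcu() would end up looking roughly as follows (a sketch assembled from the hunks in this thread, not a verbatim copy of the tree):

unsigned long start_poll_synchronize_rcu(void)
{
	unsigned long flags;
	unsigned long gp_seq = get_state_synchronize_rcu();	/* cookie, in ->gp_seq_polled units */
	bool needwake;
	struct rcu_data *rdp;
	struct rcu_node *rnp;

	lockdep_assert_irqs_enabled();
	local_irq_save(flags);
	rdp = this_cpu_ptr(&rcu_data);
	rnp = rdp->mynode;
	raw_spin_lock_rcu_node(rnp); // irqs already disabled.
	/* The GP to request is named in ->gp_seq units and snapshotted *after*
	 * the cookie above; the smp_mb() in get_state_synchronize_rcu()
	 * orders the two snapshots. */
	needwake = rcu_start_this_gp(rnp, rdp, rcu_seq_snap(&rcu_state.gp_seq));
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
	if (needwake)
		rcu_gp_kthread_wake();
	return gp_seq;	/* still the polled-API cookie */
}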
On Wed, Jul 20, 2022 at 06:51:45PM -0700, Boqun Feng wrote: > On Wed, Jul 20, 2022 at 06:04:55PM -0700, Paul E. McKenney wrote: > [...] > > > > @@ -3860,7 +3944,7 @@ unsigned long get_state_synchronize_rcu(void) > > > > * before the load from ->gp_seq. > > > > */ > > > > smp_mb(); /* ^^^ */ > > > > - return rcu_seq_snap(&rcu_state.gp_seq); > > > > + return rcu_seq_snap(&rcu_state.gp_seq_polled); > > > > > > I happened to run into this. There is one usage of > > > get_state_synchronize_rcu() in start_poll_synchronize_rcu(), in which > > > the return value of get_state_synchronize_rcu() ("gp_seq") will be used > > > for rcu_start_this_gp(). I don't think this is quite right, because > > > after this change, rcu_state.gp_seq and rcu_state.gp_seq_polled are > > > different values, in fact ->gp_seq_polled is greater than ->gp_seq > > > by how many synchronize_rcu() is called in early boot. > > > > > > Am I missing something here? > > > > It does not appear that your are missing anything, sad to say! > > > > Does the following make it work better? > > > > Thanx, Paul > > > > ------------------------------------------------------------------------ > > > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c > > index 2122359f0c862..cf2fd58a93a41 100644 > > --- a/kernel/rcu/tree.c > > +++ b/kernel/rcu/tree.c > > @@ -3571,7 +3571,7 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); > > unsigned long start_poll_synchronize_rcu(void) > > { > > unsigned long flags; > > - unsigned long gp_seq = get_state_synchronize_rcu(); > > + unsigned long gp_seq = rcu_seq_snap(&rcu_state.gp_seq); > > get_state_synchronize_rcu() is still needed, because we need to return > a cookie for polling for this function. Something like below maybe? Hope > I didn't mess up the ordering ;-) My thought is to combine your comment with my functionally equivalent code that avoids the extra variable. If that works for you (and if it works, for that matter), does Co-developed-by work for you? Thanx, Paul > Regards, > Boqun > > --------------- > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c > index 84d281776688..0f9134871289 100644 > --- a/kernel/rcu/tree.c > +++ b/kernel/rcu/tree.c > @@ -3571,11 +3583,39 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); > unsigned long start_poll_synchronize_rcu(void) > { > unsigned long flags; > - unsigned long gp_seq = get_state_synchronize_rcu(); > + unsigned long gp_seq_poll = get_state_synchronize_rcu(); > + unsigned long gp_seq; > bool needwake; > struct rcu_data *rdp; > struct rcu_node *rnp; > > + /* > + * Need to start a gp if no gp has been started yet. > + * > + * Note that we need to snapshot gp_seq after gp_seq_poll, otherwise > + * consider the follow case: > + * > + * <no gp in progress> // gp# is 0 > + * snapshot gp_seq // gp #2 will be set as needed > + * <a gp passed> > + * // gp# is 1 > + * snapshot gp_seq_poll // polling gets ready until gp #3 > + * > + * then the following rcu_start_this_gp() won't mark gp #3 as needed, > + * and polling won't become ready if others don't start a gp. > + * > + * And the following case is fine: > + * > + * <no gp in progress> // gp# is 0 > + * snapshot gp_seq_poll // polling gets ready until gp #2 > + * <a gp passed> > + * // gp# is 1 > + * snapshot gp_seq // gp #3 will be set as needed > + * > + * Also note, we rely on the smp_mb() in get_state_synchronize_rcu() > + * to order the two snapshots. 
> + */ > + gp_seq = rcu_seq_snap(&rcu_state.gp_seq); > lockdep_assert_irqs_enabled(); > local_irq_save(flags); > rdp = this_cpu_ptr(&rcu_data); > @@ -3585,7 +3625,7 @@ unsigned long start_poll_synchronize_rcu(void) > raw_spin_unlock_irqrestore_rcu_node(rnp, flags); > if (needwake) > rcu_gp_kthread_wake(); > - return gp_seq; > + return gp_seq_poll; > } > EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
On Wed, Jul 20, 2022 at 09:47:08PM -0700, Paul E. McKenney wrote: > On Wed, Jul 20, 2022 at 06:51:45PM -0700, Boqun Feng wrote: > > On Wed, Jul 20, 2022 at 06:04:55PM -0700, Paul E. McKenney wrote: > > [...] > > > > > @@ -3860,7 +3944,7 @@ unsigned long get_state_synchronize_rcu(void) > > > > > * before the load from ->gp_seq. > > > > > */ > > > > > smp_mb(); /* ^^^ */ > > > > > - return rcu_seq_snap(&rcu_state.gp_seq); > > > > > + return rcu_seq_snap(&rcu_state.gp_seq_polled); > > > > > > > > I happened to run into this. There is one usage of > > > > get_state_synchronize_rcu() in start_poll_synchronize_rcu(), in which > > > > the return value of get_state_synchronize_rcu() ("gp_seq") will be used > > > > for rcu_start_this_gp(). I don't think this is quite right, because > > > > after this change, rcu_state.gp_seq and rcu_state.gp_seq_polled are > > > > different values, in fact ->gp_seq_polled is greater than ->gp_seq > > > > by how many synchronize_rcu() is called in early boot. > > > > > > > > Am I missing something here? > > > > > > It does not appear that your are missing anything, sad to say! > > > > > > Does the following make it work better? > > > > > > Thanx, Paul > > > > > > ------------------------------------------------------------------------ > > > > > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c > > > index 2122359f0c862..cf2fd58a93a41 100644 > > > --- a/kernel/rcu/tree.c > > > +++ b/kernel/rcu/tree.c > > > @@ -3571,7 +3571,7 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); > > > unsigned long start_poll_synchronize_rcu(void) > > > { > > > unsigned long flags; > > > - unsigned long gp_seq = get_state_synchronize_rcu(); > > > + unsigned long gp_seq = rcu_seq_snap(&rcu_state.gp_seq); > > > > get_state_synchronize_rcu() is still needed, because we need to return > > a cookie for polling for this function. Something like below maybe? Hope > > I didn't mess up the ordering ;-) > > My thought is to combine your comment with my functionally equivalent > code that avoids the extra variable. If that works for you (and if it > works, for that matter), does Co-developed-by work for you? > Sure! Thanks ;-) Regards, Boqun > Thanx, Paul > > > Regards, > > Boqun > > > > --------------- > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c > > index 84d281776688..0f9134871289 100644 > > --- a/kernel/rcu/tree.c > > +++ b/kernel/rcu/tree.c > > @@ -3571,11 +3583,39 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); > > unsigned long start_poll_synchronize_rcu(void) > > { > > unsigned long flags; > > - unsigned long gp_seq = get_state_synchronize_rcu(); > > + unsigned long gp_seq_poll = get_state_synchronize_rcu(); > > + unsigned long gp_seq; > > bool needwake; > > struct rcu_data *rdp; > > struct rcu_node *rnp; > > > > + /* > > + * Need to start a gp if no gp has been started yet. > > + * > > + * Note that we need to snapshot gp_seq after gp_seq_poll, otherwise > > + * consider the follow case: > > + * > > + * <no gp in progress> // gp# is 0 > > + * snapshot gp_seq // gp #2 will be set as needed > > + * <a gp passed> > > + * // gp# is 1 > > + * snapshot gp_seq_poll // polling gets ready until gp #3 > > + * > > + * then the following rcu_start_this_gp() won't mark gp #3 as needed, > > + * and polling won't become ready if others don't start a gp. 
> > + * > > + * And the following case is fine: > > + * > > + * <no gp in progress> // gp# is 0 > > + * snapshot gp_seq_poll // polling gets ready until gp #2 > > + * <a gp passed> > > + * // gp# is 1 > > + * snapshot gp_seq // gp #3 will be set as needed > > + * > > + * Also note, we rely on the smp_mb() in get_state_synchronize_rcu() > > + * to order the two snapshots. > > + */ > > + gp_seq = rcu_seq_snap(&rcu_state.gp_seq); > > lockdep_assert_irqs_enabled(); > > local_irq_save(flags); > > rdp = this_cpu_ptr(&rcu_data); > > @@ -3585,7 +3625,7 @@ unsigned long start_poll_synchronize_rcu(void) > > raw_spin_unlock_irqrestore_rcu_node(rnp, flags); > > if (needwake) > > rcu_gp_kthread_wake(); > > - return gp_seq; > > + return gp_seq_poll; > > } > > EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
On Wed, Jul 20, 2022 at 10:49:26PM -0700, Boqun Feng wrote: > On Wed, Jul 20, 2022 at 09:47:08PM -0700, Paul E. McKenney wrote: > > On Wed, Jul 20, 2022 at 06:51:45PM -0700, Boqun Feng wrote: > > > On Wed, Jul 20, 2022 at 06:04:55PM -0700, Paul E. McKenney wrote: > > > [...] > > > > > > @@ -3860,7 +3944,7 @@ unsigned long get_state_synchronize_rcu(void) > > > > > > * before the load from ->gp_seq. > > > > > > */ > > > > > > smp_mb(); /* ^^^ */ > > > > > > - return rcu_seq_snap(&rcu_state.gp_seq); > > > > > > + return rcu_seq_snap(&rcu_state.gp_seq_polled); > > > > > > > > > > I happened to run into this. There is one usage of > > > > > get_state_synchronize_rcu() in start_poll_synchronize_rcu(), in which > > > > > the return value of get_state_synchronize_rcu() ("gp_seq") will be used > > > > > for rcu_start_this_gp(). I don't think this is quite right, because > > > > > after this change, rcu_state.gp_seq and rcu_state.gp_seq_polled are > > > > > different values, in fact ->gp_seq_polled is greater than ->gp_seq > > > > > by how many synchronize_rcu() is called in early boot. > > > > > > > > > > Am I missing something here? > > > > > > > > It does not appear that your are missing anything, sad to say! > > > > > > > > Does the following make it work better? > > > > > > > > Thanx, Paul > > > > > > > > ------------------------------------------------------------------------ > > > > > > > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c > > > > index 2122359f0c862..cf2fd58a93a41 100644 > > > > --- a/kernel/rcu/tree.c > > > > +++ b/kernel/rcu/tree.c > > > > @@ -3571,7 +3571,7 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); > > > > unsigned long start_poll_synchronize_rcu(void) > > > > { > > > > unsigned long flags; > > > > - unsigned long gp_seq = get_state_synchronize_rcu(); > > > > + unsigned long gp_seq = rcu_seq_snap(&rcu_state.gp_seq); > > > > > > get_state_synchronize_rcu() is still needed, because we need to return > > > a cookie for polling for this function. Something like below maybe? Hope > > > I didn't mess up the ordering ;-) > > > > My thought is to combine your comment with my functionally equivalent > > code that avoids the extra variable. If that works for you (and if it > > works, for that matter), does Co-developed-by work for you? > > Sure! Thanks ;-) I did some summarization on the comment, folded it into the original, and ended up with the patch shown below. Thoughts? Thanx, Paul ------------------------------------------------------------------------ commit bf95b2bc3e42f11f4d7a5e8a98376c2b4a2aa82f Author: Paul E. McKenney <paulmck@kernel.org> Date: Wed Apr 13 17:46:15 2022 -0700 rcu: Switch polled grace-period APIs to ->gp_seq_polled This commit switches the existing polled grace-period APIs to use a new ->gp_seq_polled counter in the rcu_state structure. An additional ->gp_seq_polled_snap counter in that same structure allows the normal grace period kthread to interact properly with the !SMP !PREEMPT fastpath through synchronize_rcu(). The first of the two to note the end of a given grace period will make knowledge of this transition available to the polled API. This commit is in preparation for polled expedited grace periods. [ paulmck: Fix use of rcu_state.gp_seq_polled to start normal grace period. 
] Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing Cc: Brian Foster <bfoster@redhat.com> Cc: Dave Chinner <david@fromorbit.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Ian Kent <raven@themaw.net> Co-developed-by: Boqun Feng <boqun.feng@gmail.com> Signed-off-by: Boqun Feng <boqun.feng@gmail.com> Signed-off-by: Paul E. McKenney <paulmck@kernel.org> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 46cfceea87847..b40a5a19ddd2a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1775,6 +1775,78 @@ static void rcu_strict_gp_boundary(void *unused) invoke_rcu_core(); } +// Has rcu_init() been invoked? This is used (for example) to determine +// whether spinlocks may be acquired safely. +static bool rcu_init_invoked(void) +{ + return !!rcu_state.n_online_cpus; +} + +// Make the polled API aware of the beginning of a grace period. +static void rcu_poll_gp_seq_start(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) + raw_lockdep_assert_held_rcu_node(rnp); + + // If RCU was idle, note beginning of GP. + if (!rcu_seq_state(rcu_state.gp_seq_polled)) + rcu_seq_start(&rcu_state.gp_seq_polled); + + // Either way, record current state. + *snap = rcu_state.gp_seq_polled; +} + +// Make the polled API aware of the end of a grace period. +static void rcu_poll_gp_seq_end(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) + raw_lockdep_assert_held_rcu_node(rnp); + + // If the previously noted GP is still in effect, record the + // end of that GP. Either way, zero counter to avoid counter-wrap + // problems. + if (*snap && *snap == rcu_state.gp_seq_polled) { + rcu_seq_end(&rcu_state.gp_seq_polled); + rcu_state.gp_seq_polled_snap = 0; + } else { + *snap = 0; + } +} + +// Make the polled API aware of the beginning of a grace period, but +// where caller does not hold the root rcu_node structure's lock. +static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) { + lockdep_assert_irqs_enabled(); + raw_spin_lock_irq_rcu_node(rnp); + } + rcu_poll_gp_seq_start(snap); + if (rcu_init_invoked()) + raw_spin_unlock_irq_rcu_node(rnp); +} + +// Make the polled API aware of the end of a grace period, but where +// caller does not hold the root rcu_node structure's lock. +static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) { + lockdep_assert_irqs_enabled(); + raw_spin_lock_irq_rcu_node(rnp); + } + rcu_poll_gp_seq_end(snap); + if (rcu_init_invoked()) + raw_spin_unlock_irq_rcu_node(rnp); +} + /* * Initialize a new grace period. Return false if no grace period required. */ @@ -1810,6 +1882,7 @@ static noinline_for_stack bool rcu_gp_init(void) rcu_seq_start(&rcu_state.gp_seq); ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); + rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap); raw_spin_unlock_irq_rcu_node(rnp); /* @@ -2069,6 +2142,7 @@ static noinline void rcu_gp_cleanup(void) * safe for us to drop the lock in order to mark the grace * period as completed in all of the rcu_node structures. 
*/ + rcu_poll_gp_seq_end(&rcu_state.gp_seq_polled_snap); raw_spin_unlock_irq_rcu_node(rnp); /* @@ -3837,8 +3911,18 @@ void synchronize_rcu(void) lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu() in RCU read-side critical section"); - if (rcu_blocking_is_gp()) + if (rcu_blocking_is_gp()) { + // Note well that this code runs with !PREEMPT && !SMP. + // In addition, all code that advances grace periods runs + // at process level. Therefore, this GP overlaps with other + // GPs only by being fully nested within them, which allows + // reuse of ->gp_seq_polled_snap. + rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap); + rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap); + if (rcu_init_invoked()) + cond_resched_tasks_rcu_qs(); return; // Context allows vacuous grace periods. + } if (rcu_gp_is_expedited()) synchronize_rcu_expedited(); else @@ -3860,7 +3944,7 @@ unsigned long get_state_synchronize_rcu(void) * before the load from ->gp_seq. */ smp_mb(); /* ^^^ */ - return rcu_seq_snap(&rcu_state.gp_seq); + return rcu_seq_snap(&rcu_state.gp_seq_polled); } EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); @@ -3889,7 +3973,13 @@ unsigned long start_poll_synchronize_rcu(void) rdp = this_cpu_ptr(&rcu_data); rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); // irqs already disabled. - needwake = rcu_start_this_gp(rnp, rdp, gp_seq); + // Note it is possible for a grace period to have elapsed between + // the above call to get_state_synchronize_rcu() and the below call + // to rcu_seq_snap. This is OK, the worst that happens is that we + // get a grace period that no one needed. These accesses are ordered + // by smp_mb(), and we are accessing them in the opposite order + // from which they are updated at grace-period start, as required. + needwake = rcu_start_this_gp(rnp, rdp, rcu_seq_snap(&rcu_state.gp_seq)); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(); @@ -3925,7 +4015,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); bool poll_state_synchronize_rcu(unsigned long oldstate) { if (oldstate == RCU_GET_STATE_COMPLETED || - rcu_seq_done_exact(&rcu_state.gp_seq, oldstate)) { + rcu_seq_done_exact(&rcu_state.gp_seq_polled, oldstate)) { smp_mb(); /* Ensure GP ends before subsequent accesses. */ return true; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 2ccf5845957df..9c853033f159d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -323,6 +323,8 @@ struct rcu_state { short gp_state; /* GP kthread sleep state. */ unsigned long gp_wake_time; /* Last GP kthread wake. */ unsigned long gp_wake_seq; /* ->gp_seq at ^^^. */ + unsigned long gp_seq_polled; /* GP seq for polled API. */ + unsigned long gp_seq_polled_snap; /* ->gp_seq_polled at normal GP start. */ /* End of fields guarded by root rcu_node's lock. */
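For reference, the polled API that ->gp_seq_polled now backs is used by callers along these lines (a generic usage sketch, not code from this series; example_defer_free() is hypothetical, the RCU calls are the existing API):

#include <linux/rcupdate.h>
#include <linux/slab.h>

static void example_defer_free(void *old)
{
	unsigned long cookie = start_poll_synchronize_rcu();	/* snapshot and request a GP */

	/* ... other work, during which the grace period may complete ... */

	if (!poll_state_synchronize_rcu(cookie))
		synchronize_rcu();	/* fall back to a full wait (or use cond_synchronize_rcu(cookie)) */
	kfree(old);			/* no pre-existing reader can still hold a reference */
}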
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 46cfceea87847..637e8f9454573 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1775,6 +1775,78 @@ static void rcu_strict_gp_boundary(void *unused) invoke_rcu_core(); } +// Has rcu_init() been invoked? This is used (for example) to determine +// whether spinlocks may be acquired safely. +static bool rcu_init_invoked(void) +{ + return !!rcu_state.n_online_cpus; +} + +// Make the polled API aware of the beginning of a grace period. +static void rcu_poll_gp_seq_start(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) + raw_lockdep_assert_held_rcu_node(rnp); + + // If RCU was idle, note beginning of GP. + if (!rcu_seq_state(rcu_state.gp_seq_polled)) + rcu_seq_start(&rcu_state.gp_seq_polled); + + // Either way, record current state. + *snap = rcu_state.gp_seq_polled; +} + +// Make the polled API aware of the end of a grace period. +static void rcu_poll_gp_seq_end(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) + raw_lockdep_assert_held_rcu_node(rnp); + + // If the the previously noted GP is still in effect, record the + // end of that GP. Either way, zero counter to avoid counter-wrap + // problems. + if (*snap && *snap == rcu_state.gp_seq_polled) { + rcu_seq_end(&rcu_state.gp_seq_polled); + rcu_state.gp_seq_polled_snap = 0; + } else { + *snap = 0; + } +} + +// Make the polled API aware of the beginning of a grace period, but +// where caller does not hold the root rcu_node structure's lock. +static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) { + lockdep_assert_irqs_enabled(); + raw_spin_lock_irq_rcu_node(rnp); + } + rcu_poll_gp_seq_start(snap); + if (rcu_init_invoked()) + raw_spin_unlock_irq_rcu_node(rnp); +} + +// Make the polled API aware of the end of a grace period, but where +// caller does not hold the root rcu_node structure's lock. +static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) { + lockdep_assert_irqs_enabled(); + raw_spin_lock_irq_rcu_node(rnp); + } + rcu_poll_gp_seq_end(snap); + if (rcu_init_invoked()) + raw_spin_unlock_irq_rcu_node(rnp); +} + /* * Initialize a new grace period. Return false if no grace period required. */ @@ -1810,6 +1882,7 @@ static noinline_for_stack bool rcu_gp_init(void) rcu_seq_start(&rcu_state.gp_seq); ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); + rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap); raw_spin_unlock_irq_rcu_node(rnp); /* @@ -2069,6 +2142,7 @@ static noinline void rcu_gp_cleanup(void) * safe for us to drop the lock in order to mark the grace * period as completed in all of the rcu_node structures. */ + rcu_poll_gp_seq_end(&rcu_state.gp_seq_polled_snap); raw_spin_unlock_irq_rcu_node(rnp); /* @@ -3837,8 +3911,18 @@ void synchronize_rcu(void) lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu() in RCU read-side critical section"); - if (rcu_blocking_is_gp()) + if (rcu_blocking_is_gp()) { + // Note well that this code runs with !PREEMPT && !SMP. + // In addition, all code that advances grace periods runs + // at process level. Therefore, this GP overlaps with other + // GPs only by being fully nested within them, which allows + // reuse of ->gp_seq_polled_snap. 
+ rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap); + rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap); + if (rcu_init_invoked()) + cond_resched_tasks_rcu_qs(); return; // Context allows vacuous grace periods. + } if (rcu_gp_is_expedited()) synchronize_rcu_expedited(); else @@ -3860,7 +3944,7 @@ unsigned long get_state_synchronize_rcu(void) * before the load from ->gp_seq. */ smp_mb(); /* ^^^ */ - return rcu_seq_snap(&rcu_state.gp_seq); + return rcu_seq_snap(&rcu_state.gp_seq_polled); } EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); @@ -3925,7 +4009,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); bool poll_state_synchronize_rcu(unsigned long oldstate) { if (oldstate == RCU_GET_STATE_COMPLETED || - rcu_seq_done_exact(&rcu_state.gp_seq, oldstate)) { + rcu_seq_done_exact(&rcu_state.gp_seq_polled, oldstate)) { smp_mb(); /* Ensure GP ends before subsequent accesses. */ return true; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 2ccf5845957df..9c853033f159d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -323,6 +323,8 @@ struct rcu_state { short gp_state; /* GP kthread sleep state. */ unsigned long gp_wake_time; /* Last GP kthread wake. */ unsigned long gp_wake_seq; /* ->gp_seq at ^^^. */ + unsigned long gp_seq_polled; /* GP seq for polled API. */ + unsigned long gp_seq_polled_snap; /* ->gp_seq_polled at normal GP start. */ /* End of fields guarded by root rcu_node's lock. */
This commit switches the existing polled grace-period APIs to use a
new ->gp_seq_polled counter in the rcu_state structure.  An additional
->gp_seq_polled_snap counter in that same structure allows the normal
grace period kthread to interact properly with the !SMP !PREEMPT fastpath
through synchronize_rcu().  The first of the two to note the end of a
given grace period will make knowledge of this transition available to
the polled API.

This commit is in preparation for polled expedited grace periods.

Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/
Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing
Cc: Brian Foster <bfoster@redhat.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Ian Kent <raven@themaw.net>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 90 +++++++++++++++++++++++++++++++++++++++++++++--
 kernel/rcu/tree.h |  2 ++
 2 files changed, 89 insertions(+), 3 deletions(-)
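As a concrete illustration of the !SMP !PREEMPT fastpath interaction described above, a hypothetical boot-time caller (illustration only, assuming this patch is applied) would see a cookie become ready across an early-boot synchronize_rcu(), because the fastpath now records a vacuous grace period in ->gp_seq_polled:

#include <linux/init.h>
#include <linux/rcupdate.h>

static void __init example_early_boot_poll(void)
{
	unsigned long cookie = get_state_synchronize_rcu();

	synchronize_rcu();	/* early boot: returns immediately via the fastpath,
				 * but still advances ->gp_seq_polled */
	WARN_ON_ONCE(!poll_state_synchronize_rcu(cookie));	/* cookie is now ready */
}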