
[04/12] rcu: Switch polled grace-period APIs to ->gp_seq_polled

Message ID 20220620225128.3842050-4-paulmck@kernel.org (mailing list archive)
State New, archived
Series Polled grace-period updates for v5.20

Commit Message

Paul E. McKenney June 20, 2022, 10:51 p.m. UTC
This commit switches the existing polled grace-period APIs to use a
new ->gp_seq_polled counter in the rcu_state structure.  An additional
->gp_seq_polled_snap counter in that same structure allows the normal
grace period kthread to interact properly with the !SMP !PREEMPT fastpath
through synchronize_rcu().  The first of the two to note the end of a
given grace period will make knowledge of this transition available to
the polled API.

This commit is in preparation for polled expedited grace periods.

Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/
Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing
Cc: Brian Foster <bfoster@redhat.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Ian Kent <raven@themaw.net>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 90 +++++++++++++++++++++++++++++++++++++++++++++--
 kernel/rcu/tree.h |  2 ++
 2 files changed, 89 insertions(+), 3 deletions(-)
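
For readers new to the polled API being reworked here, a minimal caller-side sketch may help. The struct and call sites below are hypothetical; the three RCU functions are the real ones that this series retargets from ->gp_seq to ->gp_seq_polled.

	/* Hypothetical user of the polled grace-period API. */
	struct foo {
		unsigned long gp_cookie;
	};

	static void foo_retire(struct foo *fp)
	{
		/* Snapshot RCU's state; cheap and never blocks. */
		fp->gp_cookie = get_state_synchronize_rcu();
	}

	static void foo_free(struct foo *fp)
	{
		/* Free immediately if a full grace period has already
		 * elapsed since foo_retire(); otherwise block, but only
		 * for however much of that grace period remains. */
		if (!poll_state_synchronize_rcu(fp->gp_cookie))
			cond_synchronize_rcu(fp->gp_cookie);
		kfree(fp);
	}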

Comments

Boqun Feng July 21, 2022, 12:53 a.m. UTC | #1
Hi Paul,

On Mon, Jun 20, 2022 at 03:51:20PM -0700, Paul E. McKenney wrote:
> This commit switches the existing polled grace-period APIs to use a
> new ->gp_seq_polled counter in the rcu_state structure.  An additional
> ->gp_seq_polled_snap counter in that same structure allows the normal
> grace period kthread to interact properly with the !SMP !PREEMPT fastpath
> through synchronize_rcu().  The first of the two to note the end of a
> given grace period will make knowledge of this transition available to
> the polled API.
> 
> This commit is in preparation for polled expedited grace periods.
> 
> Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/
> Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing
> Cc: Brian Foster <bfoster@redhat.com>
> Cc: Dave Chinner <david@fromorbit.com>
> Cc: Al Viro <viro@zeniv.linux.org.uk>
> Cc: Ian Kent <raven@themaw.net>
> Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
> ---
>  kernel/rcu/tree.c | 90 +++++++++++++++++++++++++++++++++++++++++++++--
>  kernel/rcu/tree.h |  2 ++
>  2 files changed, 89 insertions(+), 3 deletions(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 46cfceea87847..637e8f9454573 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -1775,6 +1775,78 @@ static void rcu_strict_gp_boundary(void *unused)
>  	invoke_rcu_core();
>  }
>  
> +// Has rcu_init() been invoked?  This is used (for example) to determine
> +// whether spinlocks may be acquired safely.
> +static bool rcu_init_invoked(void)
> +{
> +	return !!rcu_state.n_online_cpus;
> +}
> +
> +// Make the polled API aware of the beginning of a grace period.
> +static void rcu_poll_gp_seq_start(unsigned long *snap)
> +{
> +	struct rcu_node *rnp = rcu_get_root();
> +
> +	if (rcu_init_invoked())
> +		raw_lockdep_assert_held_rcu_node(rnp);
> +
> +	// If RCU was idle, note beginning of GP.
> +	if (!rcu_seq_state(rcu_state.gp_seq_polled))
> +		rcu_seq_start(&rcu_state.gp_seq_polled);
> +
> +	// Either way, record current state.
> +	*snap = rcu_state.gp_seq_polled;
> +}
> +
> +// Make the polled API aware of the end of a grace period.
> +static void rcu_poll_gp_seq_end(unsigned long *snap)
> +{
> +	struct rcu_node *rnp = rcu_get_root();
> +
> +	if (rcu_init_invoked())
> +		raw_lockdep_assert_held_rcu_node(rnp);
> +
> +	// If the previously noted GP is still in effect, record the
> +	// end of that GP.  Either way, zero counter to avoid counter-wrap
> +	// problems.
> +	if (*snap && *snap == rcu_state.gp_seq_polled) {
> +		rcu_seq_end(&rcu_state.gp_seq_polled);
> +		rcu_state.gp_seq_polled_snap = 0;
> +	} else {
> +		*snap = 0;
> +	}
> +}
> +
> +// Make the polled API aware of the beginning of a grace period, but
> +// where caller does not hold the root rcu_node structure's lock.
> +static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap)
> +{
> +	struct rcu_node *rnp = rcu_get_root();
> +
> +	if (rcu_init_invoked()) {
> +		lockdep_assert_irqs_enabled();
> +		raw_spin_lock_irq_rcu_node(rnp);
> +	}
> +	rcu_poll_gp_seq_start(snap);
> +	if (rcu_init_invoked())
> +		raw_spin_unlock_irq_rcu_node(rnp);
> +}
> +
> +// Make the polled API aware of the end of a grace period, but where
> +// caller does not hold the root rcu_node structure's lock.
> +static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap)
> +{
> +	struct rcu_node *rnp = rcu_get_root();
> +
> +	if (rcu_init_invoked()) {
> +		lockdep_assert_irqs_enabled();
> +		raw_spin_lock_irq_rcu_node(rnp);
> +	}
> +	rcu_poll_gp_seq_end(snap);
> +	if (rcu_init_invoked())
> +		raw_spin_unlock_irq_rcu_node(rnp);
> +}
> +
>  /*
>   * Initialize a new grace period.  Return false if no grace period required.
>   */
> @@ -1810,6 +1882,7 @@ static noinline_for_stack bool rcu_gp_init(void)
>  	rcu_seq_start(&rcu_state.gp_seq);
>  	ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
>  	trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
> +	rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap);
>  	raw_spin_unlock_irq_rcu_node(rnp);
>  
>  	/*
> @@ -2069,6 +2142,7 @@ static noinline void rcu_gp_cleanup(void)
>  	 * safe for us to drop the lock in order to mark the grace
>  	 * period as completed in all of the rcu_node structures.
>  	 */
> +	rcu_poll_gp_seq_end(&rcu_state.gp_seq_polled_snap);
>  	raw_spin_unlock_irq_rcu_node(rnp);
>  
>  	/*
> @@ -3837,8 +3911,18 @@ void synchronize_rcu(void)
>  			 lock_is_held(&rcu_lock_map) ||
>  			 lock_is_held(&rcu_sched_lock_map),
>  			 "Illegal synchronize_rcu() in RCU read-side critical section");
> -	if (rcu_blocking_is_gp())
> +	if (rcu_blocking_is_gp()) {
> +		// Note well that this code runs with !PREEMPT && !SMP.
> +		// In addition, all code that advances grace periods runs
> +		// at process level.  Therefore, this GP overlaps with other
> +		// GPs only by being fully nested within them, which allows
> +		// reuse of ->gp_seq_polled_snap.
> +		rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap);
> +		rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap);
> +		if (rcu_init_invoked())
> +			cond_resched_tasks_rcu_qs();
>  		return;  // Context allows vacuous grace periods.
> +	}
>  	if (rcu_gp_is_expedited())
>  		synchronize_rcu_expedited();
>  	else
> @@ -3860,7 +3944,7 @@ unsigned long get_state_synchronize_rcu(void)
>  	 * before the load from ->gp_seq.
>  	 */
>  	smp_mb();  /* ^^^ */
> -	return rcu_seq_snap(&rcu_state.gp_seq);
> +	return rcu_seq_snap(&rcu_state.gp_seq_polled);

I happened to run into this. There is one usage of
get_state_synchronize_rcu() in start_poll_synchronize_rcu(), in which
the return value of get_state_synchronize_rcu() ("gp_seq") will be used
for rcu_start_this_gp(). I don't think this is quite right, because
after this change, rcu_state.gp_seq and rcu_state.gp_seq_polled are
different values; in fact, ->gp_seq_polled is greater than ->gp_seq
by however many times synchronize_rcu() was called in early boot.

Am I missing something here?

Regards,
Boqun

>  }
>  EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
>  
> @@ -3925,7 +4009,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
>  bool poll_state_synchronize_rcu(unsigned long oldstate)
>  {
>  	if (oldstate == RCU_GET_STATE_COMPLETED ||
> -	    rcu_seq_done_exact(&rcu_state.gp_seq, oldstate)) {
> +	    rcu_seq_done_exact(&rcu_state.gp_seq_polled, oldstate)) {
>  		smp_mb(); /* Ensure GP ends before subsequent accesses. */
>  		return true;
>  	}
> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
> index 2ccf5845957df..9c853033f159d 100644
> --- a/kernel/rcu/tree.h
> +++ b/kernel/rcu/tree.h
> @@ -323,6 +323,8 @@ struct rcu_state {
>  	short gp_state;				/* GP kthread sleep state. */
>  	unsigned long gp_wake_time;		/* Last GP kthread wake. */
>  	unsigned long gp_wake_seq;		/* ->gp_seq at ^^^. */
> +	unsigned long gp_seq_polled;		/* GP seq for polled API. */
> +	unsigned long gp_seq_polled_snap;	/* ->gp_seq_polled at normal GP start. */
>  
>  	/* End of fields guarded by root rcu_node's lock. */
>  
> -- 
> 2.31.1.189.g2e36527f23
>
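
To make the early-boot arithmetic concrete: a ->gp_seq-style counter keeps its grace-period phase in the low two bits and the GP count above them (RCU_SEQ_CTR_SHIFT == 2 in kernel/rcu/rcu.h). A small userspace model of the divergence Boqun describes, illustrative only:

	#include <stdio.h>

	#define RCU_SEQ_STATE_MASK 0x3UL	/* low bits: GP phase */

	static unsigned long seq_start(unsigned long s) { return s + 1; }
	static unsigned long seq_end(unsigned long s) { return (s | RCU_SEQ_STATE_MASK) + 1; }
	static unsigned long seq_snap(unsigned long s)
	{
		/* Cookie for a GP one full grace period after any current one. */
		return (s + 2 * RCU_SEQ_STATE_MASK) & ~RCU_SEQ_STATE_MASK;
	}

	int main(void)
	{
		unsigned long gp_seq = 0, gp_seq_polled = 0;
		int i;

		/* Three early-boot synchronize_rcu() fastpaths: each does a
		 * start/end pair on ->gp_seq_polled; ->gp_seq never moves. */
		for (i = 0; i < 3; i++)
			gp_seq_polled = seq_end(seq_start(gp_seq_polled));

		printf("gp_seq=%lu gp_seq_polled=%lu snap(polled)=%lu snap(gp_seq)=%lu\n",
		       gp_seq, gp_seq_polled, seq_snap(gp_seq_polled), seq_snap(gp_seq));
		/* Prints gp_seq=0 gp_seq_polled=12 snap(polled)=16 snap(gp_seq)=4:
		 * feeding the polled snapshot into rcu_start_this_gp() asks for
		 * a GP three full grace periods past the one actually needed. */
		return 0;
	}
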
Paul E. McKenney July 21, 2022, 1:04 a.m. UTC | #2
On Wed, Jul 20, 2022 at 05:53:38PM -0700, Boqun Feng wrote:
> Hi Paul,
> 
> On Mon, Jun 20, 2022 at 03:51:20PM -0700, Paul E. McKenney wrote:
> > This commit switches the existing polled grace-period APIs to use a
> > new ->gp_seq_polled counter in the rcu_state structure.  An additional
> > ->gp_seq_polled_snap counter in that same structure allows the normal
> > grace period kthread to interact properly with the !SMP !PREEMPT fastpath
> > through synchronize_rcu().  The first of the two to note the end of a
> > given grace period will make knowledge of this transition available to
> > the polled API.
> > 
> > This commit is in preparation for polled expedited grace periods.
> > 
> > Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/
> > Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing
> > Cc: Brian Foster <bfoster@redhat.com>
> > Cc: Dave Chinner <david@fromorbit.com>
> > Cc: Al Viro <viro@zeniv.linux.org.uk>
> > Cc: Ian Kent <raven@themaw.net>
> > Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
> > ---
> >  kernel/rcu/tree.c | 90 +++++++++++++++++++++++++++++++++++++++++++++--
> >  kernel/rcu/tree.h |  2 ++
> >  2 files changed, 89 insertions(+), 3 deletions(-)
> > 
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index 46cfceea87847..637e8f9454573 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -1775,6 +1775,78 @@ static void rcu_strict_gp_boundary(void *unused)
> >  	invoke_rcu_core();
> >  }
> >  
> > +// Has rcu_init() been invoked?  This is used (for example) to determine
> > +// whether spinlocks may be acquired safely.
> > +static bool rcu_init_invoked(void)
> > +{
> > +	return !!rcu_state.n_online_cpus;
> > +}
> > +
> > +// Make the polled API aware of the beginning of a grace period.
> > +static void rcu_poll_gp_seq_start(unsigned long *snap)
> > +{
> > +	struct rcu_node *rnp = rcu_get_root();
> > +
> > +	if (rcu_init_invoked())
> > +		raw_lockdep_assert_held_rcu_node(rnp);
> > +
> > +	// If RCU was idle, note beginning of GP.
> > +	if (!rcu_seq_state(rcu_state.gp_seq_polled))
> > +		rcu_seq_start(&rcu_state.gp_seq_polled);
> > +
> > +	// Either way, record current state.
> > +	*snap = rcu_state.gp_seq_polled;
> > +}
> > +
> > +// Make the polled API aware of the end of a grace period.
> > +static void rcu_poll_gp_seq_end(unsigned long *snap)
> > +{
> > +	struct rcu_node *rnp = rcu_get_root();
> > +
> > +	if (rcu_init_invoked())
> > +		raw_lockdep_assert_held_rcu_node(rnp);
> > +
> > +	// If the previously noted GP is still in effect, record the
> > +	// end of that GP.  Either way, zero counter to avoid counter-wrap
> > +	// problems.
> > +	if (*snap && *snap == rcu_state.gp_seq_polled) {
> > +		rcu_seq_end(&rcu_state.gp_seq_polled);
> > +		rcu_state.gp_seq_polled_snap = 0;
> > +	} else {
> > +		*snap = 0;
> > +	}
> > +}
> > +
> > +// Make the polled API aware of the beginning of a grace period, but
> > +// where caller does not hold the root rcu_node structure's lock.
> > +static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap)
> > +{
> > +	struct rcu_node *rnp = rcu_get_root();
> > +
> > +	if (rcu_init_invoked()) {
> > +		lockdep_assert_irqs_enabled();
> > +		raw_spin_lock_irq_rcu_node(rnp);
> > +	}
> > +	rcu_poll_gp_seq_start(snap);
> > +	if (rcu_init_invoked())
> > +		raw_spin_unlock_irq_rcu_node(rnp);
> > +}
> > +
> > +// Make the polled API aware of the end of a grace period, but where
> > +// caller does not hold the root rcu_node structure's lock.
> > +static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap)
> > +{
> > +	struct rcu_node *rnp = rcu_get_root();
> > +
> > +	if (rcu_init_invoked()) {
> > +		lockdep_assert_irqs_enabled();
> > +		raw_spin_lock_irq_rcu_node(rnp);
> > +	}
> > +	rcu_poll_gp_seq_end(snap);
> > +	if (rcu_init_invoked())
> > +		raw_spin_unlock_irq_rcu_node(rnp);
> > +}
> > +
> >  /*
> >   * Initialize a new grace period.  Return false if no grace period required.
> >   */
> > @@ -1810,6 +1882,7 @@ static noinline_for_stack bool rcu_gp_init(void)
> >  	rcu_seq_start(&rcu_state.gp_seq);
> >  	ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
> >  	trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
> > +	rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap);
> >  	raw_spin_unlock_irq_rcu_node(rnp);
> >  
> >  	/*
> > @@ -2069,6 +2142,7 @@ static noinline void rcu_gp_cleanup(void)
> >  	 * safe for us to drop the lock in order to mark the grace
> >  	 * period as completed in all of the rcu_node structures.
> >  	 */
> > +	rcu_poll_gp_seq_end(&rcu_state.gp_seq_polled_snap);
> >  	raw_spin_unlock_irq_rcu_node(rnp);
> >  
> >  	/*
> > @@ -3837,8 +3911,18 @@ void synchronize_rcu(void)
> >  			 lock_is_held(&rcu_lock_map) ||
> >  			 lock_is_held(&rcu_sched_lock_map),
> >  			 "Illegal synchronize_rcu() in RCU read-side critical section");
> > -	if (rcu_blocking_is_gp())
> > +	if (rcu_blocking_is_gp()) {
> > +		// Note well that this code runs with !PREEMPT && !SMP.
> > +		// In addition, all code that advances grace periods runs
> > +		// at process level.  Therefore, this GP overlaps with other
> > +		// GPs only by being fully nested within them, which allows
> > +		// reuse of ->gp_seq_polled_snap.
> > +		rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap);
> > +		rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap);
> > +		if (rcu_init_invoked())
> > +			cond_resched_tasks_rcu_qs();
> >  		return;  // Context allows vacuous grace periods.
> > +	}
> >  	if (rcu_gp_is_expedited())
> >  		synchronize_rcu_expedited();
> >  	else
> > @@ -3860,7 +3944,7 @@ unsigned long get_state_synchronize_rcu(void)
> >  	 * before the load from ->gp_seq.
> >  	 */
> >  	smp_mb();  /* ^^^ */
> > -	return rcu_seq_snap(&rcu_state.gp_seq);
> > +	return rcu_seq_snap(&rcu_state.gp_seq_polled);
> 
> I happened to run into this. There is one usage of
> get_state_synchronize_rcu() in start_poll_synchronize_rcu(), in which
> the return value of get_state_synchronize_rcu() ("gp_seq") will be used
> for rcu_start_this_gp(). I don't think this is quite right, because
> after this change, rcu_state.gp_seq and rcu_state.gp_seq_polled are
> different values; in fact, ->gp_seq_polled is greater than ->gp_seq
> by however many times synchronize_rcu() was called in early boot.
> 
> Am I missing something here?

It does not appear that you are missing anything, sad to say!

Does the following make it work better?

							Thanx, Paul

------------------------------------------------------------------------

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 2122359f0c862..cf2fd58a93a41 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3571,7 +3571,7 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
 unsigned long start_poll_synchronize_rcu(void)
 {
 	unsigned long flags;
-	unsigned long gp_seq = get_state_synchronize_rcu();
+	unsigned long gp_seq = rcu_seq_snap(&rcu_state.gp_seq);
 	bool needwake;
 	struct rcu_data *rdp;
 	struct rcu_node *rnp;
Boqun Feng July 21, 2022, 1:51 a.m. UTC | #3
On Wed, Jul 20, 2022 at 06:04:55PM -0700, Paul E. McKenney wrote:
[...]
> > > @@ -3860,7 +3944,7 @@ unsigned long get_state_synchronize_rcu(void)
> > >  	 * before the load from ->gp_seq.
> > >  	 */
> > >  	smp_mb();  /* ^^^ */
> > > -	return rcu_seq_snap(&rcu_state.gp_seq);
> > > +	return rcu_seq_snap(&rcu_state.gp_seq_polled);
> > 
> > I happened to run into this. There is one usage of
> > get_state_synchronize_rcu() in start_poll_synchronize_rcu(), in which
> > the return value of get_state_synchronize_rcu() ("gp_seq") will be used
> > for rcu_start_this_gp(). I don't think this is quite right, because
> > after this change, rcu_state.gp_seq and rcu_state.gp_seq_polled are
> > different values; in fact, ->gp_seq_polled is greater than ->gp_seq
> > by however many times synchronize_rcu() was called in early boot.
> > 
> > Am I missing something here?
> 
> It does not appear that you are missing anything, sad to say!
> 
> Does the following make it work better?
> 
> 							Thanx, Paul
> 
> ------------------------------------------------------------------------
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 2122359f0c862..cf2fd58a93a41 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -3571,7 +3571,7 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
>  unsigned long start_poll_synchronize_rcu(void)
>  {
>  	unsigned long flags;
> -	unsigned long gp_seq = get_state_synchronize_rcu();
> +	unsigned long gp_seq = rcu_seq_snap(&rcu_state.gp_seq);

get_state_synchronize_rcu() is still needed, because we need to return
a cookie for polling for this function. Something like below maybe? Hope
I didn't mess up the ordering ;-)

Regards,
Boqun

---------------
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 84d281776688..0f9134871289 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3571,11 +3583,39 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
 unsigned long start_poll_synchronize_rcu(void)
 {
        unsigned long flags;
-       unsigned long gp_seq = get_state_synchronize_rcu();
+       unsigned long gp_seq_poll = get_state_synchronize_rcu();
+       unsigned long gp_seq;
        bool needwake;
        struct rcu_data *rdp;
        struct rcu_node *rnp;

+       /*
+        * Need to start a gp if no gp has been started yet.
+        *
+        * Note that we need to snapshot gp_seq after gp_seq_poll, otherwise
+        *      consider the following case:
+        *
+        *      <no gp in progress>     // gp# is 0
+        *      snapshot gp_seq         // gp #2 will be set as needed
+        *      <a gp passed>
+        *                              // gp# is 1
+        *      snapshot gp_seq_poll    // polling not ready until gp #3
+        *
+        * then the following rcu_start_this_gp() won't mark gp #3 as needed,
+        * and polling won't become ready if others don't start a gp.
+        *
+        * And the following case is fine:
+        *
+        *      <no gp in progress>     // gp# is 0
+        *      snapshot gp_seq_poll    // polling not ready until gp #2
+        *      <a gp passed>
+        *                              // gp# is 1
+        *      snapshot gp_seq         // gp #3 will be set as needed
+        *
+        * Also note, we rely on the smp_mb() in get_state_synchronize_rcu()
+        * to order the two snapshots.
+        */
+       gp_seq = rcu_seq_snap(&rcu_state.gp_seq);
        lockdep_assert_irqs_enabled();
        local_irq_save(flags);
        rdp = this_cpu_ptr(&rcu_data);
@@ -3585,7 +3625,7 @@ unsigned long start_poll_synchronize_rcu(void)
        raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
        if (needwake)
                rcu_gp_kthread_wake();
-       return gp_seq;
+       return gp_seq_poll;
 }
 EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
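
To sanity-check the ordering argument in the comment above, here is a userspace model of the two snapshot orders, with a single counter standing in for both sequence counters (illustrative only):

	#include <stdio.h>

	#define MASK 0x3UL

	static unsigned long snap(unsigned long s)
	{
		return (s + 2 * MASK) & ~MASK;	/* as rcu_seq_snap() */
	}

	int main(void)
	{
		unsigned long gp_seq, needed, cookie;

		/* Bad order: request the GP first, take the cookie second. */
		gp_seq = 0;			/* no GP in progress */
		needed = snap(gp_seq);		/* rcu_start_this_gp() target: 4 */
		gp_seq += 4;			/* a full GP elapses in between */
		cookie = snap(gp_seq);		/* cookie not ready until 8 */
		printf("bad:  needed=%lu cookie=%lu\n", needed, cookie);
		/* The requested GP (4) already ended, yet the cookie needs 8:
		 * polling stays unready unless someone else starts a GP. */

		/* Good order (as above): cookie first, GP request second. */
		gp_seq = 0;
		cookie = snap(gp_seq);		/* cookie ready at 4 */
		gp_seq += 4;
		needed = snap(gp_seq);		/* target 8 >= cookie 4 */
		printf("good: needed=%lu cookie=%lu\n", needed, cookie);
		return 0;
	}
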
Paul E. McKenney July 21, 2022, 1:56 a.m. UTC | #4
On Wed, Jul 20, 2022 at 06:04:55PM -0700, Paul E. McKenney wrote:
> On Wed, Jul 20, 2022 at 05:53:38PM -0700, Boqun Feng wrote:
> > Hi Paul,
> > 
> > On Mon, Jun 20, 2022 at 03:51:20PM -0700, Paul E. McKenney wrote:
> > > This commit switches the existing polled grace-period APIs to use a
> > > new ->gp_seq_polled counter in the rcu_state structure.  An additional
> > > ->gp_seq_polled_snap counter in that same structure allows the normal
> > > grace period kthread to interact properly with the !SMP !PREEMPT fastpath
> > > through synchronize_rcu().  The first of the two to note the end of a
> > > given grace period will make knowledge of this transition available to
> > > the polled API.
> > > 
> > > This commit is in preparation for polled expedited grace periods.
> > > 
> > > Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/
> > > Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing
> > > Cc: Brian Foster <bfoster@redhat.com>
> > > Cc: Dave Chinner <david@fromorbit.com>
> > > Cc: Al Viro <viro@zeniv.linux.org.uk>
> > > Cc: Ian Kent <raven@themaw.net>
> > > Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
> > > ---
> > >  kernel/rcu/tree.c | 90 +++++++++++++++++++++++++++++++++++++++++++++--
> > >  kernel/rcu/tree.h |  2 ++
> > >  2 files changed, 89 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > index 46cfceea87847..637e8f9454573 100644
> > > --- a/kernel/rcu/tree.c
> > > +++ b/kernel/rcu/tree.c
> > > @@ -1775,6 +1775,78 @@ static void rcu_strict_gp_boundary(void *unused)
> > >  	invoke_rcu_core();
> > >  }
> > >  
> > > +// Has rcu_init() been invoked?  This is used (for example) to determine
> > > +// whether spinlocks may be acquired safely.
> > > +static bool rcu_init_invoked(void)
> > > +{
> > > +	return !!rcu_state.n_online_cpus;
> > > +}
> > > +
> > > +// Make the polled API aware of the beginning of a grace period.
> > > +static void rcu_poll_gp_seq_start(unsigned long *snap)
> > > +{
> > > +	struct rcu_node *rnp = rcu_get_root();
> > > +
> > > +	if (rcu_init_invoked())
> > > +		raw_lockdep_assert_held_rcu_node(rnp);
> > > +
> > > +	// If RCU was idle, note beginning of GP.
> > > +	if (!rcu_seq_state(rcu_state.gp_seq_polled))
> > > +		rcu_seq_start(&rcu_state.gp_seq_polled);
> > > +
> > > +	// Either way, record current state.
> > > +	*snap = rcu_state.gp_seq_polled;
> > > +}
> > > +
> > > +// Make the polled API aware of the end of a grace period.
> > > +static void rcu_poll_gp_seq_end(unsigned long *snap)
> > > +{
> > > +	struct rcu_node *rnp = rcu_get_root();
> > > +
> > > +	if (rcu_init_invoked())
> > > +		raw_lockdep_assert_held_rcu_node(rnp);
> > > +
> > > +	// If the previously noted GP is still in effect, record the
> > > +	// end of that GP.  Either way, zero counter to avoid counter-wrap
> > > +	// problems.
> > > +	if (*snap && *snap == rcu_state.gp_seq_polled) {
> > > +		rcu_seq_end(&rcu_state.gp_seq_polled);
> > > +		rcu_state.gp_seq_polled_snap = 0;
> > > +	} else {
> > > +		*snap = 0;
> > > +	}
> > > +}
> > > +
> > > +// Make the polled API aware of the beginning of a grace period, but
> > > +// where caller does not hold the root rcu_node structure's lock.
> > > +static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap)
> > > +{
> > > +	struct rcu_node *rnp = rcu_get_root();
> > > +
> > > +	if (rcu_init_invoked()) {
> > > +		lockdep_assert_irqs_enabled();
> > > +		raw_spin_lock_irq_rcu_node(rnp);
> > > +	}
> > > +	rcu_poll_gp_seq_start(snap);
> > > +	if (rcu_init_invoked())
> > > +		raw_spin_unlock_irq_rcu_node(rnp);
> > > +}
> > > +
> > > +// Make the polled API aware of the end of a grace period, but where
> > > +// caller does not hold the root rcu_node structure's lock.
> > > +static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap)
> > > +{
> > > +	struct rcu_node *rnp = rcu_get_root();
> > > +
> > > +	if (rcu_init_invoked()) {
> > > +		lockdep_assert_irqs_enabled();
> > > +		raw_spin_lock_irq_rcu_node(rnp);
> > > +	}
> > > +	rcu_poll_gp_seq_end(snap);
> > > +	if (rcu_init_invoked())
> > > +		raw_spin_unlock_irq_rcu_node(rnp);
> > > +}
> > > +
> > >  /*
> > >   * Initialize a new grace period.  Return false if no grace period required.
> > >   */
> > > @@ -1810,6 +1882,7 @@ static noinline_for_stack bool rcu_gp_init(void)
> > >  	rcu_seq_start(&rcu_state.gp_seq);
> > >  	ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
> > >  	trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
> > > +	rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap);
> > >  	raw_spin_unlock_irq_rcu_node(rnp);
> > >  
> > >  	/*
> > > @@ -2069,6 +2142,7 @@ static noinline void rcu_gp_cleanup(void)
> > >  	 * safe for us to drop the lock in order to mark the grace
> > >  	 * period as completed in all of the rcu_node structures.
> > >  	 */
> > > +	rcu_poll_gp_seq_end(&rcu_state.gp_seq_polled_snap);
> > >  	raw_spin_unlock_irq_rcu_node(rnp);
> > >  
> > >  	/*
> > > @@ -3837,8 +3911,18 @@ void synchronize_rcu(void)
> > >  			 lock_is_held(&rcu_lock_map) ||
> > >  			 lock_is_held(&rcu_sched_lock_map),
> > >  			 "Illegal synchronize_rcu() in RCU read-side critical section");
> > > -	if (rcu_blocking_is_gp())
> > > +	if (rcu_blocking_is_gp()) {
> > > +		// Note well that this code runs with !PREEMPT && !SMP.
> > > +		// In addition, all code that advances grace periods runs
> > > +		// at process level.  Therefore, this GP overlaps with other
> > > +		// GPs only by being fully nested within them, which allows
> > > +		// reuse of ->gp_seq_polled_snap.
> > > +		rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap);
> > > +		rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap);
> > > +		if (rcu_init_invoked())
> > > +			cond_resched_tasks_rcu_qs();
> > >  		return;  // Context allows vacuous grace periods.
> > > +	}
> > >  	if (rcu_gp_is_expedited())
> > >  		synchronize_rcu_expedited();
> > >  	else
> > > @@ -3860,7 +3944,7 @@ unsigned long get_state_synchronize_rcu(void)
> > >  	 * before the load from ->gp_seq.
> > >  	 */
> > >  	smp_mb();  /* ^^^ */
> > > -	return rcu_seq_snap(&rcu_state.gp_seq);
> > > +	return rcu_seq_snap(&rcu_state.gp_seq_polled);
> > 
> > I happened to run into this. There is one usage of
> > get_state_synchronize_rcu() in start_poll_synchronize_rcu(), in which
> > the return value of get_state_synchronize_rcu() ("gp_seq") will be used
> > for rcu_start_this_gp(). I don't think this is quite right, because
> > after this change, rcu_state.gp_seq and rcu_state.gp_seq_polled are
> > different values; in fact, ->gp_seq_polled is greater than ->gp_seq
> > by however many times synchronize_rcu() was called in early boot.
> > 
> > Am I missing something here?
> 
> It does not appear that you are missing anything, sad to say!
> 
> Does the following make it work better?

Well, rcutorture doesn't like this change much.  ;-)

No surprise, given that it is only the value feeding into
rcu_start_this_gp() that needs to change, not the value returned from
start_poll_synchronize_rcu().

Take 2, still untested.

							Thanx, Paul

------------------------------------------------------------------------

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 2122359f0c862..061c1f6737ddc 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3581,7 +3581,7 @@ unsigned long start_poll_synchronize_rcu(void)
 	rdp = this_cpu_ptr(&rcu_data);
 	rnp = rdp->mynode;
 	raw_spin_lock_rcu_node(rnp); // irqs already disabled.
-	needwake = rcu_start_this_gp(rnp, rdp, gp_seq);
+	needwake = rcu_start_this_gp(rnp, rdp, rcu_seq_snap(&rcu_state.gp_seq));
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	if (needwake)
 		rcu_gp_kthread_wake();
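
For reference, the shape of start_poll_synchronize_rcu() with this take-2 change applied, reconstructed from the hunks quoted in this thread (annotations added):

	unsigned long start_poll_synchronize_rcu(void)
	{
		unsigned long flags;
		// Cookie in ->gp_seq_polled units, for the caller to poll.
		unsigned long gp_seq = get_state_synchronize_rcu();
		bool needwake;
		struct rcu_data *rdp;
		struct rcu_node *rnp;

		lockdep_assert_irqs_enabled();
		local_irq_save(flags);
		rdp = this_cpu_ptr(&rcu_data);
		rnp = rdp->mynode;
		raw_spin_lock_rcu_node(rnp); // irqs already disabled.
		// GP request recomputed in ->gp_seq units for the GP kthread.
		needwake = rcu_start_this_gp(rnp, rdp, rcu_seq_snap(&rcu_state.gp_seq));
		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
		if (needwake)
			rcu_gp_kthread_wake();
		return gp_seq; // polled against ->gp_seq_polled later.
	}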
Paul E. McKenney July 21, 2022, 4:47 a.m. UTC | #5
On Wed, Jul 20, 2022 at 06:51:45PM -0700, Boqun Feng wrote:
> On Wed, Jul 20, 2022 at 06:04:55PM -0700, Paul E. McKenney wrote:
> [...]
> > > > @@ -3860,7 +3944,7 @@ unsigned long get_state_synchronize_rcu(void)
> > > >  	 * before the load from ->gp_seq.
> > > >  	 */
> > > >  	smp_mb();  /* ^^^ */
> > > > -	return rcu_seq_snap(&rcu_state.gp_seq);
> > > > +	return rcu_seq_snap(&rcu_state.gp_seq_polled);
> > > 
> > > I happened to run into this. There is one usage of
> > > get_state_synchronize_rcu() in start_poll_synchronize_rcu(), in which
> > > the return value of get_state_synchronize_rcu() ("gp_seq") will be used
> > > for rcu_start_this_gp(). I don't think this is quite right, because
> > > after this change, rcu_state.gp_seq and rcu_state.gp_seq_polled are
> > > different values; in fact, ->gp_seq_polled is greater than ->gp_seq
> > > by however many times synchronize_rcu() was called in early boot.
> > > 
> > > Am I missing something here?
> > 
> > It does not appear that you are missing anything, sad to say!
> > 
> > Does the following make it work better?
> > 
> > 							Thanx, Paul
> > 
> > ------------------------------------------------------------------------
> > 
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index 2122359f0c862..cf2fd58a93a41 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -3571,7 +3571,7 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
> >  unsigned long start_poll_synchronize_rcu(void)
> >  {
> >  	unsigned long flags;
> > -	unsigned long gp_seq = get_state_synchronize_rcu();
> > +	unsigned long gp_seq = rcu_seq_snap(&rcu_state.gp_seq);
> 
> get_state_synchronize_rcu() is still needed, because we need to return
> a cookie for polling for this function. Something like below maybe? Hope
> I didn't mess up the ordering ;-)

My thought is to combine your comment with my functionally equivalent
code that avoids the extra variable.  If that works for you (and if it
works, for that matter), does Co-developed-by work for you?

							Thanx, Paul

> Regards,
> Boqun
> 
> ---------------
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 84d281776688..0f9134871289 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -3571,11 +3583,39 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
>  unsigned long start_poll_synchronize_rcu(void)
>  {
>         unsigned long flags;
> -       unsigned long gp_seq = get_state_synchronize_rcu();
> +       unsigned long gp_seq_poll = get_state_synchronize_rcu();
> +       unsigned long gp_seq;
>         bool needwake;
>         struct rcu_data *rdp;
>         struct rcu_node *rnp;
> 
> +       /*
> +        * Need to start a gp if no gp has been started yet.
> +        *
> +        * Note that we need to snapshot gp_seq after gp_seq_poll, otherwise
> +        *      consider the following case:
> +        *
> +        *      <no gp in progress>     // gp# is 0
> +        *      snapshot gp_seq         // gp #2 will be set as needed
> +        *      <a gp passed>
> +        *                              // gp# is 1
> +        *      snapshot gp_seq_poll    // polling not ready until gp #3
> +        *
> +        * then the following rcu_start_this_gp() won't mark gp #3 as needed,
> +        * and polling won't become ready if others don't start a gp.
> +        *
> +        * And the following case is fine:
> +        *
> +        *      <no gp in progress>     // gp# is 0
> +        *      snapshot gp_seq_poll    // polling not ready until gp #2
> +        *      <a gp passed>
> +        *                              // gp# is 1
> +        *      snapshot gp_seq         // gp #3 will be set as needed
> +        *
> +        * Also note, we rely on the smp_mb() in get_state_synchronize_rcu()
> +        * to order the two snapshots.
> +        */
> +       gp_seq = rcu_seq_snap(&rcu_state.gp_seq);
>         lockdep_assert_irqs_enabled();
>         local_irq_save(flags);
>         rdp = this_cpu_ptr(&rcu_data);
> @@ -3585,7 +3625,7 @@ unsigned long start_poll_synchronize_rcu(void)
>         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
>         if (needwake)
>                 rcu_gp_kthread_wake();
> -       return gp_seq;
> +       return gp_seq_poll;
>  }
>  EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
Boqun Feng July 21, 2022, 5:49 a.m. UTC | #6
On Wed, Jul 20, 2022 at 09:47:08PM -0700, Paul E. McKenney wrote:
> On Wed, Jul 20, 2022 at 06:51:45PM -0700, Boqun Feng wrote:
> > On Wed, Jul 20, 2022 at 06:04:55PM -0700, Paul E. McKenney wrote:
> > [...]
> > > > > @@ -3860,7 +3944,7 @@ unsigned long get_state_synchronize_rcu(void)
> > > > >  	 * before the load from ->gp_seq.
> > > > >  	 */
> > > > >  	smp_mb();  /* ^^^ */
> > > > > -	return rcu_seq_snap(&rcu_state.gp_seq);
> > > > > +	return rcu_seq_snap(&rcu_state.gp_seq_polled);
> > > > 
> > > > I happened to run into this. There is one usage of
> > > > get_state_synchronize_rcu() in start_poll_synchronize_rcu(), in which
> > > > the return value of get_state_synchronize_rcu() ("gp_seq") will be used
> > > > for rcu_start_this_gp(). I don't think this is quite right, because
> > > > after this change, rcu_state.gp_seq and rcu_state.gp_seq_polled are
> > > > different values; in fact, ->gp_seq_polled is greater than ->gp_seq
> > > > by however many times synchronize_rcu() was called in early boot.
> > > > 
> > > > Am I missing something here?
> > > 
> > > It does not appear that you are missing anything, sad to say!
> > > 
> > > Does the following make it work better?
> > > 
> > > 							Thanx, Paul
> > > 
> > > ------------------------------------------------------------------------
> > > 
> > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > index 2122359f0c862..cf2fd58a93a41 100644
> > > --- a/kernel/rcu/tree.c
> > > +++ b/kernel/rcu/tree.c
> > > @@ -3571,7 +3571,7 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
> > >  unsigned long start_poll_synchronize_rcu(void)
> > >  {
> > >  	unsigned long flags;
> > > -	unsigned long gp_seq = get_state_synchronize_rcu();
> > > +	unsigned long gp_seq = rcu_seq_snap(&rcu_state.gp_seq);
> > 
> > get_state_synchronize_rcu() is still needed, because we need to return
> > a cookie for polling for this function. Something like below maybe? Hope
> > I didn't mess up the ordering ;-)
> 
> My thought is to combine your comment with my functionally equivalent
> code that avoids the extra variable.  If that works for you (and if it
> works, for that matter), does Co-developed-by work for you?
> 

Sure! Thanks ;-)

Regards,
Boqun

> 							Thanx, Paul
> 
> > Regards,
> > Boqun
> > 
> > ---------------
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index 84d281776688..0f9134871289 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -3571,11 +3583,39 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
> >  unsigned long start_poll_synchronize_rcu(void)
> >  {
> >         unsigned long flags;
> > -       unsigned long gp_seq = get_state_synchronize_rcu();
> > +       unsigned long gp_seq_poll = get_state_synchronize_rcu();
> > +       unsigned long gp_seq;
> >         bool needwake;
> >         struct rcu_data *rdp;
> >         struct rcu_node *rnp;
> > 
> > +       /*
> > +        * Need to start a gp if no gp has been started yet.
> > +        *
> > +        * Note that we need to snapshot gp_seq after gp_seq_poll, otherwise
> > +        *      consider the following case:
> > +        *
> > +        *      <no gp in progress>     // gp# is 0
> > +        *      snapshot gp_seq         // gp #2 will be set as needed
> > +        *      <a gp passed>
> > +        *                              // gp# is 1
> > +        *      snapshot gp_seq_poll    // polling not ready until gp #3
> > +        *
> > +        * then the following rcu_start_this_gp() won't mark gp #3 as needed,
> > +        * and polling won't become ready if others don't start a gp.
> > +        *
> > +        * And the following case is fine:
> > +        *
> > +        *      <no gp in progress>     // gp# is 0
> > +        *      snapshot gp_seq_poll    // polling not ready until gp #2
> > +        *      <a gp passed>
> > +        *                              // gp# is 1
> > +        *      snapshot gp_seq         // gp #3 will be set as needed
> > +        *
> > +        * Also note, we rely on the smp_mb() in get_state_synchronize_rcu()
> > +        * to order the two snapshots.
> > +        */
> > +       gp_seq = rcu_seq_snap(&rcu_state.gp_seq);
> >         lockdep_assert_irqs_enabled();
> >         local_irq_save(flags);
> >         rdp = this_cpu_ptr(&rcu_data);
> > @@ -3585,7 +3625,7 @@ unsigned long start_poll_synchronize_rcu(void)
> >         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
> >         if (needwake)
> >                 rcu_gp_kthread_wake();
> > -       return gp_seq;
> > +       return gp_seq_poll;
> >  }
> >  EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
Paul E. McKenney July 22, 2022, 1:03 a.m. UTC | #7
On Wed, Jul 20, 2022 at 10:49:26PM -0700, Boqun Feng wrote:
> On Wed, Jul 20, 2022 at 09:47:08PM -0700, Paul E. McKenney wrote:
> > On Wed, Jul 20, 2022 at 06:51:45PM -0700, Boqun Feng wrote:
> > > On Wed, Jul 20, 2022 at 06:04:55PM -0700, Paul E. McKenney wrote:
> > > [...]
> > > > > > @@ -3860,7 +3944,7 @@ unsigned long get_state_synchronize_rcu(void)
> > > > > >  	 * before the load from ->gp_seq.
> > > > > >  	 */
> > > > > >  	smp_mb();  /* ^^^ */
> > > > > > -	return rcu_seq_snap(&rcu_state.gp_seq);
> > > > > > +	return rcu_seq_snap(&rcu_state.gp_seq_polled);
> > > > > 
> > > > > I happened to run into this. There is one usage of
> > > > > get_state_synchronize_rcu() in start_poll_synchronize_rcu(), in which
> > > > > the return value of get_state_synchronize_rcu() ("gp_seq") will be used
> > > > > for rcu_start_this_gp(). I don't think this is quite right, because
> > > > > after this change, rcu_state.gp_seq and rcu_state.gp_seq_polled are
> > > > > different values; in fact, ->gp_seq_polled is greater than ->gp_seq
> > > > > by however many times synchronize_rcu() was called in early boot.
> > > > > 
> > > > > Am I missing something here?
> > > > 
> > > > It does not appear that you are missing anything, sad to say!
> > > > 
> > > > Does the following make it work better?
> > > > 
> > > > 							Thanx, Paul
> > > > 
> > > > ------------------------------------------------------------------------
> > > > 
> > > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > > index 2122359f0c862..cf2fd58a93a41 100644
> > > > --- a/kernel/rcu/tree.c
> > > > +++ b/kernel/rcu/tree.c
> > > > @@ -3571,7 +3571,7 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
> > > >  unsigned long start_poll_synchronize_rcu(void)
> > > >  {
> > > >  	unsigned long flags;
> > > > -	unsigned long gp_seq = get_state_synchronize_rcu();
> > > > +	unsigned long gp_seq = rcu_seq_snap(&rcu_state.gp_seq);
> > > 
> > > get_state_synchronize_rcu() is still needed, because we need to return
> > > a cookie for polling for this function. Something like below maybe? Hope
> > > I didn't mess up the ordering ;-)
> > 
> > My thought is to combine your comment with my functionally equivalent
> > code that avoids the extra variable.  If that works for you (and if it
> > works, for that matter), does Co-developed-by work for you?
> 
> Sure! Thanks ;-)

I did some summarization on the comment, folded it into the original,
and ended up with the patch shown below.  Thoughts?

							Thanx, Paul

------------------------------------------------------------------------

commit bf95b2bc3e42f11f4d7a5e8a98376c2b4a2aa82f
Author: Paul E. McKenney <paulmck@kernel.org>
Date:   Wed Apr 13 17:46:15 2022 -0700

    rcu: Switch polled grace-period APIs to ->gp_seq_polled
    
    This commit switches the existing polled grace-period APIs to use a
    new ->gp_seq_polled counter in the rcu_state structure.  An additional
    ->gp_seq_polled_snap counter in that same structure allows the normal
    grace period kthread to interact properly with the !SMP !PREEMPT fastpath
    through synchronize_rcu().  The first of the two to note the end of a
    given grace period will make knowledge of this transition available to
    the polled API.
    
    This commit is in preparation for polled expedited grace periods.
    
    [ paulmck: Fix use of rcu_state.gp_seq_polled to start normal grace period. ]
    
    Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/
    Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing
    Cc: Brian Foster <bfoster@redhat.com>
    Cc: Dave Chinner <david@fromorbit.com>
    Cc: Al Viro <viro@zeniv.linux.org.uk>
    Cc: Ian Kent <raven@themaw.net>
    Co-developed-by: Boqun Feng <boqun.feng@gmail.com>
    Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
    Signed-off-by: Paul E. McKenney <paulmck@kernel.org>

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 46cfceea87847..b40a5a19ddd2a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1775,6 +1775,78 @@ static void rcu_strict_gp_boundary(void *unused)
 	invoke_rcu_core();
 }
 
+// Has rcu_init() been invoked?  This is used (for example) to determine
+// whether spinlocks may be acquired safely.
+static bool rcu_init_invoked(void)
+{
+	return !!rcu_state.n_online_cpus;
+}
+
+// Make the polled API aware of the beginning of a grace period.
+static void rcu_poll_gp_seq_start(unsigned long *snap)
+{
+	struct rcu_node *rnp = rcu_get_root();
+
+	if (rcu_init_invoked())
+		raw_lockdep_assert_held_rcu_node(rnp);
+
+	// If RCU was idle, note beginning of GP.
+	if (!rcu_seq_state(rcu_state.gp_seq_polled))
+		rcu_seq_start(&rcu_state.gp_seq_polled);
+
+	// Either way, record current state.
+	*snap = rcu_state.gp_seq_polled;
+}
+
+// Make the polled API aware of the end of a grace period.
+static void rcu_poll_gp_seq_end(unsigned long *snap)
+{
+	struct rcu_node *rnp = rcu_get_root();
+
+	if (rcu_init_invoked())
+		raw_lockdep_assert_held_rcu_node(rnp);
+
+	// If the previously noted GP is still in effect, record the
+	// end of that GP.  Either way, zero counter to avoid counter-wrap
+	// problems.
+	if (*snap && *snap == rcu_state.gp_seq_polled) {
+		rcu_seq_end(&rcu_state.gp_seq_polled);
+		rcu_state.gp_seq_polled_snap = 0;
+	} else {
+		*snap = 0;
+	}
+}
+
+// Make the polled API aware of the beginning of a grace period, but
+// where caller does not hold the root rcu_node structure's lock.
+static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap)
+{
+	struct rcu_node *rnp = rcu_get_root();
+
+	if (rcu_init_invoked()) {
+		lockdep_assert_irqs_enabled();
+		raw_spin_lock_irq_rcu_node(rnp);
+	}
+	rcu_poll_gp_seq_start(snap);
+	if (rcu_init_invoked())
+		raw_spin_unlock_irq_rcu_node(rnp);
+}
+
+// Make the polled API aware of the end of a grace period, but where
+// caller does not hold the root rcu_node structure's lock.
+static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap)
+{
+	struct rcu_node *rnp = rcu_get_root();
+
+	if (rcu_init_invoked()) {
+		lockdep_assert_irqs_enabled();
+		raw_spin_lock_irq_rcu_node(rnp);
+	}
+	rcu_poll_gp_seq_end(snap);
+	if (rcu_init_invoked())
+		raw_spin_unlock_irq_rcu_node(rnp);
+}
+
 /*
  * Initialize a new grace period.  Return false if no grace period required.
  */
@@ -1810,6 +1882,7 @@ static noinline_for_stack bool rcu_gp_init(void)
 	rcu_seq_start(&rcu_state.gp_seq);
 	ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
 	trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
+	rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap);
 	raw_spin_unlock_irq_rcu_node(rnp);
 
 	/*
@@ -2069,6 +2142,7 @@ static noinline void rcu_gp_cleanup(void)
 	 * safe for us to drop the lock in order to mark the grace
 	 * period as completed in all of the rcu_node structures.
 	 */
+	rcu_poll_gp_seq_end(&rcu_state.gp_seq_polled_snap);
 	raw_spin_unlock_irq_rcu_node(rnp);
 
 	/*
@@ -3837,8 +3911,18 @@ void synchronize_rcu(void)
 			 lock_is_held(&rcu_lock_map) ||
 			 lock_is_held(&rcu_sched_lock_map),
 			 "Illegal synchronize_rcu() in RCU read-side critical section");
-	if (rcu_blocking_is_gp())
+	if (rcu_blocking_is_gp()) {
+		// Note well that this code runs with !PREEMPT && !SMP.
+		// In addition, all code that advances grace periods runs
+		// at process level.  Therefore, this GP overlaps with other
+		// GPs only by being fully nested within them, which allows
+		// reuse of ->gp_seq_polled_snap.
+		rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap);
+		rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap);
+		if (rcu_init_invoked())
+			cond_resched_tasks_rcu_qs();
 		return;  // Context allows vacuous grace periods.
+	}
 	if (rcu_gp_is_expedited())
 		synchronize_rcu_expedited();
 	else
@@ -3860,7 +3944,7 @@ unsigned long get_state_synchronize_rcu(void)
 	 * before the load from ->gp_seq.
 	 */
 	smp_mb();  /* ^^^ */
-	return rcu_seq_snap(&rcu_state.gp_seq);
+	return rcu_seq_snap(&rcu_state.gp_seq_polled);
 }
 EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
 
@@ -3889,7 +3973,13 @@ unsigned long start_poll_synchronize_rcu(void)
 	rdp = this_cpu_ptr(&rcu_data);
 	rnp = rdp->mynode;
 	raw_spin_lock_rcu_node(rnp); // irqs already disabled.
-	needwake = rcu_start_this_gp(rnp, rdp, gp_seq);
+	// Note it is possible for a grace period to have elapsed between
+	// the above call to get_state_synchronize_rcu() and the below call
+	// to rcu_seq_snap.  This is OK, the worst that happens is that we
+	// get a grace period that no one needed.  These accesses are ordered
+	// by smp_mb(), and we are accessing them in the opposite order
+	// from which they are updated at grace-period start, as required.
+	needwake = rcu_start_this_gp(rnp, rdp, rcu_seq_snap(&rcu_state.gp_seq));
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	if (needwake)
 		rcu_gp_kthread_wake();
@@ -3925,7 +4015,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
 bool poll_state_synchronize_rcu(unsigned long oldstate)
 {
 	if (oldstate == RCU_GET_STATE_COMPLETED ||
-	    rcu_seq_done_exact(&rcu_state.gp_seq, oldstate)) {
+	    rcu_seq_done_exact(&rcu_state.gp_seq_polled, oldstate)) {
 		smp_mb(); /* Ensure GP ends before subsequent accesses. */
 		return true;
 	}
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 2ccf5845957df..9c853033f159d 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -323,6 +323,8 @@ struct rcu_state {
 	short gp_state;				/* GP kthread sleep state. */
 	unsigned long gp_wake_time;		/* Last GP kthread wake. */
 	unsigned long gp_wake_seq;		/* ->gp_seq at ^^^. */
+	unsigned long gp_seq_polled;		/* GP seq for polled API. */
+	unsigned long gp_seq_polled_snap;	/* ->gp_seq_polled at normal GP start. */
 
 	/* End of fields guarded by root rcu_node's lock. */
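
With the combined fix, the cookie handed back by start_poll_synchronize_rcu() is consistently in ->gp_seq_polled units, matching what poll_state_synchronize_rcu() compares against. A hypothetical non-blocking user:

	static unsigned long foo_cookie;

	static void foo_begin_gp(void)
	{
		/* Kick off a grace period without blocking. */
		foo_cookie = start_poll_synchronize_rcu();
	}

	static bool foo_gp_done(void)
	{
		/* After this series, checked against ->gp_seq_polled. */
		return poll_state_synchronize_rcu(foo_cookie);
	}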

Patch

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 46cfceea87847..637e8f9454573 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1775,6 +1775,78 @@  static void rcu_strict_gp_boundary(void *unused)
 	invoke_rcu_core();
 }
 
+// Has rcu_init() been invoked?  This is used (for example) to determine
+// whether spinlocks may be acquired safely.
+static bool rcu_init_invoked(void)
+{
+	return !!rcu_state.n_online_cpus;
+}
+
+// Make the polled API aware of the beginning of a grace period.
+static void rcu_poll_gp_seq_start(unsigned long *snap)
+{
+	struct rcu_node *rnp = rcu_get_root();
+
+	if (rcu_init_invoked())
+		raw_lockdep_assert_held_rcu_node(rnp);
+
+	// If RCU was idle, note beginning of GP.
+	if (!rcu_seq_state(rcu_state.gp_seq_polled))
+		rcu_seq_start(&rcu_state.gp_seq_polled);
+
+	// Either way, record current state.
+	*snap = rcu_state.gp_seq_polled;
+}
+
+// Make the polled API aware of the end of a grace period.
+static void rcu_poll_gp_seq_end(unsigned long *snap)
+{
+	struct rcu_node *rnp = rcu_get_root();
+
+	if (rcu_init_invoked())
+		raw_lockdep_assert_held_rcu_node(rnp);
+
+	// If the previously noted GP is still in effect, record the
+	// end of that GP.  Either way, zero counter to avoid counter-wrap
+	// problems.
+	if (*snap && *snap == rcu_state.gp_seq_polled) {
+		rcu_seq_end(&rcu_state.gp_seq_polled);
+		rcu_state.gp_seq_polled_snap = 0;
+	} else {
+		*snap = 0;
+	}
+}
+
+// Make the polled API aware of the beginning of a grace period, but
+// where caller does not hold the root rcu_node structure's lock.
+static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap)
+{
+	struct rcu_node *rnp = rcu_get_root();
+
+	if (rcu_init_invoked()) {
+		lockdep_assert_irqs_enabled();
+		raw_spin_lock_irq_rcu_node(rnp);
+	}
+	rcu_poll_gp_seq_start(snap);
+	if (rcu_init_invoked())
+		raw_spin_unlock_irq_rcu_node(rnp);
+}
+
+// Make the polled API aware of the end of a grace period, but where
+// caller does not hold the root rcu_node structure's lock.
+static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap)
+{
+	struct rcu_node *rnp = rcu_get_root();
+
+	if (rcu_init_invoked()) {
+		lockdep_assert_irqs_enabled();
+		raw_spin_lock_irq_rcu_node(rnp);
+	}
+	rcu_poll_gp_seq_end(snap);
+	if (rcu_init_invoked())
+		raw_spin_unlock_irq_rcu_node(rnp);
+}
+
 /*
  * Initialize a new grace period.  Return false if no grace period required.
  */
@@ -1810,6 +1882,7 @@  static noinline_for_stack bool rcu_gp_init(void)
 	rcu_seq_start(&rcu_state.gp_seq);
 	ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
 	trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
+	rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap);
 	raw_spin_unlock_irq_rcu_node(rnp);
 
 	/*
@@ -2069,6 +2142,7 @@  static noinline void rcu_gp_cleanup(void)
 	 * safe for us to drop the lock in order to mark the grace
 	 * period as completed in all of the rcu_node structures.
 	 */
+	rcu_poll_gp_seq_end(&rcu_state.gp_seq_polled_snap);
 	raw_spin_unlock_irq_rcu_node(rnp);
 
 	/*
@@ -3837,8 +3911,18 @@  void synchronize_rcu(void)
 			 lock_is_held(&rcu_lock_map) ||
 			 lock_is_held(&rcu_sched_lock_map),
 			 "Illegal synchronize_rcu() in RCU read-side critical section");
-	if (rcu_blocking_is_gp())
+	if (rcu_blocking_is_gp()) {
+		// Note well that this code runs with !PREEMPT && !SMP.
+		// In addition, all code that advances grace periods runs
+		// at process level.  Therefore, this GP overlaps with other
+		// GPs only by being fully nested within them, which allows
+		// reuse of ->gp_seq_polled_snap.
+		rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap);
+		rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap);
+		if (rcu_init_invoked())
+			cond_resched_tasks_rcu_qs();
 		return;  // Context allows vacuous grace periods.
+	}
 	if (rcu_gp_is_expedited())
 		synchronize_rcu_expedited();
 	else
@@ -3860,7 +3944,7 @@  unsigned long get_state_synchronize_rcu(void)
 	 * before the load from ->gp_seq.
 	 */
 	smp_mb();  /* ^^^ */
-	return rcu_seq_snap(&rcu_state.gp_seq);
+	return rcu_seq_snap(&rcu_state.gp_seq_polled);
 }
 EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
 
@@ -3925,7 +4009,7 @@  EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
 bool poll_state_synchronize_rcu(unsigned long oldstate)
 {
 	if (oldstate == RCU_GET_STATE_COMPLETED ||
-	    rcu_seq_done_exact(&rcu_state.gp_seq, oldstate)) {
+	    rcu_seq_done_exact(&rcu_state.gp_seq_polled, oldstate)) {
 		smp_mb(); /* Ensure GP ends before subsequent accesses. */
 		return true;
 	}
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 2ccf5845957df..9c853033f159d 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -323,6 +323,8 @@  struct rcu_state {
 	short gp_state;				/* GP kthread sleep state. */
 	unsigned long gp_wake_time;		/* Last GP kthread wake. */
 	unsigned long gp_wake_seq;		/* ->gp_seq at ^^^. */
+	unsigned long gp_seq_polled;		/* GP seq for polled API. */
+	unsigned long gp_seq_polled_snap;	/* ->gp_seq_polled at normal GP start. */
 
 	/* End of fields guarded by root rcu_node's lock. */