
[v4,4/4] rcu: Add RCU stall diagnosis information

Message ID 20221104021224.102-5-thunder.leizhen@huawei.com (mailing list archive)
State Superseded
Series rcu: Add RCU stall diagnosis information

Commit Message

Leizhen (ThunderTown) Nov. 4, 2022, 2:12 a.m. UTC
In some extreme cases, such as an I/O pressure test, the CPU usage may
reach 100%, causing an RCU stall. In this case, the information printed
about the current task is not useful. Displaying the number and cputime
of hard interrupts, soft interrupts, and context switches that occurred
within half of the CPU stall timeout can help us make a general
judgment. In other cases, we can preliminarily determine whether an
infinite loop occurred while local_irq, local_bh or preempt was
disabled.

For example:
rcu: INFO: rcu_preempt self-detected stall on CPU
rcu:     0-....: (1250 ticks this GP) <omitted>
rcu:          hardirqs   softirqs   csw/system
rcu:  number:      624         45            0
rcu: cputime:       69          1         2425   ==> 2500(ms)

The example above shows that the number of hard and soft interrupts is
small, there are zero context switches, and the system has consumed a
lot of CPU time. We can quickly conclude that the current task is stuck
in an infinite loop with preempt_disable() in effect.

The impact on system performance is negligible because the snapshot is
recorded only once, after half of the CPU stall timeout has elapsed.
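
The mechanism is a simple snapshot-and-delta: the per-CPU counters are
recorded once, halfway through the stall timeout, and the report prints
the difference between the current values and that snapshot. A
simplified sketch of the snapshot side (the helper name here is made up;
in the patch the recording happens inline in rcu_implicit_dynticks_qs(),
using the accessors visible in the hunks quoted in the review below):

/* Simplified sketch, not the literal patch: record the per-CPU counters
 * once, at 1/2 of the stall timeout.  The stall report later prints
 * "current value - snapshot" for each of these fields. */
static void rcu_snap_record_take(struct rcu_data *rdp)	/* hypothetical name */
{
	struct rcu_snap_record *rsrp = &rdp->snap_record;
	u64 *cpustat = kcpustat_cpu(rdp->cpu).cpustat;

	rsrp->cputime_irq     = cpustat[CPUTIME_IRQ];
	rsrp->cputime_softirq = cpustat[CPUTIME_SOFTIRQ];
	rsrp->cputime_system  = cpustat[CPUTIME_SYSTEM];
	rsrp->nr_hardirqs     = kstat_cpu_irqs_sum(rdp->cpu);
	rsrp->nr_softirqs     = kstat_cpu_softirqs_sum(rdp->cpu);
	rsrp->nr_csw          = nr_context_switches_cpu(rdp->cpu);
	rsrp->gp_seq          = rdp->gp_seq;	/* marks the snapshot as taken */
}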

This enhanced debugging information is suppressed by default and can be
enabled by CONFIG_RCU_CPU_STALL_CPUTIME=y or
rcupdate.rcu_cpu_stall_cputime=1.

Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
---
 .../admin-guide/kernel-parameters.txt         |  5 +++
 kernel/rcu/Kconfig.debug                      | 10 ++++++
 kernel/rcu/rcu.h                              |  1 +
 kernel/rcu/tree.c                             | 16 ++++++++++
 kernel/rcu/tree.h                             | 17 ++++++++++
 kernel/rcu/tree_stall.h                       | 31 +++++++++++++++++++
 kernel/rcu/update.c                           |  2 ++
 7 files changed, 82 insertions(+)

Comments

Leizhen (ThunderTown) Nov. 4, 2022, 3:12 a.m. UTC | #1
On 2022/11/4 10:12, Zhen Lei wrote:
> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
> index d4a97e40ea9c3e2..44d49819a31f050 100644
> --- a/kernel/rcu/tree.h
> +++ b/kernel/rcu/tree.h
> @@ -158,6 +158,22 @@ union rcu_noqs {
>  	u16 s; /* Set of bits, aggregate OR here. */
>  };
>  
> +/*
> + * Record the snapshot of the core stats at 1/2 rcu stall timeout. The member
> + * gp_seq is used to ensure that all members are updated only once during the
> + * second half period. The snapshot is taken only if this gp_seq is not equal
> + * to rdp->gp_seq.
> + */
> +struct rcu_snap_record {
> +	unsigned long	gp_seq;		/* Track rdp->gp_seq counter */
> +	u64		cputime_irq;	/* Accumulated cputime of hard irqs */
> +	u64		cputime_softirq;/* Accumulated cputime of soft irqs */
> +	u64		cputime_system; /* Accumulated cputime of kernel tasks */
> +	unsigned long	nr_hardirqs;	/* Accumulated number of hard irqs */
> +	unsigned int	nr_softirqs;	/* Accumulated number of soft irqs */
> +	unsigned long long nr_csw;	/* Accumulated number of task switches */
> +};
> +
>  /* Per-CPU data for read-copy update. */
>  struct rcu_data {
>  	/* 1) quiescent-state and grace-period handling : */
> @@ -262,6 +278,7 @@ struct rcu_data {
>  	short rcu_onl_gp_flags;		/* ->gp_flags at last online. */
>  	unsigned long last_fqs_resched;	/* Time of last rcu_resched(). */
>  	unsigned long last_sched_clock;	/* Jiffies of last rcu_sched_clock_irq(). */
> +	struct rcu_snap_record snap_record; /* Snapshot of core stats at 1/2 rcu stall timeout */
>  
>  	int cpu;
>  };

Sorry, I forgot to update linux-rcu.git. There is a conflict here.

----------------
 	int cpu;
  };
------------should be--------
 	long lazy_len;			/* Length of buffered lazy callbacks. */
 	int cpu;
----------------
Paul E. McKenney Nov. 4, 2022, 1:43 p.m. UTC | #2
On Fri, Nov 04, 2022 at 11:12:02AM +0800, Leizhen (ThunderTown) wrote:
> 
> 
> On 2022/11/4 10:12, Zhen Lei wrote:
> > diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
> > index d4a97e40ea9c3e2..44d49819a31f050 100644
> > --- a/kernel/rcu/tree.h
> > +++ b/kernel/rcu/tree.h
> > @@ -158,6 +158,22 @@ union rcu_noqs {
> >  	u16 s; /* Set of bits, aggregate OR here. */
> >  };
> >  
> > +/*
> > + * Record the snapshot of the core stats at 1/2 rcu stall timeout. The member
> > + * gp_seq is used to ensure that all members are updated only once during the
> > + * second half period. The snapshot is taken only if this gp_seq is not equal
> > + * to rdp->gp_seq.
> > + */
> > +struct rcu_snap_record {
> > +	unsigned long	gp_seq;		/* Track rdp->gp_seq counter */
> > +	u64		cputime_irq;	/* Accumulated cputime of hard irqs */
> > +	u64		cputime_softirq;/* Accumulated cputime of soft irqs */
> > +	u64		cputime_system; /* Accumulated cputime of kernel tasks */
> > +	unsigned long	nr_hardirqs;	/* Accumulated number of hard irqs */
> > +	unsigned int	nr_softirqs;	/* Accumulated number of soft irqs */
> > +	unsigned long long nr_csw;	/* Accumulated number of task switches */
> > +};
> > +
> >  /* Per-CPU data for read-copy update. */
> >  struct rcu_data {
> >  	/* 1) quiescent-state and grace-period handling : */
> > @@ -262,6 +278,7 @@ struct rcu_data {
> >  	short rcu_onl_gp_flags;		/* ->gp_flags at last online. */
> >  	unsigned long last_fqs_resched;	/* Time of last rcu_resched(). */
> >  	unsigned long last_sched_clock;	/* Jiffies of last rcu_sched_clock_irq(). */
> > +	struct rcu_snap_record snap_record; /* Snapshot of core stats at 1/2 rcu stall timeout */
> >  
> >  	int cpu;
> >  };
> 
> Sorry, I forgot to update linux-rcu.git. There is a conflict here.
> 
> ----------------
>  	int cpu;
>   };
> ------------should be--------
>  	long lazy_len;			/* Length of buffered lazy callbacks. */
>  	int cpu;
> ----------------

Not a problem!  Looks good otherwise.

So please just rebase your series onto -rcu and repost.

							Thanx, Paul
Elliott, Robert (Servers) Nov. 5, 2022, 1:58 a.m. UTC | #3
> --- a/kernel/rcu/Kconfig.debug
> +++ b/kernel/rcu/Kconfig.debug
> @@ -95,6 +95,16 @@ config RCU_EXP_CPU_STALL_TIMEOUT
>  	  says to use the RCU_CPU_STALL_TIMEOUT value converted from
>  	  seconds to milliseconds.
> 
> +config RCU_CPU_STALL_CPUTIME

Since the information might change and grow over time, consider
calling it "ADDITIONAL_INFO" rather than just "CPUTIME".

> +	bool "Provide additional rcu stall debug information"
> +	depends on RCU_STALL_COMMON
> +	default n
> +	help
> +	  Statistics during the period from RCU_CPU_STALL_TIMEOUT/2 to
> +	  RCU_CPU_STALL_TIMEOUT, such as the number of (hard interrupts, soft
> +	  interrupts, task switches) and the cputime of (hard interrupts, soft
> +	  interrupts, kerenl tasks) are added to the rcu stall report.

s/kerenl/kernel/

Those parenthetical words are not grammatically correct, and
nobody will remember to update Kconfig if they change the code.

Try making it a little less precise: 
"Collect statistics during the RCU stall timeout period
(e.g., counts and CPU time consumed by hard interrupts, soft
interrupts, task switches, and kernel tasks) and include
them in the RCU stall console error message."

...
> +
> +	pr_err("         hardirqs   softirqs   csw/system\n");
> +	pr_err(" number: %8ld %10d %12lld\n",

Everything after a %d is a "number". These are more specifically counts.

> +		kstat_cpu_irqs_sum(cpu) - rsrp->nr_hardirqs,
> +		kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs,
> +		nr_context_switches_cpu(cpu) - rsrp->nr_csw);

Two columns are plural and one is not. 

Within a few lines, this is referred to as "context_switches", csw, 
and "task switches." One term would be better. vmstat uses "cs", 
pidstat uses "cswch", and /proc/<pid>/status uses "ctxt_switches,"
so there's not one ideal term for that column. Matching one
of those might be better than using another unique name.

Since no other variants are shown, the "/system" suffix is a bit
confusing. 


> +	pr_err("cputime: %8lld %10lld %12lld   ==> %lld(ms)\n",
> +		div_u64(cpustat[CPUTIME_IRQ] - rsrp->cputime_irq, NSEC_PER_MSEC),
> +		div_u64(cpustat[CPUTIME_SOFTIRQ] - rsrp->cputime_softirq, NSEC_PER_MSEC),
> +		div_u64(cpustat[CPUTIME_SYSTEM] - rsrp->cputime_system, NSEC_PER_MSEC),
> +		jiffies64_to_msecs(half_timeout));

There is no column header for that fourth number. "half_timeout"
might be good, dropping the "==>".

The "(ms)" at the end is awkward. Try moving it to the left
as "cputime (ms): "

> +int rcu_cpu_stall_cputime __read_mostly = IS_ENABLED(CONFIG_RCU_CPU_STALL_CPUTIME);

As a config option and module parameter, adding some more
instrumentation overhead might be worthwhile for other
likely causes of rcu stalls.

For example, if enabled, have these functions (if available
on the architecture) maintain a per-CPU running count of
their invocations, which also cause the CPU to be unavailable
for rcu: 
- kernel_fpu_begin() calls - FPU/SIMD context preservation,
  which also calls preempt_disable()
- preempt_disable() calls - scheduler context switches disabled
- local_irq_save() calls - interrupts disabled
- cond_resched() calls - lack of these is a problem

For kernel_fpu_begin and preempt_disable, knowing if it is
currently blocked for those reasons is probably the most
helpful.
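
A rough sketch of the kind of per-CPU counting suggested above (the
names and the hook point are hypothetical, nothing like this is in the
posted series, and as noted further down in the thread these are
fastpath functions, so real instrumentation would need careful gating):

/* Hypothetical per-CPU invocation counters -- illustration only. */
struct rcu_stall_extra_counts {
	unsigned long nr_kernel_fpu_begin;
	unsigned long nr_preempt_disable;
	unsigned long nr_local_irq_save;
	unsigned long nr_cond_resched;
};
static DEFINE_PER_CPU(struct rcu_stall_extra_counts, rcu_stall_extra);

/* Example hook, called from (or near) one of the instrumented fastpaths. */
static inline void rcu_stall_count_cond_resched(void)
{
	if (IS_ENABLED(CONFIG_RCU_CPU_STALL_CPUTIME))
		__this_cpu_inc(rcu_stall_extra.nr_cond_resched);
}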
Leizhen (ThunderTown) Nov. 5, 2022, 7:03 a.m. UTC | #4
On 2022/11/5 9:58, Elliott, Robert (Servers) wrote:
> 
>> --- a/kernel/rcu/Kconfig.debug
>> +++ b/kernel/rcu/Kconfig.debug
>> @@ -95,6 +95,16 @@ config RCU_EXP_CPU_STALL_TIMEOUT
>>  	  says to use the RCU_CPU_STALL_TIMEOUT value converted from
>>  	  seconds to milliseconds.
>>
>> +config RCU_CPU_STALL_CPUTIME
> 
> Since the information might change and grow over time, consider
> calling it "ADDITIONAL_INFO" rather than just "CPUTIME".

I think it's okay. But I need to wait for Paul's opinion.
In fact, I've also considered using MORE_DEBUG_INFO.

> 
>> +	bool "Provide additional rcu stall debug information"
>> +	depends on RCU_STALL_COMMON
>> +	default n
>> +	help
>> +	  Statistics during the period from RCU_CPU_STALL_TIMEOUT/2 to
>> +	  RCU_CPU_STALL_TIMEOUT, such as the number of (hard interrupts, soft
>> +	  interrupts, task switches) and the cputime of (hard interrupts, soft
>> +	  interrupts, kerenl tasks) are added to the rcu stall report.
> 
> s/kerenl/kernel/

Good catch.

> 
> Those parenthetical words are not grammatically correct, and
> nobody will remember to update Kconfig if they change the code.
> 
> Try making it a little less precise: 
> "Collect statistics during the RCU stall timeout period

the second half of the RCU stall timeout period

> (e.g., counts and CPU time consumed by hard interrupts, soft
> interrupts, task switches, and kernel tasks) and include

Maybe we can remove "task switches, and kernel tasks". It's just an
example, there's no need to be exhaustive.

> them in the RCU stall console error message."

Looks good above. Thanks.

> 
> ...
>> +
>> +	pr_err("         hardirqs   softirqs   csw/system\n");
>> +	pr_err(" number: %8ld %10d %12lld\n",
> 
> Everything after a %d is a "number". These are more specifically counts.
> 
>> +		kstat_cpu_irqs_sum(cpu) - rsrp->nr_hardirqs,
>> +		kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs,
>> +		nr_context_switches_cpu(cpu) - rsrp->nr_csw);
> 
> Two columns are plural and one is not. 

I didn't understand. Do you mean:
(1)
             hardirqs   softirqs  csw  system
     number:                      xx   N/A
cputime(ms):                      N/A   xx

> 
> Within a few lines, this is referred to as "context_switches", csw, 
> and "task switches." One term would be better. vmstat uses "cs", 
> pidstat uses "cswch", and /proc/<pid>/status uses "ctxt_switches,"
> so there's not one ideal term for that column. Matching one
> of those might be better than using another unique name.

struct task_struct {
        /* Context switch counts: */
        unsigned long                   nvcsw;
        unsigned long                   nivcsw;

There's nothing wrong with 'csw'.

> 
> Since no other variants are shown, the "/system" suffix is a bit
> confusing.

I don't think it's a problem, because the prefix cputime already exists,
so CPUTIME_SYSTEM --> system.

How about:
             hardirqs   softirqs  tasks
     counts:
cputime(ms):

It's like drawing a head with a pipe on the men's room sign and a head
with long hair on the women's room sign. Some women smoke and some men
have long hair, yet everyone still reads the signs correctly. With the
example right in front of us, who would interpret the first line as the
number of interrupt types? So I think it's pretty good to use 'tasks'.

> 
> 
>> +	pr_err("cputime: %8lld %10lld %12lld   ==> %lld(ms)\n",
>> +		div_u64(cpustat[CPUTIME_IRQ] - rsrp->cputime_irq, NSEC_PER_MSEC),
>> +		div_u64(cpustat[CPUTIME_SOFTIRQ] - rsrp->cputime_softirq, NSEC_PER_MSEC),
>> +		div_u64(cpustat[CPUTIME_SYSTEM] - rsrp->cputime_system, NSEC_PER_MSEC),
>> +		jiffies64_to_msecs(half_timeout));
> 
> There is no column header for that fourth number. "half_timeout"
> might be good, dropping the "==>".

If we use words, I'll use "sampling period" or "period".

To be honest, the information printed by RCU needs to be read in
combination with the code. For example:

pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%04x/%ld/%#lx softirq=%u/%u fqs=%ld%s%s\n",
               cpu,
               "O."[!!cpu_online(cpu)],
               "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
               "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],

Paul mentioned a few days ago that there were complaints that RCU was
printing too much. So "==>" is better suited to people who often deal
with rcu stall issues.

> 
> The "(ms)" at the end is awkward. Try moving it to the left
> as "cputime (ms): "

OK

> 
>> +int rcu_cpu_stall_cputime __read_mostly = IS_ENABLED(CONFIG_RCU_CPU_STALL_CPUTIME);
> 
> As a config option and module parameter, adding some more
> instrumentation overhead might be worthwhile for other
> likely causes of rcu stalls.
> 
> For example, if enabled, have these functions (if available
> on the architecture) maintain a per-CPU running count of
> their invocations, which also cause the CPU to be unavailable
> for rcu: 
> - kernel_fpu_begin() calls - FPU/SIMD context preservation,
>   which also calls preempt_disable()
> - preempt_disable() calls - scheduler context switches disabled
> - local_irq_save() calls - interrupts disabled
> - cond_resched() calls - lack of these is a problem
> 
> For kernel_fpu_begin and preempt_disable, knowing if it is
> currently blocked for those reasons is probably the most
> helpful.

These instructions are already in Documentation/RCU/stallwarn.rst

> 
Paul E. McKenney Nov. 5, 2022, 8:06 p.m. UTC | #5
On Sat, Nov 05, 2022 at 01:58:28AM +0000, Elliott, Robert (Servers) wrote:
> 
> > --- a/kernel/rcu/Kconfig.debug
> > +++ b/kernel/rcu/Kconfig.debug
> > @@ -95,6 +95,16 @@ config RCU_EXP_CPU_STALL_TIMEOUT
> >  	  says to use the RCU_CPU_STALL_TIMEOUT value converted from
> >  	  seconds to milliseconds.
> > 
> > +config RCU_CPU_STALL_CPUTIME
> 
> Since the information might change and grow over time, consider
> calling it "ADDITIONAL_INFO" rather than just "CPUTIME".

Except that I already redirected Zhen Lei from a generic description
to a more specific one.  The reason for this is that I already get
complaints about the large volume of output from RCU CPU stall warnings,
which suggests that unless the additional information is quite small,
it should get its own Kconfig option and kernel boot parameter.

So we will be keeping RCU_CPU_STALL_CPUTIME.

And yes, I understand that high-end enterprise deployments might have
absolutely no problem with the additional volume, but Linux handles a
range of hardware and workloads that I would have thought impossible
only 20 years ago.  And RCU needs to accommodate the full range.

> > +	bool "Provide additional rcu stall debug information"
> > +	depends on RCU_STALL_COMMON
> > +	default n
> > +	help
> > +	  Statistics during the period from RCU_CPU_STALL_TIMEOUT/2 to
> > +	  RCU_CPU_STALL_TIMEOUT, such as the number of (hard interrupts, soft
> > +	  interrupts, task switches) and the cputime of (hard interrupts, soft
> > +	  interrupts, kerenl tasks) are added to the rcu stall report.
> 
> s/kerenl/kernel/
> 
> Those parenthetical words are not grammatically correct, and
> nobody will remember to update Kconfig if they change the code.
> 
> Try making it a little less precise: 
> "Collect statistics during the RCU stall timeout period
> (e.g., counts and CPU time consumed by hard interrupts, soft
> interrupts, task switches, and kernel tasks) and include
> them in the RCU stall console error message."

My usual practice when taking in patches from someone whose native
language differs greatly from English is to do a round of wordsmithing.
I run the wordsmithing past the author, which helps catch any errors
I might make translating the rest of the way to English and also
(more frequently than not) helps the author's English skills.

I have taken care of all but part of Zhen Lei's last patch, and will
pull that in later today.

> ...
> > +
> > +	pr_err("         hardirqs   softirqs   csw/system\n");
> > +	pr_err(" number: %8ld %10d %12lld\n",
> 
> Everything after a %d is a "number". These are more specifically counts.

The distinction between "number" and "count" probably isn't going to
travel well, especially given that the next line could be argued to be
either a number or a count of milliseconds.

To see this, please use Google Translate to convert "George is just along
for the ride" to Chinese (simplified) and back and see what you get.  ;-)

OK, here is what I am going to do.  I am going to translate the last
patch's commit log, but leave the patch itself as Zhen Lei sent it.
Please feel free to send a follow-on patch that does the wordsmithing,
but please check your work with some non-native English speakers.
I am guessing that Zhen Lei would be happy to give you feedback from
both a technical viewpoint and a Chinese-speaker viewpoint.

Please understand that this offer is not in any way sarcastic or
insincere.  This sort of translation does take time and energy, and so
I would have to be quite stupid not to accept good help.

> > +		kstat_cpu_irqs_sum(cpu) - rsrp->nr_hardirqs,
> > +		kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs,
> > +		nr_context_switches_cpu(cpu) - rsrp->nr_csw);
> 
> Two columns are plural and one is not. 
> 
> Within a few lines, this is referred to as "context_switches", csw, 
> and "task switches." One term would be better. vmstat uses "cs", 
> pidstat uses "cswch", and /proc/<pid>/status uses "ctxt_switches,"
> so there's not one ideal term for that column. Matching one
> of those might be better than using another unique name.
> 
> Since no other variants are shown, the "/system" suffix is a bit
> confusing. 

These are good points, including the lack of standard current
nomenclature.  Again, would you be interested in providing a translation
patch?

> > +	pr_err("cputime: %8lld %10lld %12lld   ==> %lld(ms)\n",
> > +		div_u64(cpustat[CPUTIME_IRQ] - rsrp->cputime_irq, NSEC_PER_MSEC),
> > +		div_u64(cpustat[CPUTIME_SOFTIRQ] - rsrp->cputime_softirq, NSEC_PER_MSEC),
> > +		div_u64(cpustat[CPUTIME_SYSTEM] - rsrp->cputime_system, NSEC_PER_MSEC),
> > +		jiffies64_to_msecs(half_timeout));
> 
> There is no column header for that fourth number. "half_timeout"
> might be good, dropping the "==>".

Another approach is to print it as a fraction, in this case, 2425/2500.
Of course, adding the "/2500" to the other two columns would be annoying.
Yet another approach is to leave that last column, but prepend a "/",
as in "/2500(ms)".

> The "(ms)" at the end is awkward. Try moving it to the left
> as "cputime (ms): "

Good point, and that might also better disambiguate "number" than
any other choice.
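
For concreteness, the "/2500" style (with "(ms)" moved into the row
label, as suggested above) could look roughly like this; a sketch
against the format string quoted earlier, not a posted change:

	pr_err("cputime (ms): %8lld %10lld %12lld /%lld\n",
		div_u64(cpustat[CPUTIME_IRQ] - rsrp->cputime_irq, NSEC_PER_MSEC),
		div_u64(cpustat[CPUTIME_SOFTIRQ] - rsrp->cputime_softirq, NSEC_PER_MSEC),
		div_u64(cpustat[CPUTIME_SYSTEM] - rsrp->cputime_system, NSEC_PER_MSEC),
		jiffies64_to_msecs(half_timeout));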

> > +int rcu_cpu_stall_cputime __read_mostly = IS_ENABLED(CONFIG_RCU_CPU_STALL_CPUTIME);
> 
> As a config option and module parameter, adding some more
> instrumentation overhead might be worthwhile for other
> likely causes of rcu stalls.
> 
> For example, if enabled, have these functions (if available
> on the architecture) maintain a per-CPU running count of
> their invocations, which also cause the CPU to be unavailable
> for rcu: 
> - kernel_fpu_begin() calls - FPU/SIMD context preservation,
>   which also calls preempt_disable()
> - preempt_disable() calls - scheduler context switches disabled
> - local_irq_save() calls - interrupts disabled
> - cond_resched() calls - lack of these is a problem
> 
> For kernel_fpu_begin and preempt_disable, knowing if it is
> currently blocked for those reasons is probably the most
> helpful.

These could be additional patches.

But please tie any such patch to an actual use case.  After all, if no
one actually uses that additional information, we will have irritated untold
numbers of electrons for no purpose.

Also, some of those functions are on fastpaths, so adding unconditional
instrumentation overhead might result in an objection or three.

As always, choose wisely!  ;-)

If I don't see anything from you by this coming Friday, I will fold my
usual wordsmithing into the patch.

							Thanx, Paul
Paul E. McKenney Nov. 5, 2022, 8:32 p.m. UTC | #6
On Sat, Nov 05, 2022 at 03:03:14PM +0800, Leizhen (ThunderTown) wrote:
> On 2022/11/5 9:58, Elliott, Robert (Servers) wrote:

[ . . . ]

> >> +int rcu_cpu_stall_cputime __read_mostly = IS_ENABLED(CONFIG_RCU_CPU_STALL_CPUTIME);
> > 
> > As a config option and module parameter, adding some more
> > instrumentation overhead might be worthwhile for other
> > likely causes of rcu stalls.
> > 
> > For example, if enabled, have these functions (if available
> > on the architecture) maintain a per-CPU running count of
> > their invocations, which also cause the CPU to be unavailable
> > for rcu: 
> > - kernel_fpu_begin() calls - FPU/SIMD context preservation,
> >   which also calls preempt_disable()
> > - preempt_disable() calls - scheduler context switches disabled
> > - local_irq_save() calls - interrupts disabled
> > - cond_resched() calls - lack of these is a problem
> > 
> > For kernel_fpu_begin and preempt_disable, knowing if it is
> > currently blocked for those reasons is probably the most
> > helpful.
> 
> These instructions are already in Documentation/RCU/stallwarn.rst

Excellent point -- this document also needs to be updated with this
new information.  I have pulled in your four patches as noted in my
previous email.  They are on the -rcu tree's "dev" branch.

Could you please send a patch containing an initial update to
stallwarn.rst?  The main thing I need is your perspective on how each
field is used.

							Thanx, Paul
Elliott, Robert (Servers) Nov. 7, 2022, 2:10 a.m. UTC | #7
> > > +config RCU_CPU_STALL_CPUTIME
> >
> > Since the information might change and grow over time, consider
> > calling it "ADDITIONAL_INFO" rather than just "CPUTIME".
> 
> Except that I already redirected Zhen Lei from a generic description
> to a more specific one.  The reason for this is that I already get
> complaints about the large volume of output from RCU CPU stall warnings,
> which suggests that unless the additional information is quite small,
> it should get its own Kconfig option and kernel boot parameter.
> 
> So we will be keeping RCU_CPU_STALL_CPUTIME.

No problem - CPUTIME is easier to search for, and will likely
always remain as one of the values reported.

> But please tie any such patch to an actual use case.  After all, if no
> one actually uses that additional information, we will have irritated untold
> numbers of electrons for no purpose.

I've been working on fixing intermittent RCU stalls that turned out to be
caused by x86-optimized crypto modules holding kernel_fpu_begin()/end()
too long and crypto tests not calling cond_resched() often enough, so am
interested in features that will help identify and prevent such
problems.

> Also, some of those functions are on fastpaths, so adding unconditional
> instrumentation overhead might result in an objection or three.
> 
> As always, choose wisely!  ;-)
> 
> If I don't see anything from you by this coming Friday, I will fold my
> usual wordsmithing into the patch.

I applied the series to a tree not including any crypto module
fixes, but an overnight run didn't trigger any RCU stalls. So, I
modified the tcrypt test module to simulate a problem by
running a lengthy loop after kernel_fpu_begin().  
Here's an example of how the output looks:


[ 1816.698380] rcu: INFO: rcu_preempt self-detected stall on CPU
[ 1816.704368] rcu:     0-....: (2999 ticks this GP) idle=1dbc/1/0x4000000000000000 softirq=39683/39683 fqs=751
[ 1816.714200] rcu:          hardirqs   softirqs   csw/system
[ 1816.719922] rcu:  number:        5         10            0
[ 1816.725643] rcu: cputime:        3          0         1492   ==> 1500(ms)
[ 1816.732669]  (t=3030 jiffies g=89765 q=308 ncpus=56)
[ 1816.737857] CPU: 0 PID: 46826 Comm: modprobe Tainted: G        W          6.0.0+ #5
[ 1816.745754] Hardware name: HPE ProLiant DL360 Gen10/ProLiant DL360 Gen10, BIOS U32 03/08/2022
[ 1816.754523] RIP: 0010:rude_sleep_cycles.constprop.0+0x1c/0x30 [tcrypt]
[ 1816.761290] Code: 5d 41 5c 41 5d 41 5e 41 5f c3 cc cc cc cc 0f 1f 44 00 00 0f 31 48 c1 e2 20 be ab 90 41 00 48 89 d1 48 c1 e6 0b 48 09 c1 0f 31 <48> c1 e2 20 48 09 c2 48 29 ca 48 39 d6 73 ef c3 cc cc cc cc 48 8b
...

That makes me realize what "csw/system" means:
- "csw" (context switches) applies to the number line
- "system" applies to the cputime line
 
Maybe this shouldn't be a table? Make it grep-friendly:
[ 1816.719922] rcu: half-timeout counts: hardirq =5 softirq=10 csw=0
[ 1816.725643] rcu: half_timeout cputimes (ms): time=1500 hardirq=3 softirq=0 system=1492
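
In code, the grep-friendly version sketched above might look something
like this (illustrative only, reusing the variables already computed in
the patch's print_cpu_stat_info()):

	pr_err("half-timeout counts: hardirq=%ld softirq=%d csw=%lld\n",
		kstat_cpu_irqs_sum(cpu) - rsrp->nr_hardirqs,
		kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs,
		nr_context_switches_cpu(cpu) - rsrp->nr_csw);
	pr_err("half-timeout cputime (ms): window=%lld hardirq=%lld softirq=%lld system=%lld\n",
		jiffies64_to_msecs(half_timeout),
		div_u64(cpustat[CPUTIME_IRQ] - rsrp->cputime_irq, NSEC_PER_MSEC),
		div_u64(cpustat[CPUTIME_SOFTIRQ] - rsrp->cputime_softirq, NSEC_PER_MSEC),
		div_u64(cpustat[CPUTIME_SYSTEM] - rsrp->cputime_system, NSEC_PER_MSEC));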

The new prints interfere with existing prints, pushing the "t=3030" line 
further away from its context. That's from a pr_cont. Existing prints after
the "self-detect stall" message all start with \t, so they are roughly
related to the first line. The new prints should probably do the same.

Since the first line ends with \n, the pr_cont will never make it on the
same line, so it might be cleaner to use pr_err instead. That way it'll have
the "rcu:" prefix of the other lines.

That's from:
        pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name);
        raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags);
        print_cpu_stall_info(smp_processor_id());
[that's where this patch adds more prints]
        raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
        for_each_possible_cpu(cpu)
                totqlen += rcu_get_n_cbs_cpu(cpu);
        pr_cont("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n",
                jiffies - gps,
                (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);


One other print has similar construction:
        pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name);
        ...
                                        print_cpu_stall_info(cpu);
        ...
        pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n",
               smp_processor_id(), (long)(jiffies - gps),
               (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
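
For illustration, the pr_err-instead-of-pr_cont suggestion above would
amount to something like this (a sketch, not a posted change), so the
trailer always carries the "rcu:" prefix instead of riding on a
pr_cont():

	pr_err("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n",
		jiffies - gps,
		(long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);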
Leizhen (ThunderTown) Nov. 7, 2022, 3:20 a.m. UTC | #8
On 2022/11/6 4:32, Paul E. McKenney wrote:
> On Sat, Nov 05, 2022 at 03:03:14PM +0800, Leizhen (ThunderTown) wrote:
>> On 2022/11/5 9:58, Elliott, Robert (Servers) wrote:
> 
> [ . . . ]
> 
>>>> +int rcu_cpu_stall_cputime __read_mostly = IS_ENABLED(CONFIG_RCU_CPU_STALL_CPUTIME);
>>>
>>> As a config option and module parameter, adding some more
>>> instrumentation overhead might be worthwhile for other
>>> likely causes of rcu stalls.
>>>
>>> For example, if enabled, have these functions (if available
>>> on the architecture) maintain a per-CPU running count of
>>> their invocations, which also cause the CPU to be unavailable
>>> for rcu: 
>>> - kernel_fpu_begin() calls - FPU/SIMD context preservation,
>>>   which also calls preempt_disable()
>>> - preempt_disable() calls - scheduler context switches disabled
>>> - local_irq_save() calls - interrupts disabled
>>> - cond_resched() calls - lack of these is a problem
>>>
>>> For kernel_fpu_begin and preempt_disable, knowing if it is
>>> currently blocked for those reasons is probably the most
>>> helpful.
>>
>> These instructions are already in Documentation/RCU/stallwarn.rst
> 
> Excellent point -- this document also needs to be updated with this
> new information.  I have pulled in your four patches as noted in my
> previous email.  They are on the -rcu tree's "dev" branch.

OK, thanks.

> 
> Could you please send a patch containing an initial update to
> stallwarn.rst?  The main thing I need is your perspective on how each
> field is used.

Okay, I'll add some descriptions to illustrate how to use this function
to identify the various RCU stall cases.

> 
> 							Thanx, Paul
Leizhen (ThunderTown) Nov. 7, 2022, 11:49 a.m. UTC | #9
On 2022/11/7 10:10, Elliott, Robert (Servers) wrote:
> 
>>>> +config RCU_CPU_STALL_CPUTIME
>>>
>>> Since the information might change and grow over time, consider
>>> calling it "ADDITIONAL_INFO" rather than just "CPUTIME".
>>
>> Except that I already redirected Zhen Lei from a generic description
>> to a more specific one.  The reason for this is that I already get
>> complaints about the large volume of output from RCU CPU stall warnings,
>> which suggests that unless the additional information is quite small,
>> it should get its own Kconfig option and kernel boot parameter.
>>
>> So we will be keeping RCU_CPU_STALL_CPUTIME.
> 
> No problem - CPUTIME is easier to search for, and will likely
> always remain as one of the values reported.
> 
>> But please tie any such patch to an actual use case.  After all, if no
>> one actually uses that additional information, we will have irritated untold
>> numbers of electrons for no purpose.
> 
> I've been working on fixing intermittent RCU stalls that turned out to be
> caused by x86-optimized crypto modules holding kernel_fpu_begin()/end()
> too long and crypto tests not calling cond_resched() often enough, so am
> interested in features that will help identify and prevent such
> problems.
> 
>> Also, some of those functions are on fastpaths, so adding unconditional
>> instrumentation overhead might result in an objection or three.
>>
>> As always, choose wisely!  ;-)
>>
>> If I don't see anything from you by this coming Friday, I will fold my
>> usual wordsmithing into the patch.
> 
> I applied the series to a tree not including any crypto module
> fixes, but an overnight run didn't trigger any RCU stalls. So, I
> modified the tcrypt test module to simulate a problem by
> running a lengthy loop after kernel_fpu_begin().  
> Here's an example of how the output looks:
> 
> 
> [ 1816.698380] rcu: INFO: rcu_preempt self-detected stall on CPU
> [ 1816.704368] rcu:     0-....: (2999 ticks this GP) idle=1dbc/1/0x4000000000000000 softirq=39683/39683 fqs=751
> [ 1816.714200] rcu:          hardirqs   softirqs   csw/system
> [ 1816.719922] rcu:  number:        5         10            0
> [ 1816.725643] rcu: cputime:        3          0         1492   ==> 1500(ms)
> [ 1816.732669]  (t=3030 jiffies g=89765 q=308 ncpus=56)
> [ 1816.737857] CPU: 0 PID: 46826 Comm: modprobe Tainted: G        W          6.0.0+ #5
> [ 1816.745754] Hardware name: HPE ProLiant DL360 Gen10/ProLiant DL360 Gen10, BIOS U32 03/08/2022
> [ 1816.754523] RIP: 0010:rude_sleep_cycles.constprop.0+0x1c/0x30 [tcrypt]
> [ 1816.761290] Code: 5d 41 5c 41 5d 41 5e 41 5f c3 cc cc cc cc 0f 1f 44 00 00 0f 31 48 c1 e2 20 be ab 90 41 00 48 89 d1 48 c1 e6 0b 48 09 c1 0f 31 <48> c1 e2 20 48 09 c2 48 29 ca 48 39 d6 73 ef c3 cc cc cc cc 48 8b
> ...
> 
> That makes me realize what "csw/system" means:
> - "csw" (context switches) applies to the number line
> - "system" applies to the cputime line
>  
> Maybe this shouldn't be a table? Make it grep-friendly:
> [ 1816.719922] rcu: half-timeout counts: hardirq =5 softirq=10 csw=0
> [ 1816.725643] rcu: half_timeout cputimes (ms): time=1500 hardirq=3 softirq=0 system=1492

I prefer the table. Tables look clearer and are easier to compare.

> 
> The new prints interfere with existing prints, pushing the "t=3030" line 
> further away from its context. That's from a pr_cont. Existing prints after
> the "self-detect stall" message all start with \t, so they are roughly
> related to the first line. The new prints should probably do the same.
> 
> Since the first line ends with \n, the pr_cont will never make it on the
> same line, so it might be cleaner to use pr_err instead. That way it'll have
> the "rcu:" prefix of the other lines.
> 
> That's from:
>         pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name);
>         raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags);
>         print_cpu_stall_info(smp_processor_id());
> [that's where this patch adds more prints]
>         raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
>         for_each_possible_cpu(cpu)
>                 totqlen += rcu_get_n_cbs_cpu(cpu);
>         pr_cont("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n",
>                 jiffies - gps,
>                 (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
> 
> 
> One other print has similar construction:
>         pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name);
>         ...
>                                         print_cpu_stall_info(cpu);
>         ...
>         pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n",
>                smp_processor_id(), (long)(jiffies - gps),
>                (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
> 
> 
> 
Elliott, Robert (Servers) Nov. 7, 2022, 8:38 p.m. UTC | #10
> > Maybe this shouldn't be a table? Make it grep-friendly:
> > [ 1816.719922] rcu: half-timeout counts: hardirq =5 softirq=10 csw=0
> > [ 1816.725643] rcu: half_timeout cputimes (ms): time=1500 hardirq=3 softirq=0 system=1492
> 
>> I prefer the table. Tables look clearer and are easier to compare.

Here's an example where it also triggers expedited stalls. The 
self-detected stall prints including the new table can end up
interleaved with the expedited stall prints.

(the table is expanded a bit here to experiment with adding more info, 
and I included the \t prefixes)

[  933.728032] tcrypt: rcu testing - kernel_fpu_disable for rude 6 s
[  938.038278] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 46-.... } 4 jiffies s: 521 root: 0x8/.
[  938.049151] rcu: blocking rcu_node structures (internal RCU debug): l=1:42-55:0x10/.
[  938.057236] Sending NMI from CPU 35 to CPUs 46:
[  938.062048] NMI backtrace for cpu 46
[  938.062050] CPU: 46 PID: 44184 Comm: modprobe Not tainted 6.0.0+ #11
[  938.062052] Hardware name: HPE ProLiant DL360 Gen10/ProLiant DL360 Gen10, BIOS U32 03/08/2022
[  938.062053] RIP: 0010:rude_sleep_cycles+0x13/0x27 [tcrypt]
...
[  938.062151]  </TASK>
[  938.073277] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 46-.... } 289 jiffies s: 521 root: 0x8/.
[  938.334292] rcu: blocking rcu_node structures (internal RCU debug): l=1:42-55:0x10/.
[  938.342289] Sending NMI from CPU 35 to CPUs 46:
[  938.347049] NMI backtrace for cpu 46
[  938.347050] CPU: 46 PID: 44184 Comm: modprobe Not tainted 6.0.0+ #11
[  938.347051] Hardware name: HPE ProLiant DL360 Gen10/ProLiant DL360 Gen10, BIOS U32 03/08/2022
[  938.347052] RIP: 0010:rude_sleep_cycles+0x13/0x27 [tcrypt]
...
[  938.347121]  </TASK>
[  938.358275] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 46-.... } 573 jiffies s: 521 root: 0x8/.
[  938.619182] rcu: blocking rcu_node structures (internal RCU debug): l=1:42-55:0x10/.
[  938.627179] Sending NMI from CPU 35 to CPUs 46:
[  938.631939] NMI backtrace for cpu 46
[  938.631939] CPU: 46 PID: 44184 Comm: modprobe Not tainted 6.0.0+ #11
[  938.631941] Hardware name: HPE ProLiant DL360 Gen10/ProLiant DL360 Gen10, BIOS U32 03/08/2022
[  938.631942] RIP: 0010:rude_sleep_cycles+0x13/0x27 [tcrypt]
...
[  938.631997]  </TASK>
[  938.643272] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: {
[  938.734267] rcu: INFO: rcu_preempt self-detected stall on CPU
[  938.740111]  46-....
[  938.747282] rcu:     46-....: (4948 ticks this GP) idle=a9cc/1/0x4000000000000000 softirq=7598/7598 fqs=1253
[  938.747285] rcu:               hardirqs   softirqs          csw     system cond_resched
[  938.747286] rcu:      number:        0          0            0                      0
[  938.754461]  } 720 jiffies s: 521 root: 0x8/.
[  938.757178] rcu:     cputime:       62          0                     2436              ==> 2500 (ms)
[  938.757179] rcu:     current: in_kernel_fpu_begin=1 this_cpu_preemptible=0
[  938.757181]  (t=5023 jiffies g=61053 q=704 ncpus=56)
[  938.759640] rcu: blocking rcu_node structures (internal RCU debug):

The first "46-" line is from the expedited stall.
The second "46-" line is from the self-detected stall.
The table lines are from the self-detected stall.
The "} 720" line is from the expedited stall.
The "(t=5023" line is from the self-detected stall.
The "blocking" line is from the expedited stall.

[  938.761745] CPU: 46 PID: 44184 Comm: modprobe Not tainted 6.0.0+ #11
[  938.761747] Hardware name: HPE ProLiant DL360 Gen10/ProLiant DL360 Gen10, BIOS U32 03/08/2022
[  938.982118] RIP: 0010:rude_sleep_cycles+0x13/0x27 [tcrypt]
...
[  939.221833]  </TASK>
[  939.224239]  l=1:42-55:0x10/.
[  939.227440] Sending NMI from CPU 35 to CPUs 46:
[  939.232204] NMI backtrace for cpu 46
[  939.232205] CPU: 46 PID: 44184 Comm: modprobe Not tainted 6.0.0+ #11
[  939.232206] Hardware name: HPE ProLiant DL360 Gen10/ProLiant DL360 Gen10, BIOS U32 03/08/2022
[  939.232207] RIP: 0010:rude_sleep_cycles+0x13/0x27 [tcrypt]
...
[  939.232262]  </TASK>
[  939.243264] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 46-.... } 1458 jiffies s: 521 root: 0x8/.
[  939.503926] rcu: blocking rcu_node structures (internal RCU debug): l=1:42-55:0x10/.
[  939.511920] Sending NMI from CPU 35 to CPUs 46:
[  939.516678] NMI backtrace for cpu 46
[  939.516679] CPU: 46 PID: 44184 Comm: modprobe Not tainted 6.0.0+ #11
[  939.516680] Hardware name: HPE ProLiant DL360 Gen10/ProLiant DL360 Gen10, BIOS U32 03/08/2022
[  939.516680] RIP: 0010:rude_sleep_cycles+0x13/0x27 [tcrypt]
...
[  939.516732]  </TASK>
[  939.528263] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: {
[  939.748586] initcall tcrypt_mod_init+0x0/0x1000 [tcrypt] returned -11 after 6020623 usecs
[  939.753200]  46-.... } 1718 jiffies s: 521 root: 0x8/.
[  939.791580] rcu: blocking rcu_node structures (internal RCU debug):
Elliott, Robert (Servers) Nov. 7, 2022, 9:57 p.m. UTC | #11
I created a 22 second stall, which triggered two self-detected stall
messages. The second one covers 18 seconds (and reports 17444 ms
of system cputime), but still reports the half_timeout of 2.5 s on
the right. The duration since the snapshot was taken would be
more meaningful.

[ 3428.422726] tcrypt: rcu testing - preempt_disable for rude 22 s
[ 3433.419012] rcu: INFO: rcu_preempt self-detected stall on CPU
[ 3433.425145] rcu:     52-....: (4993 ticks this GP) idle=7704/1/0x4000000000000000 softirq=8448/8448 fqs=1247
[ 3433.435073] rcu:               hardirqs   softirqs          csw     system cond_resched
[ 3433.443096] rcu:      number:        0          5            0                      0
[ 3433.450930] rcu:     cputime:        8          0                     2489              ==> 2500 (ms)
[ 3433.460151] rcu:     current: in_kernel_fpu_begin=0 this_cpu_preemptible=0
[ 3433.467006]  (t=5044 jiffies g=127261 q=179 ncpus=56)
[ 3433.472285] CPU: 52 PID: 44429 Comm: modprobe Not tainted 6.0.0+ #11
[ 3433.478879] Hardware name: HPE ProLiant DL360 Gen10/ProLiant DL360 Gen10, BIOS U32 03/08/2022
[ 3433.487664] RIP: 0010:rude_sleep_cycles+0x13/0x27 [tcrypt]
...
[ 3433.717818]  </TASK>
[ 3448.719827] rcu: INFO: rcu_preempt self-detected stall on CPU
[ 3448.725816] rcu:     52-....: (19994 ticks this GP) idle=7704/1/0x4000000000000000 softirq=8448/8448 fqs=5002
[ 3448.735736] rcu:               hardirqs   softirqs          csw     system cond_resched
[ 3448.743735] rcu:      number:        0         38            0                      0
[ 3448.751560] rcu:     cputime:      354          0                    17444              ==> 2500 (ms)
[ 3448.760780] rcu:     current: in_kernel_fpu_begin=0 this_cpu_preemptible=0
[ 3448.767643]  (t=20348 jiffies g=127261 q=1019 ncpus=56)
[ 3448.773106] CPU: 52 PID: 44429 Comm: modprobe Not tainted 6.0.0+ #11
[ 3448.779704] Hardware name: HPE ProLiant DL360 Gen10/ProLiant DL360 Gen10, BIOS U32 03/08/2022
[ 3448.788488] RIP: 0010:rude_sleep_cycles+0x13/0x27 [tcrypt]
...
Leizhen (ThunderTown) Nov. 8, 2022, 3:06 a.m. UTC | #12
On 2022/11/8 5:57, Elliott, Robert (Servers) wrote:
> I created a 22 second stall, which triggered two self-detected stall
> messages. The second one covers 18 seconds (and reports 17444 ms
> of system cputime), but still reports the half_timeout of 2.5 s on

Because I have not updated rsrp->gp_seq in print_cpu_stat_info().

Please add rsrp->gp_seq-- at the bottom of print_cpu_stat_info()
and try again.

diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 2e560a70d88fd87..6f6c95d599e6436 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -455,6 +455,7 @@ static void print_cpu_stat_info(int cpu)
                div_u64(cpustat[CPUTIME_SOFTIRQ] - rsrp->cputime_softirq, NSEC_PER_MSEC),
                div_u64(cpustat[CPUTIME_SYSTEM] - rsrp->cputime_system, NSEC_PER_MSEC),
                jiffies64_to_msecs(half_timeout));
+       rsrp->gp_seq--;
 }

 /*
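
The reason the decrement works (simplified from the snapshot guard in
rcu_implicit_dynticks_qs(), part of which is quoted further down in this
thread): the snapshot is refreshed only when the saved gp_seq no longer
matches rdp->gp_seq, so forcing a mismatch makes the next scan take a
fresh snapshot.

	/* Simplified: a gp_seq mismatch triggers a new snapshot, so the
	 * rsrp->gp_seq-- above forces one after each printed report. */
	if (rcu_cpu_stall_cputime && rsrp->gp_seq != rdp->gp_seq) {
		/* ... re-record the cputime/hardirq/softirq/csw counters ... */
		rsrp->gp_seq = rdp->gp_seq;
	}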

> the right. The duration since the snapshot was taken would be
> more meaningful.
> 
> [ 3428.422726] tcrypt: rcu testing - preempt_disable for rude 22 s
> [ 3433.419012] rcu: INFO: rcu_preempt self-detected stall on CPU
> [ 3433.425145] rcu:     52-....: (4993 ticks this GP) idle=7704/1/0x4000000000000000 softirq=8448/8448 fqs=1247
> [ 3433.435073] rcu:               hardirqs   softirqs          csw     system cond_resched
> [ 3433.443096] rcu:      number:        0          5            0                      0
> [ 3433.450930] rcu:     cputime:        8          0                     2489              ==> 2500 (ms)
> [ 3433.460151] rcu:     current: in_kernel_fpu_begin=0 this_cpu_preemptible=0
> [ 3433.467006]  (t=5044 jiffies g=127261 q=179 ncpus=56)
> [ 3433.472285] CPU: 52 PID: 44429 Comm: modprobe Not tainted 6.0.0+ #11
> [ 3433.478879] Hardware name: HPE ProLiant DL360 Gen10/ProLiant DL360 Gen10, BIOS U32 03/08/2022
> [ 3433.487664] RIP: 0010:rude_sleep_cycles+0x13/0x27 [tcrypt]
> ...
> [ 3433.717818]  </TASK>
> [ 3448.719827] rcu: INFO: rcu_preempt self-detected stall on CPU
> [ 3448.725816] rcu:     52-....: (19994 ticks this GP) idle=7704/1/0x4000000000000000 softirq=8448/8448 fqs=5002
> [ 3448.735736] rcu:               hardirqs   softirqs          csw     system cond_resched
> [ 3448.743735] rcu:      number:        0         38            0                      0
> [ 3448.751560] rcu:     cputime:      354          0                    17444              ==> 2500 (ms)
> [ 3448.760780] rcu:     current: in_kernel_fpu_begin=0 this_cpu_preemptible=0
> [ 3448.767643]  (t=20348 jiffies g=127261 q=1019 ncpus=56)
> [ 3448.773106] CPU: 52 PID: 44429 Comm: modprobe Not tainted 6.0.0+ #11
> [ 3448.779704] Hardware name: HPE ProLiant DL360 Gen10/ProLiant DL360 Gen10, BIOS U32 03/08/2022
> [ 3448.788488] RIP: 0010:rude_sleep_cycles+0x13/0x27 [tcrypt]
> ...
> 
> 
>
Leizhen (ThunderTown) Nov. 8, 2022, 3:18 a.m. UTC | #13
On 2022/11/8 4:38, Elliott, Robert (Servers) wrote:
> 
> 
>>> Maybe this shouldn't be a table? Make it grep-friendly:
>>> [ 1816.719922] rcu: half-timeout counts: hardirq =5 softirq=10 csw=0
>>> [ 1816.725643] rcu: half_timeout cputimes (ms): time=1500 hardirq=3 softirq=0 system=1492
>>
>> I prefer the table. Tables look clearer and are easier to compare.
> 
> Here's an example where it also triggers expedited stalls. The 
> self-detected stall prints including the new table can end up
> interleaved with the expedited stall prints.

But even if we use your format, it won't solve the print interleaving
problem. We should use a lock.
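
Purely as an illustration of that idea (nothing like this was posted in
the thread): both report paths could funnel their output through one
lock so their pr_err() lines cannot interleave, at the cost of one CPU's
report waiting for the other's.

static DEFINE_RAW_SPINLOCK(rcu_stall_print_lock);	/* hypothetical */

static void rcu_stall_print_locked(void (*emit)(int cpu), int cpu)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&rcu_stall_print_lock, flags);
	emit(cpu);		/* e.g. print_cpu_stall_info(cpu) */
	raw_spin_unlock_irqrestore(&rcu_stall_print_lock, flags);
}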

> 
> (the table is expanded a bit here to experiment with adding more info, 
> and I included the \t prefixes)
> 
> [  933.728032] tcrypt: rcu testing - kernel_fpu_disable for rude 6 s
> [  938.038278] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 46-.... } 4 jiffies s: 521 root: 0x8/.
> [  938.049151] rcu: blocking rcu_node structures (internal RCU debug): l=1:42-55:0x10/.
> [  938.057236] Sending NMI from CPU 35 to CPUs 46:
> [  938.062048] NMI backtrace for cpu 46
> [  938.062050] CPU: 46 PID: 44184 Comm: modprobe Not tainted 6.0.0+ #11
> [  938.062052] Hardware name: HPE ProLiant DL360 Gen10/ProLiant DL360 Gen10, BIOS U32 03/08/2022
> [  938.062053] RIP: 0010:rude_sleep_cycles+0x13/0x27 [tcrypt]
> ...
> [  938.062151]  </TASK>
> [  938.073277] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 46-.... } 289 jiffies s: 521 root: 0x8/.
> [  938.334292] rcu: blocking rcu_node structures (internal RCU debug): l=1:42-55:0x10/.
> [  938.342289] Sending NMI from CPU 35 to CPUs 46:
> [  938.347049] NMI backtrace for cpu 46
> [  938.347050] CPU: 46 PID: 44184 Comm: modprobe Not tainted 6.0.0+ #11
> [  938.347051] Hardware name: HPE ProLiant DL360 Gen10/ProLiant DL360 Gen10, BIOS U32 03/08/2022
> [  938.347052] RIP: 0010:rude_sleep_cycles+0x13/0x27 [tcrypt]
> ...
> [  938.347121]  </TASK>
> [  938.358275] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 46-.... } 573 jiffies s: 521 root: 0x8/.
> [  938.619182] rcu: blocking rcu_node structures (internal RCU debug): l=1:42-55:0x10/.
> [  938.627179] Sending NMI from CPU 35 to CPUs 46:
> [  938.631939] NMI backtrace for cpu 46
> [  938.631939] CPU: 46 PID: 44184 Comm: modprobe Not tainted 6.0.0+ #11
> [  938.631941] Hardware name: HPE ProLiant DL360 Gen10/ProLiant DL360 Gen10, BIOS U32 03/08/2022
> [  938.631942] RIP: 0010:rude_sleep_cycles+0x13/0x27 [tcrypt]
> ...
> [  938.631997]  </TASK>
> [  938.643272] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: {
> [  938.734267] rcu: INFO: rcu_preempt self-detected stall on CPU
> [  938.740111]  46-....
> [  938.747282] rcu:     46-....: (4948 ticks this GP) idle=a9cc/1/0x4000000000000000 softirq=7598/7598 fqs=1253
> [  938.747285] rcu:               hardirqs   softirqs          csw     system cond_resched
> [  938.747286] rcu:      number:        0          0            0                      0
> [  938.754461]  } 720 jiffies s: 521 root: 0x8/.
> [  938.757178] rcu:     cputime:       62          0                     2436              ==> 2500 (ms)
> [  938.757179] rcu:     current: in_kernel_fpu_begin=1 this_cpu_preemptible=0
> [  938.757181]  (t=5023 jiffies g=61053 q=704 ncpus=56)
> [  938.759640] rcu: blocking rcu_node structures (internal RCU debug):
> 
> The first "46-" line is from the expedited stall.
> The second "46-" line is from the self-detected stall.
> The table lines are from the self-detected stall.
> The "} 720" line is from the expedited stall.
> The "(t=5023" line is from the self-detected stall.
> The "blocking" line is from the expedited stall.
> 
> [  938.761745] CPU: 46 PID: 44184 Comm: modprobe Not tainted 6.0.0+ #11
> [  938.761747] Hardware name: HPE ProLiant DL360 Gen10/ProLiant DL360 Gen10, BIOS U32 03/08/2022
> [  938.982118] RIP: 0010:rude_sleep_cycles+0x13/0x27 [tcrypt]
> ...
> [  939.221833]  </TASK>
> [  939.224239]  l=1:42-55:0x10/.
> [  939.227440] Sending NMI from CPU 35 to CPUs 46:
> [  939.232204] NMI backtrace for cpu 46
> [  939.232205] CPU: 46 PID: 44184 Comm: modprobe Not tainted 6.0.0+ #11
> [  939.232206] Hardware name: HPE ProLiant DL360 Gen10/ProLiant DL360 Gen10, BIOS U32 03/08/2022
> [  939.232207] RIP: 0010:rude_sleep_cycles+0x13/0x27 [tcrypt]
> ...
> [  939.232262]  </TASK>
> [  939.243264] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 46-.... } 1458 jiffies s: 521 root: 0x8/.
> [  939.503926] rcu: blocking rcu_node structures (internal RCU debug): l=1:42-55:0x10/.
> [  939.511920] Sending NMI from CPU 35 to CPUs 46:
> [  939.516678] NMI backtrace for cpu 46
> [  939.516679] CPU: 46 PID: 44184 Comm: modprobe Not tainted 6.0.0+ #11
> [  939.516680] Hardware name: HPE ProLiant DL360 Gen10/ProLiant DL360 Gen10, BIOS U32 03/08/2022
> [  939.516680] RIP: 0010:rude_sleep_cycles+0x13/0x27 [tcrypt]
> ...
> [  939.516732]  </TASK>
> [  939.528263] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: {
> [  939.748586] initcall tcrypt_mod_init+0x0/0x1000 [tcrypt] returned -11 after 6020623 usecs
> [  939.753200]  46-.... } 1718 jiffies s: 521 root: 0x8/.
> [  939.791580] rcu: blocking rcu_node structures (internal RCU debug):
> 
>
Elliott, Robert (Servers) Nov. 8, 2022, 5:53 a.m. UTC | #14
> -----Original Message-----
> From: Leizhen (ThunderTown) <thunder.leizhen@huawei.com>
> Sent: Monday, November 7, 2022 9:07 PM
> Subject: Re: [PATCH v4 4/4] rcu: Add RCU stall diagnosis information
> 
> On 2022/11/8 5:57, Elliott, Robert (Servers) wrote:
> > I created a 22 second stall, which triggered two self-detected stall
> > messages. The second one covers 18 seconds (and reports 17444 ms
> > of system cputime), but still reports the half_timeout of 2.5 s on
> 
> Because I have not updated rsrp->gp_seq in print_cpu_stat_info().
> 
> Please add rsrp->gp_seq-- at the bottom of print_cpu_stat_info()
> and try again.
> 
> diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
> index 2e560a70d88fd87..6f6c95d599e6436 100644
> --- a/kernel/rcu/tree_stall.h
> +++ b/kernel/rcu/tree_stall.h
> @@ -455,6 +455,7 @@ static void print_cpu_stat_info(int cpu)
>                 div_u64(cpustat[CPUTIME_SOFTIRQ] - rsrp->cputime_softirq, NSEC_PER_MSEC),
>                 div_u64(cpustat[CPUTIME_SYSTEM] - rsrp->cputime_system, NSEC_PER_MSEC),
>                 jiffies64_to_msecs(half_timeout));
> +       rsrp->gp_seq--;
>  }


That causes a lot more expedited stall messages to be printed (52 of them).
[   80.709572] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 22-.... } 3 jiffies s: 481 root: 0x2/.
[   80.743724] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 22-.... } 311 jiffies s: 481 root: 0x2/.
[   81.028028] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 22-.... } 620 jiffies s: 481 root: 0x2/.
[   81.313255] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 22-.... } 929 jiffies s: 481 root: 0x2/.
[   81.598483] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 22-.... } 1238 jiffies s: 481 root: 0x2/.
[   81.883710] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 22-.... } 1547 jiffies s: 481 root: 0x2/.
...
[   95.077093] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 22-.... } 15840 jiffies s: 481 root: 0x2/.
[   95.362327] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 22-.... } 16149 jiffies s: 481 root: 0x2/.
[   95.647549] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: {


Of the three self-detected prints, the second and third system cputimes 
of 8809 and 8655 are still greater than 1500.

[   73.774167] tcrypt: rcu testing - kernel_fpu_disable for rude 22 s
[   76.540071] rcu: INFO: rcu_preempt self-detected stall on CPU 22
[   76.546118] rcu:     22-....: (2994 ticks this GP) idle=716c/1/0x4000000000000000 softirq=2095/2095 fqs=751
[   76.555661] rcu:     CPU 22          hardirqs   softirqs          csw     system  cond_resched
[   76.564067] rcu:     CPU 22   count:        0          4             0                        0
[   76.572560] rcu:     CPU 22 cputime:        3          0                     1493               ==> 1500 (ms)
[   76.582277] rcu:     CPU 22 current: in_kernel_fpu_begin=1 this_cpu_preemptible=0
[   76.589546] rcu:     (self-detected stall on CPU 22 t=3054 jiffies g=13897 q=274 ncpus=56)
...
[   85.021211] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: {
[   85.158741] rcu: INFO: rcu_preempt self-detected stall on CPU 22
[   85.159351]  22-....
[   85.163204] rcu:     22-....: (11839 ticks this GP) idle=716c/1/0x4000000000000000 softirq=2095/2095 fqs=2959
[   85.163207] rcu:     CPU 22          hardirqs   softirqs          csw     system  cond_resched
[   85.163208] rcu:     CPU 22   count:        0         43             0                        0
[   85.163210] rcu:     CPU 22 cputime:      491          0                     8809               ==> 1500 (ms)
[   85.166980]  } 4831 jiffies s: 481 root: 0x2/.
[   85.170745] rcu:     CPU 22 current: in_kernel_fpu_begin=1 this_cpu_preemptible=0
[   85.170747] rcu:     (self-detected stall on CPU 22 t=12351 jiffies g=13897 q=2542 ncpus=56)
...
[   93.899261] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: {
[   93.920482] rcu: INFO: rcu_preempt self-detected stall on CPU 22
[   93.933812]  22-....
[   93.939063] rcu:     22-....: (20543 ticks this GP) idle=716c/1/0x4000000000000000 softirq=2095/2095 fqs=5216
[   93.939066] rcu:     CPU 22          hardirqs   softirqs          csw     system  cond_resched
[   93.939067] rcu:     CPU 22   count:        0         59             0                        0
[   93.946245]  } 14342 jiffies s: 481 root: 0x2/.
[   93.953415] rcu:     CPU 22 cputime:      777          0                     8655               ==> 1500 (ms)
[   93.953418] rcu:     CPU 22 current: in_kernel_fpu_begin=1 this_cpu_preemptible=0
[   93.953419] rcu:     (self-detected stall on CPU 22 t=21865 jiffies g=13897 q=6284 ncpus=56)
[   93.953421] CPU: 22 PID: 2604 Comm: modprobe Not tainted 6.0.0+ #13
[   93.960598] rcu: blocking rcu_node structures (internal RCU debug):

(I added "CPU %d" to all the self-detected lines to try to help the interleaving)
Leizhen (ThunderTown) Nov. 8, 2022, 6:41 a.m. UTC | #15
On 2022/11/8 13:53, Elliott, Robert (Servers) wrote:
> 
> 
>> -----Original Message-----
>> From: Leizhen (ThunderTown) <thunder.leizhen@huawei.com>
>> Sent: Monday, November 7, 2022 9:07 PM
>> Subject: Re: [PATCH v4 4/4] rcu: Add RCU stall diagnosis information
>>
>> On 2022/11/8 5:57, Elliott, Robert (Servers) wrote:
>>> I created a 22 second stall, which triggered two self-detected stall
>>> messages. The second one covers 18 seconds (and reports 17444 ms
>>> of system cputime), but still reports the half_timeout of 2.5 s on
>>
>> Because I have not updated rsrp->gp_seq in print_cpu_stat_info().
>>
>> Please add rsrp->gp_seq-- at the bottom of print_cpu_stat_info()
>> and try again.
>>
>> diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
>> index 2e560a70d88fd87..6f6c95d599e6436 100644
>> --- a/kernel/rcu/tree_stall.h
>> +++ b/kernel/rcu/tree_stall.h
>> @@ -455,6 +455,7 @@ static void print_cpu_stat_info(int cpu)
>>                 div_u64(cpustat[CPUTIME_SOFTIRQ] - rsrp->cputime_softirq, NSEC_PER_MSEC),
>>                 div_u64(cpustat[CPUTIME_SYSTEM] - rsrp->cputime_system, NSEC_PER_MSEC),
>>                 jiffies64_to_msecs(half_timeout));
>> +       rsrp->gp_seq--;
>>  }
> 
> 
> That causes a lot more expedited stall messages to be printed (52 of them).

There doesn't seem to be a causal relationship. I haven't figured it out yet.

> [   80.709572] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 22-.... } 3 jiffies s: 481 root: 0x2/.
> [   80.743724] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 22-.... } 311 jiffies s: 481 root: 0x2/.
> [   81.028028] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 22-.... } 620 jiffies s: 481 root: 0x2/.
> [   81.313255] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 22-.... } 929 jiffies s: 481 root: 0x2/.
> [   81.598483] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 22-.... } 1238 jiffies s: 481 root: 0x2/.
> [   81.883710] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 22-.... } 1547 jiffies s: 481 root: 0x2/.
> ...
> [   95.077093] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 22-.... } 15840 jiffies s: 481 root: 0x2/.
> [   95.362327] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 22-.... } 16149 jiffies s: 481 root: 0x2/.
> [   95.647549] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: {
> 
> 
> Of the three self-detected prints, the second and third report system
> cputimes of 8809 and 8655 ms, still greater than the 1500 ms window
> shown on those lines.

Maybe we should add another member to record the jiffies at snapshot time,
so the printed window reflects the time actually elapsed since the snapshot
instead of the fixed half timeout.

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b33165dba1ea745..6d9c876ea5c85a6 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -880,6 +880,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
                        rsrp->nr_hardirqs = kstat_cpu_irqs_sum(rdp->cpu);
                        rsrp->nr_softirqs = kstat_cpu_softirqs_sum(rdp->cpu);
                        rsrp->nr_csw = nr_context_switches_cpu(rdp->cpu);
+                       rsrp->jiffies = jiffies;
                        rsrp->gp_seq = rdp->gp_seq;
                }
        }
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index fa159a951ded42e..cbe4f09252dce20 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -172,6 +172,7 @@ struct rcu_snap_record {
        unsigned long   nr_hardirqs;    /* Accumulated number of hard irqs */
        unsigned int    nr_softirqs;    /* Accumulated number of soft irqs */
        unsigned long long nr_csw;      /* Accumulated number of task switches */
+       unsigned long   jiffies;        /* jiffies at the sampling time */
 };

 /* Per-CPU data for read-copy update. */
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 2e560a70d88fd87..7b6afb9c7b96dbe 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -431,7 +431,6 @@ static bool rcu_is_rcuc_kthread_starving(struct rcu_data *rdp, unsigned long *jp
 static void print_cpu_stat_info(int cpu)
 {
        u64 *cpustat;
-       unsigned long half_timeout;
        struct rcu_snap_record *rsrp;
        struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);

@@ -443,7 +442,6 @@ static void print_cpu_stat_info(int cpu)
                return;

        cpustat = kcpustat_cpu(cpu).cpustat;
-       half_timeout = rcu_jiffies_till_stall_check() / 2;

        pr_err("         hardirqs   softirqs   csw/system\n");
        pr_err(" number: %8ld %10d %12lld\n",
@@ -454,7 +452,7 @@ static void print_cpu_stat_info(int cpu)
                div_u64(cpustat[CPUTIME_IRQ] - rsrp->cputime_irq, NSEC_PER_MSEC),
                div_u64(cpustat[CPUTIME_SOFTIRQ] - rsrp->cputime_softirq, NSEC_PER_MSEC),
                div_u64(cpustat[CPUTIME_SYSTEM] - rsrp->cputime_system, NSEC_PER_MSEC),
-               jiffies64_to_msecs(half_timeout));
+               jiffies64_to_msecs(jiffies - rsrp->jiffies));
 }

 /*


> 
> [   73.774167] tcrypt: rcu testing - kernel_fpu_disable for rude 22 s
> [   76.540071] rcu: INFO: rcu_preempt self-detected stall on CPU 22
> [   76.546118] rcu:     22-....: (2994 ticks this GP) idle=716c/1/0x4000000000000000 softirq=2095/2095 fqs=751
> [   76.555661] rcu:     CPU 22          hardirqs   softirqs          csw     system  cond_resched
> [   76.564067] rcu:     CPU 22   count:        0          4             0                        0
> [   76.572560] rcu:     CPU 22 cputime:        3          0                     1493               ==> 1500 (ms)
> [   76.582277] rcu:     CPU 22 current: in_kernel_fpu_begin=1 this_cpu_preemptible=0
> [   76.589546] rcu:     (self-detected stall on CPU 22 t=3054 jiffies g=13897 q=274 ncpus=56)
> ...
> [   85.021211] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: {
> [   85.158741] rcu: INFO: rcu_preempt self-detected stall on CPU 22
> [   85.159351]  22-....
> [   85.163204] rcu:     22-....: (11839 ticks this GP) idle=716c/1/0x4000000000000000 softirq=2095/2095 fqs=2959
> [   85.163207] rcu:     CPU 22          hardirqs   softirqs          csw     system  cond_resched
> [   85.163208] rcu:     CPU 22   count:        0         43             0                        0
> [   85.163210] rcu:     CPU 22 cputime:      491          0                     8809               ==> 1500 (ms)
> [   85.166980]  } 4831 jiffies s: 481 root: 0x2/.
> [   85.170745] rcu:     CPU 22 current: in_kernel_fpu_begin=1 this_cpu_preemptible=0
> [   85.170747] rcu:     (self-detected stall on CPU 22 t=12351 jiffies g=13897 q=2542 ncpus=56)
> ...
> [   93.899261] rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: {
> [   93.920482] rcu: INFO: rcu_preempt self-detected stall on CPU 22
> [   93.933812]  22-....
> [   93.939063] rcu:     22-....: (20543 ticks this GP) idle=716c/1/0x4000000000000000 softirq=2095/2095 fqs=5216
> [   93.939066] rcu:     CPU 22          hardirqs   softirqs          csw     system  cond_resched
> [   93.939067] rcu:     CPU 22   count:        0         59             0                        0
> [   93.946245]  } 14342 jiffies s: 481 root: 0x2/.
> [   93.953415] rcu:     CPU 22 cputime:      777          0                     8655               ==> 1500 (ms)
> [   93.953418] rcu:     CPU 22 current: in_kernel_fpu_begin=1 this_cpu_preemptible=0
> [   93.953419] rcu:     (self-detected stall on CPU 22 t=21865 jiffies g=13897 q=6284 ncpus=56)
> [   93.953421] CPU: 22 PID: 2604 Comm: modprobe Not tainted 6.0.0+ #13
> [   93.960598] rcu: blocking rcu_node structures (internal RCU debug):
> 
> (I added "CPU %d" to all the self-detected lines to try to help the interleaving)
> 
> 
>
Elliott, Robert (Servers) Nov. 8, 2022, 7:29 p.m. UTC | #16
> Maybe we should add another member to record jiffies.
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index b33165dba1ea745..6d9c876ea5c85a6 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -880,6 +880,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
>                         rsrp->nr_hardirqs = kstat_cpu_irqs_sum(rdp->cpu);
>                         rsrp->nr_softirqs = kstat_cpu_softirqs_sum(rdp->cpu);
>                         rsrp->nr_csw = nr_context_switches_cpu(rdp->cpu);
> +                       rsrp->jiffies = jiffies;
>                         rsrp->gp_seq = rdp->gp_seq;

Yes, that works:

[  252.139108] tcrypt: rcu testing - kernel_fpu_disable for rude 22 s
[  255.140204] rcu: INFO: rcu_preempt self-detected stall on CPU 39
[  255.146438] rcu:     CPU 39-....: (2995 ticks this GP) idle=5014/1/0x4000000000000000 softirq=2373/2374 fqs=751
[  255.156512] rcu:     CPU 39          hardirqs   softirqs           csw     system  cond_resched
[  255.165182] rcu:     CPU 39   count:        0          3             0                        0
[  255.173853] rcu:     CPU 39 cputime:        3          0                     1493               ==> 1530 (ms)
[  255.183746] rcu:     CPU 39 current: in_kernel_fpu_begin=1 preemptible=0
[  255.190407] rcu:     CPU 39 self-detected stall t=3051 jiffies g=18181 q=373 ncpus=56
[  255.198205] CPU: 39 PID: 2607 Comm: modprobe Not tainted 6.0.0+ #14
...
[  264.450164] rcu: INFO: rcu_preempt self-detected stall on CPU 39
[  264.456416] rcu:     CPU 39-...k.: (12000 ticks this GP) idle=5014/1/0x4000000000000000 softirq=2373/2374 fqs=3006
[  264.466690] rcu:     CPU 39          hardirqs   softirqs           csw     system  cond_resched
[  264.475390] rcu:     CPU 39   count:        0         21             0                        0
[  264.484092] rcu:     CPU 39 cputime:      328          0                     8982               ==> 9293 (ms)
[  264.494015] rcu:     CPU 39 current: in_kernel_fpu_begin=1 preemptible=0
[  264.500706] rcu:     CPU 39 self-detected stall t=12361 jiffies g=18181 q=601 ncpus=56
[  264.508619] CPU: 39 PID: 2607 Comm: modprobe Not tainted 6.0.0+ #14
...
[  273.760124] rcu: INFO: rcu_preempt self-detected stall on CPU 39
[  273.766376] rcu:     CPU 39-....: (21004 ticks this GP) idle=5014/1/0x4000000000000000 softirq=2373/2374 fqs=5262
[  273.776649] rcu:     CPU 39          hardirqs   softirqs           csw     system  cond_resched
[  273.785348] rcu:     CPU 39   count:        0         17             0                        0
[  273.794050] rcu:     CPU 39 cputime:      328          0                     8981               ==> 9293 (ms)
[  273.803971] rcu:     CPU 39 current: in_kernel_fpu_begin=1 preemptible=0
[  273.810660] rcu:     CPU 39 self-detected stall t=21671 jiffies g=18181 q=1112 ncpus=56
[  273.818659] CPU: 39 PID: 2607 Comm: modprobe Not tainted 6.0.0+ #14
diff mbox series

Patch

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a465d5242774af8..f7c0cfd1cdcacd3 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5082,6 +5082,11 @@ 
 			rcupdate.rcu_cpu_stall_timeout to be used (after
 			conversion from seconds to milliseconds).
 
+	rcupdate.rcu_cpu_stall_cputime= [KNL]
+			Provide statistics on the cputime and count of
+			interrupts and tasks during the second half of the
+			rcu stall timeout.
+
 	rcupdate.rcu_expedited= [KNL]
 			Use expedited grace-period primitives, for
 			example, synchronize_rcu_expedited() instead
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 1b0c41d490f0588..cd7190d6b34e790 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -95,6 +95,16 @@  config RCU_EXP_CPU_STALL_TIMEOUT
 	  says to use the RCU_CPU_STALL_TIMEOUT value converted from
 	  seconds to milliseconds.
 
+config RCU_CPU_STALL_CPUTIME
+	bool "Provide additional rcu stall debug information"
+	depends on RCU_STALL_COMMON
+	default n
+	help
+	  Statistics during the period from RCU_CPU_STALL_TIMEOUT/2 to
+	  RCU_CPU_STALL_TIMEOUT, such as the number of (hard interrupts, soft
+	  interrupts, task switches) and the cputime of (hard interrupts, soft
+	  interrupts, kernel tasks) are added to the rcu stall report.
+
 config RCU_TRACE
 	bool "Enable tracing for RCU"
 	depends on DEBUG_KERNEL
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index be5979da07f5985..35816fbb4b82f17 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -224,6 +224,7 @@  extern int rcu_cpu_stall_ftrace_dump;
 extern int rcu_cpu_stall_suppress;
 extern int rcu_cpu_stall_timeout;
 extern int rcu_exp_cpu_stall_timeout;
+extern int rcu_cpu_stall_cputime;
 int rcu_jiffies_till_stall_check(void);
 int rcu_exp_jiffies_till_stall_check(void);
 
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 93416afebd59c7a..3fcb17d7829ac8e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -931,6 +931,22 @@  static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 			rdp->rcu_iw_gp_seq = rnp->gp_seq;
 			irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
 		}
+
+		if (rcu_cpu_stall_cputime && rdp->snap_record.gp_seq != rdp->gp_seq) {
+			u64 *cpustat;
+			struct rcu_snap_record *rsrp;
+
+			cpustat = kcpustat_cpu(rdp->cpu).cpustat;
+
+			rsrp = &rdp->snap_record;
+			rsrp->cputime_irq     = cpustat[CPUTIME_IRQ];
+			rsrp->cputime_softirq = cpustat[CPUTIME_SOFTIRQ];
+			rsrp->cputime_system  = cpustat[CPUTIME_SYSTEM];
+			rsrp->nr_hardirqs = kstat_cpu_irqs_sum(rdp->cpu);
+			rsrp->nr_softirqs = kstat_cpu_softirqs_sum(rdp->cpu);
+			rsrp->nr_csw = nr_context_switches_cpu(rdp->cpu);
+			rsrp->gp_seq = rdp->gp_seq;
+		}
 	}
 
 	return 0;
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index d4a97e40ea9c3e2..44d49819a31f050 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -158,6 +158,22 @@  union rcu_noqs {
 	u16 s; /* Set of bits, aggregate OR here. */
 };
 
+/*
+ * Record the snapshot of the core stats at 1/2 rcu stall timeout. The member
+ * gp_seq is used to ensure that all members are updated only once during the
+ * second half period. The snapshot is taken only if this gp_seq is not equal
+ * to rdp->gp_seq.
+ */
+struct rcu_snap_record {
+	unsigned long	gp_seq;		/* Track rdp->gp_seq counter */
+	u64		cputime_irq;	/* Accumulated cputime of hard irqs */
+	u64		cputime_softirq;/* Accumulated cputime of soft irqs */
+	u64		cputime_system; /* Accumulated cputime of kernel tasks */
+	unsigned long	nr_hardirqs;	/* Accumulated number of hard irqs */
+	unsigned int	nr_softirqs;	/* Accumulated number of soft irqs */
+	unsigned long long nr_csw;	/* Accumulated number of task switches */
+};
+
 /* Per-CPU data for read-copy update. */
 struct rcu_data {
 	/* 1) quiescent-state and grace-period handling : */
@@ -262,6 +278,7 @@  struct rcu_data {
 	short rcu_onl_gp_flags;		/* ->gp_flags at last online. */
 	unsigned long last_fqs_resched;	/* Time of last rcu_resched(). */
 	unsigned long last_sched_clock;	/* Jiffies of last rcu_sched_clock_irq(). */
+	struct rcu_snap_record snap_record; /* Snapshot of core stats at 1/2 rcu stall timeout */
 
 	int cpu;
 };
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 5653560573e22d6..2e560a70d88fd87 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -428,6 +428,35 @@  static bool rcu_is_rcuc_kthread_starving(struct rcu_data *rdp, unsigned long *jp
 	return j > 2 * HZ;
 }
 
+static void print_cpu_stat_info(int cpu)
+{
+	u64 *cpustat;
+	unsigned long half_timeout;
+	struct rcu_snap_record *rsrp;
+	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+	if (!rcu_cpu_stall_cputime)
+		return;
+
+	rsrp = &rdp->snap_record;
+	if (rsrp->gp_seq != rdp->gp_seq)
+		return;
+
+	cpustat = kcpustat_cpu(cpu).cpustat;
+	half_timeout = rcu_jiffies_till_stall_check() / 2;
+
+	pr_err("         hardirqs   softirqs   csw/system\n");
+	pr_err(" number: %8ld %10d %12lld\n",
+		kstat_cpu_irqs_sum(cpu) - rsrp->nr_hardirqs,
+		kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs,
+		nr_context_switches_cpu(cpu) - rsrp->nr_csw);
+	pr_err("cputime: %8lld %10lld %12lld   ==> %lld(ms)\n",
+		div_u64(cpustat[CPUTIME_IRQ] - rsrp->cputime_irq, NSEC_PER_MSEC),
+		div_u64(cpustat[CPUTIME_SOFTIRQ] - rsrp->cputime_softirq, NSEC_PER_MSEC),
+		div_u64(cpustat[CPUTIME_SYSTEM] - rsrp->cputime_system, NSEC_PER_MSEC),
+		jiffies64_to_msecs(half_timeout));
+}
+
 /*
  * Print out diagnostic information for the specified stalled CPU.
  *
@@ -484,6 +513,8 @@  static void print_cpu_stall_info(int cpu)
 	       data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
 	       rcuc_starved ? buf : "",
 	       falsepositive ? " (false positive?)" : "");
+
+	print_cpu_stat_info(cpu);
 }
 
 /* Complain about starvation of grace-period kthread.  */
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 738842c4886b235..aec76ccbe1e343b 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -508,6 +508,8 @@  int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
 module_param(rcu_cpu_stall_timeout, int, 0644);
 int rcu_exp_cpu_stall_timeout __read_mostly = CONFIG_RCU_EXP_CPU_STALL_TIMEOUT;
 module_param(rcu_exp_cpu_stall_timeout, int, 0644);
+int rcu_cpu_stall_cputime __read_mostly = IS_ENABLED(CONFIG_RCU_CPU_STALL_CPUTIME);
+module_param(rcu_cpu_stall_cputime, int, 0644);
 #endif /* #ifdef CONFIG_RCU_STALL_COMMON */
 
 // Suppress boot-time RCU CPU stall warnings and rcutorture writer stall
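For completeness, a minimal recipe for enabling these statistics once the patch is applied; the sysfs path below assumes the new module parameter appears under /sys/module/rcupdate/ next to the existing rcu_cpu_stall_* parameters:

  # build time
  CONFIG_RCU_CPU_STALL_CPUTIME=y

  # boot time, on the kernel command line
  rcupdate.rcu_cpu_stall_cputime=1

  # run time (the module_param above is 0644, hence writable)
  echo 1 > /sys/module/rcupdate/parameters/rcu_cpu_stall_cputime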