Message ID | 20221109093739.187-2-thunder.leizhen@huawei.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | rcu: Add RCU stall diagnosis information | expand |
On Wed, Nov 09, 2022 at 05:37:37PM +0800, Zhen Lei wrote: > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c > index ed93ddb8203d42c..e1ff23b2a14d71d 100644 > --- a/kernel/rcu/tree.c > +++ b/kernel/rcu/tree.c > @@ -866,6 +866,23 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) > rdp->rcu_iw_gp_seq = rnp->gp_seq; > irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); > } > + > + if (rcu_cpu_stall_cputime && rdp->snap_record.gp_seq != rdp->gp_seq) { > + u64 *cpustat; > + struct rcu_snap_record *rsrp; > + > + cpustat = kcpustat_cpu(rdp->cpu).cpustat; > + > + rsrp = &rdp->snap_record; > + rsrp->cputime_irq = cpustat[CPUTIME_IRQ]; > + rsrp->cputime_softirq = cpustat[CPUTIME_SOFTIRQ]; > + rsrp->cputime_system = cpustat[CPUTIME_SYSTEM]; You need to use kcpustat_field(), otherwise you'll get stalled values on nohz_full CPUs. > + rsrp->nr_hardirqs = kstat_cpu_irqs_sum(rdp->cpu); > + rsrp->nr_softirqs = kstat_cpu_softirqs_sum(rdp->cpu); > + rsrp->nr_csw = nr_context_switches_cpu(rdp->cpu); > + rsrp->jiffies = jiffies; > + rsrp->gp_seq = rdp->gp_seq; > + } > } > > return 0; > diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h > index 5653560573e22d6..7b6afb9c7b96dbe 100644 > --- a/kernel/rcu/tree_stall.h > +++ b/kernel/rcu/tree_stall.h > @@ -428,6 +428,33 @@ static bool rcu_is_rcuc_kthread_starving(struct rcu_data *rdp, unsigned long *jp > return j > 2 * HZ; > } > > +static void print_cpu_stat_info(int cpu) > +{ > + u64 *cpustat; > + struct rcu_snap_record *rsrp; > + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); > + > + if (!rcu_cpu_stall_cputime) > + return; > + > + rsrp = &rdp->snap_record; > + if (rsrp->gp_seq != rdp->gp_seq) > + return; > + > + cpustat = kcpustat_cpu(cpu).cpustat; > + > + pr_err(" hardirqs softirqs csw/system\n"); > + pr_err(" number: %8ld %10d %12lld\n", > + kstat_cpu_irqs_sum(cpu) - rsrp->nr_hardirqs, > + kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs, > + nr_context_switches_cpu(cpu) - rsrp->nr_csw); > + pr_err("cputime: %8lld %10lld %12lld ==> %lld(ms)\n", > + div_u64(cpustat[CPUTIME_IRQ] - rsrp->cputime_irq, NSEC_PER_MSEC), > + div_u64(cpustat[CPUTIME_SOFTIRQ] - rsrp->cputime_softirq, NSEC_PER_MSEC), > + div_u64(cpustat[CPUTIME_SYSTEM] - rsrp->cputime_system, > NSEC_PER_MSEC), Same here. Thanks.
> b/Documentation/admin-guide/kernel-parameters.txt > index a465d5242774af8..2729f3ad11d108b 100644 > --- a/Documentation/admin-guide/kernel-parameters.txt > +++ b/Documentation/admin-guide/kernel-parameters.txt > @@ -5082,6 +5082,12 @@ > rcupdate.rcu_cpu_stall_timeout to be used (after > conversion from seconds to milliseconds). > > + rcupdate.rcu_cpu_stall_cputime= [KNL] > + Provide statistics on the cputime and count of > + interrupts and tasks during the sampling period. For > + multiple continuous RCU stalls, all sampling periods > + begin at half of the first RCU stall timeout. This description should start with: "In kernels built with CONFIG_RCU_CPU_STALL_TIME=y, " Also, that parameter name seems like it contains a time value, but it's really just treated as zero vs. anything else. Consider renaming it to rcu_cpu_stall_cputime_en or describing the values in the description ("0 disables, all other values enable"). > diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h > +struct rcu_snap_record { > + unsigned long gp_seq; /* Track rdp->gp_seq counter */ > + u64 cputime_irq; /* Accumulated cputime of hard irqs */ > + u64 cputime_softirq;/* Accumulated cputime of soft irqs */ > + u64 cputime_system; /* Accumulated cputime of kernel tasks > */ > + unsigned long nr_hardirqs; /* Accumulated number of hard irqs */ > + unsigned int nr_softirqs; /* Accumulated number of soft irqs */ That should be "unsigned long" to match the other patch > diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h > +static void print_cpu_stat_info(int cpu) > +{ ... > + pr_err(" hardirqs softirqs csw/system\n"); > + pr_err(" number: %8ld %10d %12lld\n", > + kstat_cpu_irqs_sum(cpu) - rsrp->nr_hardirqs, > + kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs, > + nr_context_switches_cpu(cpu) - rsrp->nr_csw); > + pr_err("cputime: %8lld %10lld %12lld ==> %lld(ms)\n", Those should all start with "\t" to match other related prints.
> -----Original Message----- > From: Elliott, Robert (Servers) > Sent: Wednesday, November 9, 2022 10:56 AM > > b/Documentation/admin-guide/kernel-parameters.txt > > index a465d5242774af8..2729f3ad11d108b 100644 > > --- a/Documentation/admin-guide/kernel-parameters.txt > > +++ b/Documentation/admin-guide/kernel-parameters.txt > > @@ -5082,6 +5082,12 @@ > > rcupdate.rcu_cpu_stall_timeout to be used (after > > conversion from seconds to milliseconds). > > > > + rcupdate.rcu_cpu_stall_cputime= [KNL] > > + Provide statistics on the cputime and count of > > + interrupts and tasks during the sampling period. For > > + multiple continuous RCU stalls, all sampling periods > > + begin at half of the first RCU stall timeout. > > This description should start with: > "In kernels built with CONFIG_RCU_CPU_STALL_TIME=y, " Please ignore that comment - the module parameter is always present (only subject to CONFIG_RCU_STALL_COMMON like the others). The config option is just selecting the default value for that module parameter.
On 2022/11/9 23:20, Frederic Weisbecker wrote: > On Wed, Nov 09, 2022 at 05:37:37PM +0800, Zhen Lei wrote: >> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c >> index ed93ddb8203d42c..e1ff23b2a14d71d 100644 >> --- a/kernel/rcu/tree.c >> +++ b/kernel/rcu/tree.c >> @@ -866,6 +866,23 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) >> rdp->rcu_iw_gp_seq = rnp->gp_seq; >> irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); >> } >> + >> + if (rcu_cpu_stall_cputime && rdp->snap_record.gp_seq != rdp->gp_seq) { >> + u64 *cpustat; >> + struct rcu_snap_record *rsrp; >> + >> + cpustat = kcpustat_cpu(rdp->cpu).cpustat; >> + >> + rsrp = &rdp->snap_record; >> + rsrp->cputime_irq = cpustat[CPUTIME_IRQ]; >> + rsrp->cputime_softirq = cpustat[CPUTIME_SOFTIRQ]; >> + rsrp->cputime_system = cpustat[CPUTIME_SYSTEM]; > > You need to use kcpustat_field(), otherwise you'll get stalled values on nohz_full CPUs. OK, I'll update it. Thanks. > >> + rsrp->nr_hardirqs = kstat_cpu_irqs_sum(rdp->cpu); >> + rsrp->nr_softirqs = kstat_cpu_softirqs_sum(rdp->cpu); >> + rsrp->nr_csw = nr_context_switches_cpu(rdp->cpu); >> + rsrp->jiffies = jiffies; >> + rsrp->gp_seq = rdp->gp_seq; >> + } >> } >> >> return 0; >> diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h >> index 5653560573e22d6..7b6afb9c7b96dbe 100644 >> --- a/kernel/rcu/tree_stall.h >> +++ b/kernel/rcu/tree_stall.h >> @@ -428,6 +428,33 @@ static bool rcu_is_rcuc_kthread_starving(struct rcu_data *rdp, unsigned long *jp >> return j > 2 * HZ; >> } >> >> +static void print_cpu_stat_info(int cpu) >> +{ >> + u64 *cpustat; >> + struct rcu_snap_record *rsrp; >> + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); >> + >> + if (!rcu_cpu_stall_cputime) >> + return; >> + >> + rsrp = &rdp->snap_record; >> + if (rsrp->gp_seq != rdp->gp_seq) >> + return; >> + >> + cpustat = kcpustat_cpu(cpu).cpustat; >> + >> + pr_err(" hardirqs softirqs csw/system\n"); >> + pr_err(" number: %8ld %10d %12lld\n", >> + kstat_cpu_irqs_sum(cpu) - rsrp->nr_hardirqs, >> + kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs, >> + nr_context_switches_cpu(cpu) - rsrp->nr_csw); >> + pr_err("cputime: %8lld %10lld %12lld ==> %lld(ms)\n", >> + div_u64(cpustat[CPUTIME_IRQ] - rsrp->cputime_irq, NSEC_PER_MSEC), >> + div_u64(cpustat[CPUTIME_SOFTIRQ] - rsrp->cputime_softirq, NSEC_PER_MSEC), >> + div_u64(cpustat[CPUTIME_SYSTEM] - rsrp->cputime_system, >> NSEC_PER_MSEC), > > Same here. OK > > Thanks. > . >
On 2022/11/10 0:55, Elliott, Robert (Servers) wrote: > > >> b/Documentation/admin-guide/kernel-parameters.txt >> index a465d5242774af8..2729f3ad11d108b 100644 >> --- a/Documentation/admin-guide/kernel-parameters.txt >> +++ b/Documentation/admin-guide/kernel-parameters.txt >> @@ -5082,6 +5082,12 @@ >> rcupdate.rcu_cpu_stall_timeout to be used (after >> conversion from seconds to milliseconds). >> >> + rcupdate.rcu_cpu_stall_cputime= [KNL] >> + Provide statistics on the cputime and count of >> + interrupts and tasks during the sampling period. For >> + multiple continuous RCU stalls, all sampling periods >> + begin at half of the first RCU stall timeout. > > This description should start with: > "In kernels built with CONFIG_RCU_CPU_STALL_TIME=y, " > > Also, that parameter name seems like it contains a time value, but > it's really just treated as zero vs. anything else. Consider renaming > it to rcu_cpu_stall_cputime_en or describing the values in the > description ("0 disables, all other values enable"). > >> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h >> +struct rcu_snap_record { >> + unsigned long gp_seq; /* Track rdp->gp_seq counter */ >> + u64 cputime_irq; /* Accumulated cputime of hard irqs */ >> + u64 cputime_softirq;/* Accumulated cputime of soft irqs */ >> + u64 cputime_system; /* Accumulated cputime of kernel tasks >> */ >> + unsigned long nr_hardirqs; /* Accumulated number of hard irqs */ >> + unsigned int nr_softirqs; /* Accumulated number of soft irqs */ > > That should be "unsigned long" to match the other patch We have discussed this before. And you mentioned: irqs_sumstruct kernel_stat { unsigned long irqs_sum; unsigned int softirqs[NR_SOFTIRQS]; }; The softirqs field is an unsigned int, so the new function doesn't have this inconsistency. > > >> diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h >> +static void print_cpu_stat_info(int cpu) >> +{ > ... >> + pr_err(" hardirqs softirqs csw/system\n"); >> + pr_err(" number: %8ld %10d %12lld\n", >> + kstat_cpu_irqs_sum(cpu) - rsrp->nr_hardirqs, >> + kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs, >> + nr_context_switches_cpu(cpu) - rsrp->nr_csw); >> + pr_err("cputime: %8lld %10lld %12lld ==> %lld(ms)\n", > > Those should all start with "\t" to match other related prints. Right, thanks. > > > . >
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a465d5242774af8..2729f3ad11d108b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5082,6 +5082,12 @@ rcupdate.rcu_cpu_stall_timeout to be used (after conversion from seconds to milliseconds). + rcupdate.rcu_cpu_stall_cputime= [KNL] + Provide statistics on the cputime and count of + interrupts and tasks during the sampling period. For + multiple continuous RCU stalls, all sampling periods + begin at half of the first RCU stall timeout. + rcupdate.rcu_expedited= [KNL] Use expedited grace-period primitives, for example, synchronize_rcu_expedited() instead diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 1b0c41d490f0588..025566a9ba44667 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -95,6 +95,17 @@ config RCU_EXP_CPU_STALL_TIMEOUT says to use the RCU_CPU_STALL_TIMEOUT value converted from seconds to milliseconds. +config RCU_CPU_STALL_CPUTIME + bool "Provide additional RCU stall debug information" + depends on RCU_STALL_COMMON + default n + help + Collect statistics during the sampling period, such as the number of + (hard interrupts, soft interrupts, task switches) and the cputime of + (hard interrupts, soft interrupts, kernel tasks) are added to the + RCU stall report. For multiple continuous RCU stalls, all sampling + periods begin at half of the first RCU stall timeout. + config RCU_TRACE bool "Enable tracing for RCU" depends on DEBUG_KERNEL diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 96122f203187f39..4844dec36bddb48 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -231,6 +231,7 @@ extern int rcu_cpu_stall_ftrace_dump; extern int rcu_cpu_stall_suppress; extern int rcu_cpu_stall_timeout; extern int rcu_exp_cpu_stall_timeout; +extern int rcu_cpu_stall_cputime; int rcu_jiffies_till_stall_check(void); int rcu_exp_jiffies_till_stall_check(void); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ed93ddb8203d42c..e1ff23b2a14d71d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -866,6 +866,23 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) rdp->rcu_iw_gp_seq = rnp->gp_seq; irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); } + + if (rcu_cpu_stall_cputime && rdp->snap_record.gp_seq != rdp->gp_seq) { + u64 *cpustat; + struct rcu_snap_record *rsrp; + + cpustat = kcpustat_cpu(rdp->cpu).cpustat; + + rsrp = &rdp->snap_record; + rsrp->cputime_irq = cpustat[CPUTIME_IRQ]; + rsrp->cputime_softirq = cpustat[CPUTIME_SOFTIRQ]; + rsrp->cputime_system = cpustat[CPUTIME_SYSTEM]; + rsrp->nr_hardirqs = kstat_cpu_irqs_sum(rdp->cpu); + rsrp->nr_softirqs = kstat_cpu_softirqs_sum(rdp->cpu); + rsrp->nr_csw = nr_context_switches_cpu(rdp->cpu); + rsrp->jiffies = jiffies; + rsrp->gp_seq = rdp->gp_seq; + } } return 0; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index fcb5d696eb1700d..192536916f9a607 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -158,6 +158,23 @@ union rcu_noqs { u16 s; /* Set of bits, aggregate OR here. */ }; +/* + * Record the snapshot of the core stats at half of the first RCU stall timeout. + * The member gp_seq is used to ensure that all members are updated only once + * during the sampling period. The snapshot is taken only if this gp_seq is not + * equal to rdp->gp_seq. + */ +struct rcu_snap_record { + unsigned long gp_seq; /* Track rdp->gp_seq counter */ + u64 cputime_irq; /* Accumulated cputime of hard irqs */ + u64 cputime_softirq;/* Accumulated cputime of soft irqs */ + u64 cputime_system; /* Accumulated cputime of kernel tasks */ + unsigned long nr_hardirqs; /* Accumulated number of hard irqs */ + unsigned int nr_softirqs; /* Accumulated number of soft irqs */ + unsigned long long nr_csw; /* Accumulated number of task switches */ + unsigned long jiffies; /* Track jiffies value */ +}; + /* Per-CPU data for read-copy update. */ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ @@ -262,6 +279,8 @@ struct rcu_data { short rcu_onl_gp_flags; /* ->gp_flags at last online. */ unsigned long last_fqs_resched; /* Time of last rcu_resched(). */ unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */ + struct rcu_snap_record snap_record; /* Snapshot of core stats at half of */ + /* the first RCU stall timeout */ long lazy_len; /* Length of buffered lazy callbacks. */ int cpu; diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 5653560573e22d6..7b6afb9c7b96dbe 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -428,6 +428,33 @@ static bool rcu_is_rcuc_kthread_starving(struct rcu_data *rdp, unsigned long *jp return j > 2 * HZ; } +static void print_cpu_stat_info(int cpu) +{ + u64 *cpustat; + struct rcu_snap_record *rsrp; + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + + if (!rcu_cpu_stall_cputime) + return; + + rsrp = &rdp->snap_record; + if (rsrp->gp_seq != rdp->gp_seq) + return; + + cpustat = kcpustat_cpu(cpu).cpustat; + + pr_err(" hardirqs softirqs csw/system\n"); + pr_err(" number: %8ld %10d %12lld\n", + kstat_cpu_irqs_sum(cpu) - rsrp->nr_hardirqs, + kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs, + nr_context_switches_cpu(cpu) - rsrp->nr_csw); + pr_err("cputime: %8lld %10lld %12lld ==> %lld(ms)\n", + div_u64(cpustat[CPUTIME_IRQ] - rsrp->cputime_irq, NSEC_PER_MSEC), + div_u64(cpustat[CPUTIME_SOFTIRQ] - rsrp->cputime_softirq, NSEC_PER_MSEC), + div_u64(cpustat[CPUTIME_SYSTEM] - rsrp->cputime_system, NSEC_PER_MSEC), + jiffies64_to_msecs(jiffies - rsrp->jiffies)); +} + /* * Print out diagnostic information for the specified stalled CPU. * @@ -484,6 +511,8 @@ static void print_cpu_stall_info(int cpu) data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, rcuc_starved ? buf : "", falsepositive ? " (false positive?)" : ""); + + print_cpu_stat_info(cpu); } /* Complain about starvation of grace-period kthread. */ diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 738842c4886b235..aec76ccbe1e343b 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -508,6 +508,8 @@ int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_timeout, int, 0644); int rcu_exp_cpu_stall_timeout __read_mostly = CONFIG_RCU_EXP_CPU_STALL_TIMEOUT; module_param(rcu_exp_cpu_stall_timeout, int, 0644); +int rcu_cpu_stall_cputime __read_mostly = IS_ENABLED(CONFIG_RCU_CPU_STALL_CPUTIME); +module_param(rcu_cpu_stall_cputime, int, 0644); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ // Suppress boot-time RCU CPU stall warnings and rcutorture writer stall