diff mbox series

[3/3] locking/csd-lock: Use backoff for repeated reports of same incident

Message ID 20240713165846.216174-3-neeraj.upadhyay@kernel.org (mailing list archive)
State Superseded
Headers show
Series CSD-lock diagnostics enhancements | expand

Commit Message

Neeraj Upadhyay July 13, 2024, 4:58 p.m. UTC
From: "Paul E. McKenney" <paulmck@kernel.org>

Currently, the CSD-lock diagnostics in CONFIG_CSD_LOCK_WAIT_DEBUG=y
kernels are emitted at five-second intervals.  Although this has proven
to be a good time interval for the first diagnostic, if the target CPU
keeps interrupts disabled for way longer than five seconds, the ratio
of useful new information to pointless repetition increases considerably.

Therefore, back off the time period for repeated reports of the same
incident, increasing linearly with the number of reports and logarithmicly
with the number of online CPUs.

[ paulmck: Apply Dan Carpenter feedback. ]

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Imran Khan <imran.f.khan@oracle.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Leonardo Bras <leobras@redhat.com>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>
---
 kernel/smp.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

Comments

Rik van Riel July 14, 2024, 12:59 a.m. UTC | #1
On Sat, 2024-07-13 at 22:28 +0530, neeraj.upadhyay@kernel.org wrote:
> From: "Paul E. McKenney" <paulmck@kernel.org>
> 
> Currently, the CSD-lock diagnostics in CONFIG_CSD_LOCK_WAIT_DEBUG=y
> kernels are emitted at five-second intervals.  Although this has
> proven
> to be a good time interval for the first diagnostic, if the target
> CPU
> keeps interrupts disabled for way longer than five seconds, the ratio
> of useful new information to pointless repetition increases
> considerably.
> 
> Therefore, back off the time period for repeated reports of the same
> incident, increasing linearly with the number of reports and
> logarithmicly
> with the number of online CPUs.
> 
> [ paulmck: Apply Dan Carpenter feedback. ]
> 
> Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
> Cc: Imran Khan <imran.f.khan@oracle.com>
> Cc: Ingo Molnar <mingo@kernel.org>
> Cc: Leonardo Bras <leobras@redhat.com>
> Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
> Cc: Rik van Riel <riel@surriel.com>
> Signed-off-by: Neeraj Upadhyay <neeraj.upadhyay@kernel.org>

Reviewed-by: Rik van Riel <riel@surriel.com>
diff mbox series

Patch

diff --git a/kernel/smp.c b/kernel/smp.c
index c3e8241e9cbb..80c1173ce369 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -225,7 +225,7 @@  bool csd_lock_is_stuck(void)
  * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU,
  * so waiting on other types gets much less information.
  */
-static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id)
+static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id, unsigned long *nmessages)
 {
 	int cpu = -1;
 	int cpux;
@@ -248,7 +248,9 @@  static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in
 	ts2 = sched_clock();
 	/* How long since we last checked for a stuck CSD lock.*/
 	ts_delta = ts2 - *ts1;
-	if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0))
+	if (likely(ts_delta <= csd_lock_timeout_ns * (*nmessages + 1) *
+			       (!*nmessages ? 1 : (ilog2(num_online_cpus()) / 2 + 1)) ||
+		   csd_lock_timeout_ns == 0))
 		return false;
 
 	firsttime = !*bug_id;
@@ -265,6 +267,7 @@  static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in
 	pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %lld ns for CPU#%02d %pS(%ps).\n",
 		 firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), (s64)ts_delta,
 		 cpu, csd->func, csd->info);
+	(*nmessages)++;
 	if (firsttime)
 		atomic_dec(&n_csd_lock_stuck);
 	/*
@@ -305,12 +308,13 @@  static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in
  */
 static void __csd_lock_wait(call_single_data_t *csd)
 {
+	unsigned long nmessages = 0;
 	int bug_id = 0;
 	u64 ts0, ts1;
 
 	ts1 = ts0 = sched_clock();
 	for (;;) {
-		if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id))
+		if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id, &nmessages))
 			break;
 		cpu_relax();
 	}