diff mbox series

Diagnosing stall in synchronize_srcu() from rcu_tasks_postscan()

Message ID 20230111212736.GA1062057@paulmck-ThinkPad-P17-Gen-1 (mailing list archive)
State Accepted
Commit f1c73cc03a10ba8300792eb9cbd0686614efd2b2
Headers show
Series Diagnosing stall in synchronize_srcu() from rcu_tasks_postscan() | expand

Commit Message

Paul E. McKenney Jan. 11, 2023, 9:27 p.m. UTC
Hello, Mark,

A few days ago you mentioned stalls in RCU tasks.  Neeraj has supplied
the following diagnostic patch, which will confirm or invalidate my
assumptions about the cause of the stall.

Could you please try it out and let us know the outcome?

							Thanx, Paul

------------------------------------------------------------------------

commit 1e464bd08ee844fb43594b69f471c05eaeda5cda
Author: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Date:   Wed Jan 11 13:15:00 2023 +0530

    rcu-tasks: Report stalls during synchronize_srcu() in rcu_tasks_postscan()
    
    The call to synchronize_srcu() from rcu_tasks_postscan() can be stalled
    by a task getting stuck in do_exit() between that function's calls to
    exit_tasks_rcu_start() and exit_tasks_rcu_finish().  To ease diagnosis
    of this situation, print a stall warning message every rcu_task_stall_info
    period when rcu_tasks_postscan() is stalled.
    
    Reported-by: Mark Brown <broonie@kernel.org>
    Signed-off-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
    Signed-off-by: Paul E. McKenney <paulmck@kernel.org>

Comments

Mark Brown Jan. 13, 2023, 8:56 p.m. UTC | #1
On Wed, Jan 11, 2023 at 01:27:36PM -0800, Paul E. McKenney wrote:
> Hello, Mark,
> 
> A few days ago you mentioned stalls in RCU tasks.  Neeraj has supplied
> the following diagnostic patch, which will confirm or invalidate my
> assumptions about the cause of the stall.
> 
> Could you please try it out and let us know the outcome?

As I mentioned the other day I'm having trouble getting anything I build
to show the problem, I'm noticing it only for jobs submitted by KernelCI
to my test lab.  I'm still poking at it a bit, and trying to persuade
the automation to process the branch I have which should be able to
trigger on random commits which would hopefully also let me trigger
things that way, but at the minute anything I build myself appears to
run fine even when I try to use the same toolchain and so on.

Paul E. McKenney Jan. 13, 2023, 9:36 p.m. UTC | #2
On Fri, Jan 13, 2023 at 08:56:43PM +0000, Mark Brown wrote:
> On Wed, Jan 11, 2023 at 01:27:36PM -0800, Paul E. McKenney wrote:
> > Hello, Mark,
> > 
> > A few days ago you mentioned stalls in RCU tasks.  Neeraj has supplied
> > the following diagnostic patch, which will confirm or invalidate my
> > assumptions about the cause of the stall.
> > 
> > Could you please try it out and let us know the outcome?
> 
> As I mentioned the other day I'm having trouble getting anything I build
> to show the problem, I'm noticing it only for jobs submitted by KernelCI
> to my test lab.  I'm still poking at it a bit, and trying to persuade
> the automation to process the branch I have which should be able to
> trigger on random commits which would hopefully also let me trigger
> things that way, but at the minute anything I build myself appears to
> run fine even when I try to use the same toolchain and so on.

OK, apologies for the noise!  It will hit mainline at some point, and
then the KernelCI folks will have to work to avoid it.  ;-)

							Thanx, Paul

diff mbox series

Patch

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index bfb5e1549f2b2..53eb95748b4f0 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -139,6 +139,12 @@  static struct rcu_tasks rt_name =							\
 /* Track exiting tasks in order to allow them to be waited for. */
 DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
 
+#ifdef CONFIG_TASKS_RCU
+/* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */
+static void tasks_rcu_exit_srcu_stall(struct timer_list *unused);
+static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall);
+#endif
+
 /* Avoid IPIing CPUs early in the grace period. */
 #define RCU_TASK_IPI_DELAY (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) ? HZ / 2 : 0)
 static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY;
@@ -830,6 +836,11 @@  static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
 /* Processing between scanning taskslist and draining the holdout list. */
 static void rcu_tasks_postscan(struct list_head *hop)
 {
+	int rtsi = READ_ONCE(rcu_task_stall_info);
+
+	tasks_rcu_exit_srcu_stall_timer.expires = jiffies + rtsi;
+	add_timer(&tasks_rcu_exit_srcu_stall_timer);
+
 	/*
 	 * Exiting tasks may escape the tasklist scan. Those are vulnerable
 	 * until their final schedule() with TASK_DEAD state. To cope with
@@ -848,6 +859,7 @@  static void rcu_tasks_postscan(struct list_head *hop)
 	 * call to synchronize_rcu().
 	 */
 	synchronize_srcu(&tasks_rcu_exit_srcu);
+	del_timer_sync(&tasks_rcu_exit_srcu_stall_timer);
 }
 
 /* See if tasks are still holding out, complain if so. */
@@ -923,6 +935,18 @@  static void rcu_tasks_postgp(struct rcu_tasks *rtp)
 void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func);
 DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
 
+static void tasks_rcu_exit_srcu_stall(struct timer_list *unused)
+{
+	int rtsi = READ_ONCE(rcu_task_stall_info);
+
+	pr_info("%s: %s grace period number %lu (since boot) gp_state: %s is %lu jiffies old.\n",
+		__func__, rcu_tasks.kname, rcu_tasks.tasks_gp_seq,
+		tasks_gp_state_getname(&rcu_tasks), jiffies - rcu_tasks.gp_jiffies);
+	pr_info("Please check any exiting tasks stuck between calls to exit_tasks_rcu_start() and exit_tasks_rcu_finish()\n");
+	tasks_rcu_exit_srcu_stall_timer.expires = jiffies + rtsi;
+	add_timer(&tasks_rcu_exit_srcu_stall_timer);
+}
+
 /**
  * call_rcu_tasks() - Queue an RCU for invocation task-based grace period
  * @rhp: structure to be used for queueing the RCU updates.