diff mbox series

[v1,03/11] sched: Add sched tracepoints for RV task model

Message ID 20250211074622.58590-4-gmonaco@redhat.com (mailing list archive)
State Superseded
Headers show
Series rv: Add scheduler specification monitors | expand

Commit Message

Gabriele Monaco Feb. 11, 2025, 7:46 a.m. UTC
Add the following tracepoints:
* sched_entry(bool preempt, ip)
    Called while entering __schedule
* sched_exit(bool is_switch, ip)
    Called while exiting __schedule
* sched_set_state(task, curr_state, state)
    Called when a task changes its state (to and from running)

These tracepoints are useful to describe the Linux task model and are
adapted from the patches by Daniel Bristot de Oliveira
(https://bristot.me/linux-task-model/).

Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
 include/linux/rv.h                 |  2 +-
 include/linux/sched.h              | 16 ++++++++++++++++
 include/trace/events/sched.h       | 13 +++++++++++++
 kernel/sched/core.c                | 22 +++++++++++++++++++++-
 tools/verification/rv/include/rv.h |  2 +-
 5 files changed, 52 insertions(+), 3 deletions(-)

Comments

Peter Zijlstra Feb. 11, 2025, 11:03 a.m. UTC | #1
On Tue, Feb 11, 2025 at 08:46:10AM +0100, Gabriele Monaco wrote:

> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 9632e3318e0d6..9ff0658095240 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -46,6 +46,7 @@
>  #include <linux/rv.h>
>  #include <linux/livepatch_sched.h>
>  #include <linux/uidgid_types.h>
> +#include <linux/tracepoint-defs.h>
>  #include <asm/kmap_size.h>
>  
>  /* task_struct member predeclarations (sorted alphabetically): */
> @@ -186,6 +187,12 @@ struct user_event_mm;
>  # define debug_rtlock_wait_restore_state()	do { } while (0)
>  #endif
>  
> +#define trace_set_current_state(state_value)                     \
> +	do {                                                     \
> +		if (tracepoint_enabled(sched_set_state_tp))      \
> +			do_trace_set_current_state(state_value); \
> +	} while (0)

Yeah, this is much nicer, thanks!

>  /*
>   * set_current_state() includes a barrier so that the write of current->__state
>   * is correctly serialised wrt the caller's subsequent test of whether to
> @@ -226,12 +233,14 @@ struct user_event_mm;
>  #define __set_current_state(state_value)				\
>  	do {								\
>  		debug_normal_state_change((state_value));		\
> +		trace_set_current_state(state_value);			\
>  		WRITE_ONCE(current->__state, (state_value));		\
>  	} while (0)
>  
>  #define set_current_state(state_value)					\
>  	do {								\
>  		debug_normal_state_change((state_value));		\
> +		trace_set_current_state(state_value);			\
>  		smp_store_mb(current->__state, (state_value));		\
>  	} while (0)
>  
> @@ -247,6 +256,7 @@ struct user_event_mm;
>  									\
>  		raw_spin_lock_irqsave(&current->pi_lock, flags);	\
>  		debug_special_state_change((state_value));		\
> +		trace_set_current_state(state_value);			\
>  		WRITE_ONCE(current->__state, (state_value));		\
>  		raw_spin_unlock_irqrestore(&current->pi_lock, flags);	\
>  	} while (0)
> @@ -282,6 +292,7 @@ struct user_event_mm;
>  		raw_spin_lock(&current->pi_lock);			\
>  		current->saved_state = current->__state;		\
>  		debug_rtlock_wait_set_state();				\
> +		trace_set_current_state(TASK_RTLOCK_WAIT);		\
>  		WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT);		\
>  		raw_spin_unlock(&current->pi_lock);			\
>  	} while (0);
> @@ -291,6 +302,7 @@ struct user_event_mm;
>  		lockdep_assert_irqs_disabled();				\
>  		raw_spin_lock(&current->pi_lock);			\
>  		debug_rtlock_wait_restore_state();			\
> +		trace_set_current_state(TASK_RUNNING);			\
>  		WRITE_ONCE(current->__state, current->saved_state);	\
>  		current->saved_state = TASK_RUNNING;			\
>  		raw_spin_unlock(&current->pi_lock);			\


> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 165c90ba64ea9..9e33728bb57e8 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -491,6 +491,15 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
>  
>  #endif /* CONFIG_SCHED_CORE */
>  
> +/* need a wrapper since we may need to trace from modules */
> +EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp);

_GPL

> +
> +void do_trace_set_current_state(int state_value)
> +{
> +	trace_sched_set_state_tp(current, current->__state, state_value);

Should this be:

	__do_trace_sched_set_state_tp() ?

> +}
> +EXPORT_SYMBOL(do_trace_set_current_state);

_GPL
Gabriele Monaco Feb. 11, 2025, 12:54 p.m. UTC | #2
On Tue, 2025-02-11 at 12:03 +0100, Peter Zijlstra wrote:
> On Tue, Feb 11, 2025 at 08:46:10AM +0100, Gabriele Monaco wrote:
> 
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index 9632e3318e0d6..9ff0658095240 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -46,6 +46,7 @@
> >  #include <linux/rv.h>
> >  #include <linux/livepatch_sched.h>
> >  #include <linux/uidgid_types.h>
> > +#include <linux/tracepoint-defs.h>
> >  #include <asm/kmap_size.h>
> >  
> >  /* task_struct member predeclarations (sorted alphabetically): */
> > @@ -186,6 +187,12 @@ struct user_event_mm;
> >  # define debug_rtlock_wait_restore_state() do { } while (0)
> >  #endif
> >  
> > +#define trace_set_current_state(state_value)                     \
> > + do {                                                     \
> > + if (tracepoint_enabled(sched_set_state_tp))      \
> > + do_trace_set_current_state(state_value); \
> > + } while (0)
> 
> Yeah, this is much nicer, thanks!
> 
> >  /*
> >   * set_current_state() includes a barrier so that the write of
> > current->__state
> >   * is correctly serialised wrt the caller's subsequent test of
> > whether to
> > @@ -226,12 +233,14 @@ struct user_event_mm;
> >  #define __set_current_state(state_value) \
> >   do { \
> >   debug_normal_state_change((state_value)); \
> > + trace_set_current_state(state_value); \
> >   WRITE_ONCE(current->__state, (state_value)); \
> >   } while (0)
> >  
> >  #define set_current_state(state_value) \
> >   do { \
> >   debug_normal_state_change((state_value)); \
> > + trace_set_current_state(state_value); \
> >   smp_store_mb(current->__state, (state_value)); \
> >   } while (0)
> >  
> > @@ -247,6 +256,7 @@ struct user_event_mm;
> >   \
> >   raw_spin_lock_irqsave(&current->pi_lock, flags); \
> >   debug_special_state_change((state_value)); \
> > + trace_set_current_state(state_value); \
> >   WRITE_ONCE(current->__state, (state_value)); \
> >   raw_spin_unlock_irqrestore(&current->pi_lock, flags); \
> >   } while (0)
> > @@ -282,6 +292,7 @@ struct user_event_mm;
> >   raw_spin_lock(&current->pi_lock); \
> >   current->saved_state = current->__state; \
> >   debug_rtlock_wait_set_state(); \
> > + trace_set_current_state(TASK_RTLOCK_WAIT); \
> >   WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT); \
> >   raw_spin_unlock(&current->pi_lock); \
> >   } while (0);
> > @@ -291,6 +302,7 @@ struct user_event_mm;
> >   lockdep_assert_irqs_disabled(); \
> >   raw_spin_lock(&current->pi_lock); \
> >   debug_rtlock_wait_restore_state(); \
> > + trace_set_current_state(TASK_RUNNING); \
> >   WRITE_ONCE(current->__state, current->saved_state); \
> >   current->saved_state = TASK_RUNNING; \
> >   raw_spin_unlock(&current->pi_lock); \
> 
> 
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 165c90ba64ea9..9e33728bb57e8 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -491,6 +491,15 @@ sched_core_dequeue(struct rq *rq, struct
> > task_struct *p, int flags) { }
> >  
> >  #endif /* CONFIG_SCHED_CORE */
> >  
> > +/* need a wrapper since we may need to trace from modules */
> > +EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp);
> 
> _GPL
> 
> > +
> > +void do_trace_set_current_state(int state_value)
> > +{
> > + trace_sched_set_state_tp(current, current->__state, state_value);
> 
> Should this be:
> 
>  __do_trace_sched_set_state_tp() ?
> 

Mmh, you mean avoiding the static_branch_unlikely in the helper
function, since it's supposed to be used before calling it?

I checked all references of tracepoint-defs.h's tracepoint_enabled (the
macro actually doing this static_branch_unlikely) and it seems they
never use the __do_trace_ function but only call trace_, in fact
repeating the branch.

Your suggestion seems definitely cleaner but I wonder if that can have
unwanted consequences (we are exposing a function unconditionally
calling the tracepoint) and for that reason it was preferred not to do
it in all other occurrences?

I'm not sure if saving one branch would justify the change only in this
case. Thoughts?

> > +}
> > +EXPORT_SYMBOL(do_trace_set_current_state);
> 
> _GPL
> 

I'm absolutely not against this change but, out of curiosity, would
that imply non-GPL modules are not going to be able to sleep going
forward? At least not using this pattern.

Thanks,
Gabriele
Peter Zijlstra Feb. 11, 2025, 2:37 p.m. UTC | #3
On Tue, Feb 11, 2025 at 01:54:44PM +0100, Gabriele Monaco wrote:

> > > +void do_trace_set_current_state(int state_value)
> > > +{
> > > + trace_sched_set_state_tp(current, current->__state, state_value);
> > 
> > Should this be:
> > 
> >  __do_trace_sched_set_state_tp() ?
> > 
> 
> Mmh, you mean avoiding the static_branch_unlikely in the helper
> function, since it's supposed to be used before calling it?

Yep, seems pointless to repeat that.

> > > +}
> > > +EXPORT_SYMBOL(do_trace_set_current_state);
> > 
> > _GPL
> > 
> 
> I'm absolutely not against this change but, out of curiosity, would
> that imply non-GPL modules are not going to be able to sleep going
> forward? At least not using this pattern.

Bah, you're right. Killing non-GPL modules seems like a worthy goal
though, but perhaps another day ;-)
diff mbox series

Patch

diff --git a/include/linux/rv.h b/include/linux/rv.h
index 8883b41d88ec4..55d458be53a4c 100644
--- a/include/linux/rv.h
+++ b/include/linux/rv.h
@@ -7,7 +7,7 @@ 
 #ifndef _LINUX_RV_H
 #define _LINUX_RV_H
 
-#define MAX_DA_NAME_LEN	24
+#define MAX_DA_NAME_LEN	32
 
 #ifdef CONFIG_RV
 /*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9632e3318e0d6..9ff0658095240 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -46,6 +46,7 @@ 
 #include <linux/rv.h>
 #include <linux/livepatch_sched.h>
 #include <linux/uidgid_types.h>
+#include <linux/tracepoint-defs.h>
 #include <asm/kmap_size.h>
 
 /* task_struct member predeclarations (sorted alphabetically): */
@@ -186,6 +187,12 @@  struct user_event_mm;
 # define debug_rtlock_wait_restore_state()	do { } while (0)
 #endif
 
+#define trace_set_current_state(state_value)                     \
+	do {                                                     \
+		if (tracepoint_enabled(sched_set_state_tp))      \
+			do_trace_set_current_state(state_value); \
+	} while (0)
+
 /*
  * set_current_state() includes a barrier so that the write of current->__state
  * is correctly serialised wrt the caller's subsequent test of whether to
@@ -226,12 +233,14 @@  struct user_event_mm;
 #define __set_current_state(state_value)				\
 	do {								\
 		debug_normal_state_change((state_value));		\
+		trace_set_current_state(state_value);			\
 		WRITE_ONCE(current->__state, (state_value));		\
 	} while (0)
 
 #define set_current_state(state_value)					\
 	do {								\
 		debug_normal_state_change((state_value));		\
+		trace_set_current_state(state_value);			\
 		smp_store_mb(current->__state, (state_value));		\
 	} while (0)
 
@@ -247,6 +256,7 @@  struct user_event_mm;
 									\
 		raw_spin_lock_irqsave(&current->pi_lock, flags);	\
 		debug_special_state_change((state_value));		\
+		trace_set_current_state(state_value);			\
 		WRITE_ONCE(current->__state, (state_value));		\
 		raw_spin_unlock_irqrestore(&current->pi_lock, flags);	\
 	} while (0)
@@ -282,6 +292,7 @@  struct user_event_mm;
 		raw_spin_lock(&current->pi_lock);			\
 		current->saved_state = current->__state;		\
 		debug_rtlock_wait_set_state();				\
+		trace_set_current_state(TASK_RTLOCK_WAIT);		\
 		WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT);		\
 		raw_spin_unlock(&current->pi_lock);			\
 	} while (0);
@@ -291,6 +302,7 @@  struct user_event_mm;
 		lockdep_assert_irqs_disabled();				\
 		raw_spin_lock(&current->pi_lock);			\
 		debug_rtlock_wait_restore_state();			\
+		trace_set_current_state(TASK_RUNNING);			\
 		WRITE_ONCE(current->__state, current->saved_state);	\
 		current->saved_state = TASK_RUNNING;			\
 		raw_spin_unlock(&current->pi_lock);			\
@@ -327,6 +339,10 @@  extern void io_schedule_finish(int token);
 extern long io_schedule_timeout(long timeout);
 extern void io_schedule(void);
 
+/* wrapper function to trace from this header file */
+DECLARE_TRACEPOINT(sched_set_state_tp);
+extern void do_trace_set_current_state(int state_value);
+
 /**
  * struct prev_cputime - snapshot of system and user cputime
  * @utime: time spent in user mode
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 9ea4c404bd4ef..cc3be04fe9986 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -824,6 +824,19 @@  DECLARE_TRACE(sched_compute_energy_tp,
 		 unsigned long max_util, unsigned long busy_time),
 	TP_ARGS(p, dst_cpu, energy, max_util, busy_time));
 
+DECLARE_TRACE(sched_entry_tp,
+	TP_PROTO(bool preempt, unsigned long ip),
+	TP_ARGS(preempt, ip));
+
+DECLARE_TRACE(sched_exit_tp,
+	TP_PROTO(bool is_switch, unsigned long ip),
+	TP_ARGS(is_switch, ip));
+
+DECLARE_TRACE_CONDITION(sched_set_state_tp,
+	TP_PROTO(struct task_struct *tsk, int curr_state, int state),
+	TP_ARGS(tsk, curr_state, state),
+	TP_CONDITION(!!curr_state != !!state));
+
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 165c90ba64ea9..9e33728bb57e8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -491,6 +491,15 @@  sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
 
 #endif /* CONFIG_SCHED_CORE */
 
+/* need a wrapper since we may need to trace from modules */
+EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp);
+
+void do_trace_set_current_state(int state_value)
+{
+	trace_sched_set_state_tp(current, current->__state, state_value);
+}
+EXPORT_SYMBOL(do_trace_set_current_state);
+
 /*
  * Serialization rules:
  *
@@ -5306,6 +5315,12 @@  asmlinkage __visible void schedule_tail(struct task_struct *prev)
 	 */
 
 	finish_task_switch(prev);
+	/*
+	 * This is a special case: the newly created task has just
+	 * switched the context for the first time. It is returning from
+	 * schedule for the first time in this path.
+	 */
+	trace_sched_exit_tp(true, CALLER_ADDR0);
 	preempt_enable();
 
 	if (current->set_child_tid)
@@ -6649,12 +6664,15 @@  static void __sched notrace __schedule(int sched_mode)
 	 * as a preemption by schedule_debug() and RCU.
 	 */
 	bool preempt = sched_mode > SM_NONE;
+	bool is_switch = false;
 	unsigned long *switch_count;
 	unsigned long prev_state;
 	struct rq_flags rf;
 	struct rq *rq;
 	int cpu;
 
+	trace_sched_entry_tp(preempt, CALLER_ADDR0);
+
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
 	prev = rq->curr;
@@ -6722,7 +6740,8 @@  static void __sched notrace __schedule(int sched_mode)
 	rq->last_seen_need_resched_ns = 0;
 #endif
 
-	if (likely(prev != next)) {
+	is_switch = prev != next;
+	if (likely(is_switch)) {
 		rq->nr_switches++;
 		/*
 		 * RCU users of rcu_dereference(rq->curr) may not see
@@ -6767,6 +6786,7 @@  static void __sched notrace __schedule(int sched_mode)
 		__balance_callbacks(rq);
 		raw_spin_rq_unlock_irq(rq);
 	}
+	trace_sched_exit_tp(is_switch, CALLER_ADDR0);
 }
 
 void __noreturn do_task_dead(void)
diff --git a/tools/verification/rv/include/rv.h b/tools/verification/rv/include/rv.h
index 770fd6da36107..0cab1037a98f7 100644
--- a/tools/verification/rv/include/rv.h
+++ b/tools/verification/rv/include/rv.h
@@ -1,7 +1,7 @@ 
 // SPDX-License-Identifier: GPL-2.0
 
 #define MAX_DESCRIPTION 1024
-#define MAX_DA_NAME_LEN	24
+#define MAX_DA_NAME_LEN	32
 
 struct monitor {
 	char name[MAX_DA_NAME_LEN];