diff mbox series

[RFC,03/11] sched: Add sched tracepoints for RV task model

Message ID 20250206080952.98478-4-gmonaco@redhat.com (mailing list archive)
State Superseded
Headers show
Series rv: Add scheduler specification monitors | expand

Commit Message

Gabriele Monaco Feb. 6, 2025, 8:09 a.m. UTC
Add the following tracepoints:
* sched_entry(bool preempt, ip)
    Called while entering __schedule
* sched_exit(bool is_switch, ip)
    Called while exiting __schedule
* sched_set_need_resched(task)
    Called when we set the need for reschedule
* sched_set_state(task, curr_state, state)
    Called when a task changes its state (to and from running)

These tracepoints are useful to describe the Linux task model and are
adapted from the patches by Daniel Bristot de Oliveira
(https://bristot.me/linux-task-model/).

Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
 include/linux/rv.h                 |  2 +-
 include/linux/sched.h              |  7 +++++++
 include/trace/events/sched.h       | 17 +++++++++++++++++
 kernel/sched/core.c                | 21 ++++++++++++++++++++-
 tools/verification/rv/include/rv.h |  2 +-
 5 files changed, 46 insertions(+), 3 deletions(-)

Comments

Peter Zijlstra Feb. 6, 2025, 8:19 a.m. UTC | #1
On Thu, Feb 06, 2025 at 09:09:39AM +0100, Gabriele Monaco wrote:

> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 9632e3318e0d6..af9fa18035c71 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -226,12 +226,14 @@ struct user_event_mm;
>  #define __set_current_state(state_value)				\
>  	do {								\
>  		debug_normal_state_change((state_value));		\
> +		trace_set_current_state(state_value);			\
>  		WRITE_ONCE(current->__state, (state_value));		\
>  	} while (0)
>  
>  #define set_current_state(state_value)					\
>  	do {								\
>  		debug_normal_state_change((state_value));		\
> +		trace_set_current_state(state_value);			\
>  		smp_store_mb(current->__state, (state_value));		\
>  	} while (0)
>  
> @@ -247,6 +249,7 @@ struct user_event_mm;
>  									\
>  		raw_spin_lock_irqsave(&current->pi_lock, flags);	\
>  		debug_special_state_change((state_value));		\
> +		trace_set_current_state(state_value);			\
>  		WRITE_ONCE(current->__state, (state_value));		\
>  		raw_spin_unlock_irqrestore(&current->pi_lock, flags);	\
>  	} while (0)
> @@ -282,6 +285,7 @@ struct user_event_mm;
>  		raw_spin_lock(&current->pi_lock);			\
>  		current->saved_state = current->__state;		\
>  		debug_rtlock_wait_set_state();				\
> +		trace_set_current_state(TASK_RTLOCK_WAIT);		\
>  		WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT);		\
>  		raw_spin_unlock(&current->pi_lock);			\
>  	} while (0);
> @@ -291,6 +295,7 @@ struct user_event_mm;
>  		lockdep_assert_irqs_disabled();				\
>  		raw_spin_lock(&current->pi_lock);			\
>  		debug_rtlock_wait_restore_state();			\
> +		trace_set_current_state(TASK_RUNNING);			\
>  		WRITE_ONCE(current->__state, current->saved_state);	\
>  		current->saved_state = TASK_RUNNING;			\
>  		raw_spin_unlock(&current->pi_lock);			\

> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 165c90ba64ea9..fb5f8aa61ef5d 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -491,6 +491,12 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
>  
>  #endif /* CONFIG_SCHED_CORE */
>  
> +void trace_set_current_state(int state_value)
> +{
> +	trace_sched_set_state_tp(current, current->__state, state_value);
> +}
> +EXPORT_SYMBOL(trace_set_current_state);

Urgh, why !?!
Gabriele Monaco Feb. 6, 2025, 8:36 a.m. UTC | #2
On Thu, 2025-02-06 at 09:19 +0100, Peter Zijlstra wrote:
> On Thu, Feb 06, 2025 at 09:09:39AM +0100, Gabriele Monaco wrote:
> 
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index 9632e3318e0d6..af9fa18035c71 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -226,12 +226,14 @@ struct user_event_mm;
> >  #define
> > __set_current_state(state_value)				\
> >  	do
> > {								\
> >  		debug_normal_state_change((state_value));	
> > 	\
> > +		trace_set_current_state(state_value);		
> > 	\
> >  		WRITE_ONCE(current->__state,
> > (state_value));		\
> >  	} while (0)
> >  
> >  #define
> > set_current_state(state_value)					\
> >  	do
> > {								\
> >  		debug_normal_state_change((state_value));	
> > 	\
> > +		trace_set_current_state(state_value);		
> > 	\
> >  		smp_store_mb(current->__state,
> > (state_value));		\
> >  	} while (0)
> >  
> > @@ -247,6 +249,7 @@ struct user_event_mm;
> >  								
> > 	\
> >  		raw_spin_lock_irqsave(&current->pi_lock,
> > flags);	\
> >  		debug_special_state_change((state_value));	
> > 	\
> > +		trace_set_current_state(state_value);		
> > 	\
> >  		WRITE_ONCE(current->__state,
> > (state_value));		\
> >  		raw_spin_unlock_irqrestore(&current->pi_lock,
> > flags);	\
> >  	} while (0)
> > @@ -282,6 +285,7 @@ struct user_event_mm;
> >  		raw_spin_lock(&current-
> > >pi_lock);			\
> >  		current->saved_state = current-
> > >__state;		\
> >  		debug_rtlock_wait_set_state();			
> > 	\
> > +		trace_set_current_state(TASK_RTLOCK_WAIT);	
> > 	\
> >  		WRITE_ONCE(current->__state,
> > TASK_RTLOCK_WAIT);		\
> >  		raw_spin_unlock(&current-
> > >pi_lock);			\
> >  	} while (0);
> > @@ -291,6 +295,7 @@ struct user_event_mm;
> >  		lockdep_assert_irqs_disabled();		
> > 		\
> >  		raw_spin_lock(&current-
> > >pi_lock);			\
> >  		debug_rtlock_wait_restore_state();		
> > 	\
> > +		trace_set_current_state(TASK_RUNNING);		
> > 	\
> >  		WRITE_ONCE(current->__state, current-
> > >saved_state);	\
> >  		current->saved_state =
> > TASK_RUNNING;			\
> >  		raw_spin_unlock(&current-
> > >pi_lock);			\
> 
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 165c90ba64ea9..fb5f8aa61ef5d 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -491,6 +491,12 @@ sched_core_dequeue(struct rq *rq, struct
> > task_struct *p, int flags) { }
> >  
> >  #endif /* CONFIG_SCHED_CORE */
> >  
> > +void trace_set_current_state(int state_value)
> > +{
> > +	trace_sched_set_state_tp(current, current->__state,
> > state_value);
> > +}
> > +EXPORT_SYMBOL(trace_set_current_state);
> 
> Urgh, why !?!
> 

Do you mean why exporting it?
At first I was puzzled too (this line is borrowed from Daniel), but the
thing is: set_current_state and friends are macros called by any sort
of code (e.g. modules), this seems the easiest way without messing up
with the current code.

I'm not sure if it would be cleaner to just drop this function and
define it directly in the header (including also trace/events/sched.h
there). It felt like files including sched shouldn't know about
tracepoints.

What do you think would be better?
Peter Zijlstra Feb. 6, 2025, 8:57 a.m. UTC | #3
On Thu, Feb 06, 2025 at 09:36:41AM +0100, Gabriele Monaco wrote:

> > > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > > index 165c90ba64ea9..fb5f8aa61ef5d 100644
> > > --- a/kernel/sched/core.c
> > > +++ b/kernel/sched/core.c
> > > @@ -491,6 +491,12 @@ sched_core_dequeue(struct rq *rq, struct
> > > task_struct *p, int flags) { }
> > >  
> > >  #endif /* CONFIG_SCHED_CORE */
> > >  
> > > +void trace_set_current_state(int state_value)
> > > +{
> > > +	trace_sched_set_state_tp(current, current->__state,
> > > state_value);
> > > +}
> > > +EXPORT_SYMBOL(trace_set_current_state);
> > 
> > Urgh, why !?!
> > 
> 
> Do you mean why exporting it?
> At first I was puzzled too (this line is borrowed from Daniel), but the
> thing is: set_current_state and friends are macros called by any sort
> of code (e.g. modules), this seems the easiest way without messing up
> with the current code.
> 
> I'm not sure if it would be cleaner to just drop this function and
> define it directly in the header (including also trace/events/sched.h
> there). It felt like files including sched shouldn't know about
> tracepoints.
> 
> What do you think would be better?

So I would think having the tracepoint in-line would be better. Because
as is, everything gets to have this pointless CALL to an empty function.

If this were x86_64 only, I would suggest using static_call(), but
barring that, the static_branch() already in the tracepoint is the best
we can do.
Gabriele Monaco Feb. 6, 2025, 11:47 a.m. UTC | #4
On Thu, 2025-02-06 at 09:57 +0100, Peter Zijlstra wrote:
> On Thu, Feb 06, 2025 at 09:36:41AM +0100, Gabriele Monaco wrote:
> 
> > > > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > > > index 165c90ba64ea9..fb5f8aa61ef5d 100644
> > > > --- a/kernel/sched/core.c
> > > > +++ b/kernel/sched/core.c
> > > > @@ -491,6 +491,12 @@ sched_core_dequeue(struct rq *rq, struct
> > > > task_struct *p, int flags) { }
> > > >  
> > > >  #endif /* CONFIG_SCHED_CORE */
> > > >  
> > > > +void trace_set_current_state(int state_value)
> > > > +{
> > > > +	trace_sched_set_state_tp(current, current->__state,
> > > > state_value);
> > > > +}
> > > > +EXPORT_SYMBOL(trace_set_current_state);
> > > 
> > > Urgh, why !?!
> > 
> > What do you think would be better?
> 
> So I would think having the tracepoint in-line would be better.
> Because
> as is, everything gets to have this pointless CALL to an empty
> function.
> 
> If this were x86_64 only, I would suggest using static_call(), but
> barring that, the static_branch() already in the tracepoint is the
> best
> we can do.
> 

Ok, I see your point now..

Adding the trace_ call inline seems far from trivial to me, but we
could indeed do what's suggested in tracepoint-defs.h and practically
use a static branch to call this trace_set_current_state, not sure if
this is already what you were suggesting, though.

Ignore the inconsistent naming, but something like this should work:

diff --git a/include/linux/sched.h b/include/linux/sched.h
index af9fa18035c7..7b9d84dbc2f5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -223,6 +223,12 @@ struct user_event_mm;
  *
  * Also see the comments of try_to_wake_up().
  */
+
+#define trace_set_current_state(state_value) do { \
+	if (tracepoint_enabled(sched_set_state_tp))   \
+		do_trace_set_current_state(state_value); \
+} while(0)
+
 #define __set_current_state(state_value)				\
 	do {								\
 		debug_normal_state_change((state_value));		\
@@ -332,7 +338,9 @@ extern void io_schedule_finish(int token);
 extern long io_schedule_timeout(long timeout);
 extern void io_schedule(void);
 
-extern void trace_set_current_state(int state_value);
+#include <linux/tracepoint-defs.h>
+DECLARE_TRACEPOINT(sched_set_state_tp);
+extern void do_trace_set_current_state(int state_value);
 
 /**
  * struct prev_cputime - snapshot of system and user cputime
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 149d55195532..9fc2be079bb5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -491,11 +491,12 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
 
 #endif /* CONFIG_SCHED_CORE */
 
-void trace_set_current_state(int state_value)
+void do_trace_set_current_state(int state_value)
 {
 	trace_sched_set_state_tp(current, current->__state, state_value);
 }
-EXPORT_SYMBOL(trace_set_current_state);
+EXPORT_SYMBOL(do_trace_set_current_state);
+EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp);
 
 /*
  * Serialization rules:
Steven Rostedt Feb. 6, 2025, 1:36 p.m. UTC | #5
On Thu, 06 Feb 2025 12:47:17 +0100
Gabriele Monaco <gmonaco@redhat.com> wrote:

> > > What do you think would be better?  
> > 
> > So I would think having the tracepoint in-line would be better.
> > Because
> > as is, everything gets to have this pointless CALL to an empty
> > function.
> > 
> > If this were x86_64 only, I would suggest using static_call(), but
> > barring that, the static_branch() already in the tracepoint is the
> > best
> > we can do.
> >   
> 
> Ok, I see your point now..
> 
> Adding the trace_ call inline seems far from trivial to me, but we
> could indeed do what's suggested in tracepoint-defs.h and practically
> use a static branch to call this trace_set_current_state, not sure if
> this is already what you were suggesting, though.

Right, due to the macro magic of how tracepoints are created, you can't
have them in header files. It causes too many side effects that can easily
break the build.

-- Steve
diff mbox series

Patch

diff --git a/include/linux/rv.h b/include/linux/rv.h
index 8883b41d88ec4..55d458be53a4c 100644
--- a/include/linux/rv.h
+++ b/include/linux/rv.h
@@ -7,7 +7,7 @@ 
 #ifndef _LINUX_RV_H
 #define _LINUX_RV_H
 
-#define MAX_DA_NAME_LEN	24
+#define MAX_DA_NAME_LEN	32
 
 #ifdef CONFIG_RV
 /*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9632e3318e0d6..af9fa18035c71 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -226,12 +226,14 @@  struct user_event_mm;
 #define __set_current_state(state_value)				\
 	do {								\
 		debug_normal_state_change((state_value));		\
+		trace_set_current_state(state_value);			\
 		WRITE_ONCE(current->__state, (state_value));		\
 	} while (0)
 
 #define set_current_state(state_value)					\
 	do {								\
 		debug_normal_state_change((state_value));		\
+		trace_set_current_state(state_value);			\
 		smp_store_mb(current->__state, (state_value));		\
 	} while (0)
 
@@ -247,6 +249,7 @@  struct user_event_mm;
 									\
 		raw_spin_lock_irqsave(&current->pi_lock, flags);	\
 		debug_special_state_change((state_value));		\
+		trace_set_current_state(state_value);			\
 		WRITE_ONCE(current->__state, (state_value));		\
 		raw_spin_unlock_irqrestore(&current->pi_lock, flags);	\
 	} while (0)
@@ -282,6 +285,7 @@  struct user_event_mm;
 		raw_spin_lock(&current->pi_lock);			\
 		current->saved_state = current->__state;		\
 		debug_rtlock_wait_set_state();				\
+		trace_set_current_state(TASK_RTLOCK_WAIT);		\
 		WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT);		\
 		raw_spin_unlock(&current->pi_lock);			\
 	} while (0);
@@ -291,6 +295,7 @@  struct user_event_mm;
 		lockdep_assert_irqs_disabled();				\
 		raw_spin_lock(&current->pi_lock);			\
 		debug_rtlock_wait_restore_state();			\
+		trace_set_current_state(TASK_RUNNING);			\
 		WRITE_ONCE(current->__state, current->saved_state);	\
 		current->saved_state = TASK_RUNNING;			\
 		raw_spin_unlock(&current->pi_lock);			\
@@ -327,6 +332,8 @@  extern void io_schedule_finish(int token);
 extern long io_schedule_timeout(long timeout);
 extern void io_schedule(void);
 
+extern void trace_set_current_state(int state_value);
+
 /**
  * struct prev_cputime - snapshot of system and user cputime
  * @utime: time spent in user mode
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 9ea4c404bd4ef..1ac0e23b0733d 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -824,6 +824,23 @@  DECLARE_TRACE(sched_compute_energy_tp,
 		 unsigned long max_util, unsigned long busy_time),
 	TP_ARGS(p, dst_cpu, energy, max_util, busy_time));
 
+DECLARE_TRACE(sched_entry_tp,
+	TP_PROTO(bool preempt, unsigned long ip),
+	TP_ARGS(preempt, ip));
+
+DECLARE_TRACE(sched_exit_tp,
+	TP_PROTO(bool is_switch, unsigned long ip),
+	TP_ARGS(is_switch, ip));
+
+DECLARE_TRACE(sched_set_need_resched_tp,
+	TP_PROTO(struct task_struct *tsk),
+	TP_ARGS(tsk));
+
+DECLARE_TRACE_CONDITION(sched_set_state_tp,
+	TP_PROTO(struct task_struct *tsk, int curr_state, int state),
+	TP_ARGS(tsk, curr_state, state),
+	TP_CONDITION(!!curr_state != !!state));
+
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 165c90ba64ea9..fb5f8aa61ef5d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -491,6 +491,12 @@  sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
 
 #endif /* CONFIG_SCHED_CORE */
 
+void trace_set_current_state(int state_value)
+{
+	trace_sched_set_state_tp(current, current->__state, state_value);
+}
+EXPORT_SYMBOL(trace_set_current_state);
+
 /*
  * Serialization rules:
  *
@@ -1103,6 +1109,8 @@  static void __resched_curr(struct rq *rq, int tif)
 
 	cpu = cpu_of(rq);
 
+	trace_sched_set_need_resched_tp(curr);
+
 	if (cpu == smp_processor_id()) {
 		set_ti_thread_flag(cti, tif);
 		if (tif == TIF_NEED_RESCHED)
@@ -5306,6 +5314,12 @@  asmlinkage __visible void schedule_tail(struct task_struct *prev)
 	 */
 
 	finish_task_switch(prev);
+	/*
+	 * This is a special case: the newly created task has just
+	 * switched the context for the fist time. It is returning from
+	 * schedule for the first time in this path.
+	 */
+	trace_sched_exit_tp(true, CALLER_ADDR0);
 	preempt_enable();
 
 	if (current->set_child_tid)
@@ -6649,12 +6663,15 @@  static void __sched notrace __schedule(int sched_mode)
 	 * as a preemption by schedule_debug() and RCU.
 	 */
 	bool preempt = sched_mode > SM_NONE;
+	bool is_switch = false;
 	unsigned long *switch_count;
 	unsigned long prev_state;
 	struct rq_flags rf;
 	struct rq *rq;
 	int cpu;
 
+	trace_sched_entry_tp(preempt, CALLER_ADDR0);
+
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
 	prev = rq->curr;
@@ -6722,7 +6739,8 @@  static void __sched notrace __schedule(int sched_mode)
 	rq->last_seen_need_resched_ns = 0;
 #endif
 
-	if (likely(prev != next)) {
+	is_switch = prev != next;
+	if (likely(is_switch)) {
 		rq->nr_switches++;
 		/*
 		 * RCU users of rcu_dereference(rq->curr) may not see
@@ -6767,6 +6785,7 @@  static void __sched notrace __schedule(int sched_mode)
 		__balance_callbacks(rq);
 		raw_spin_rq_unlock_irq(rq);
 	}
+	trace_sched_exit_tp(is_switch, CALLER_ADDR0);
 }
 
 void __noreturn do_task_dead(void)
diff --git a/tools/verification/rv/include/rv.h b/tools/verification/rv/include/rv.h
index 770fd6da36107..0cab1037a98f7 100644
--- a/tools/verification/rv/include/rv.h
+++ b/tools/verification/rv/include/rv.h
@@ -1,7 +1,7 @@ 
 // SPDX-License-Identifier: GPL-2.0
 
 #define MAX_DESCRIPTION 1024
-#define MAX_DA_NAME_LEN	24
+#define MAX_DA_NAME_LEN	32
 
 struct monitor {
 	char name[MAX_DA_NAME_LEN];