diff mbox series

[RFC,v4,5/7] sched, tracing: add to report task state in symbolic chars

Message ID 20230802121116.324604-6-zegao@tencent.com (mailing list archive)
State Superseded
Headers show
Series fix task state report from sched tracepoint | expand

Commit Message

Ze Gao Aug. 2, 2023, 12:10 p.m. UTC
From: Ze Gao <zegao2021@gmail.com>

Internal representations of task state are likely to be changed
or ordered, and reporting them to userspace without exporting
them as part of API is basically wrong, which can easily break
a userspace observability tool as kernel evolves. For example,
perf suffers from this and still reports wrong states as of this
writing.

OTOH, some masqueraded states like TASK_REPORT_IDLE and
TASK_REPORT_MAX are also reported inadvertently, which confuses
things even more and most userspace tools do not even take them
into consideration.

So add a new variable in company with the old raw value to
report task state in symbolic chars, which are self-explaining
and no further translation is needed. Of course this does not
break any userspace tool.

Note for PREEMPT_ACTIVE, we introduce 'p' to report it and use
the old conventions for the rest.

Signed-off-by: Ze Gao <zegao@tencent.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Acked-by: Ian Rogers <irogers@google.com>
---
 include/trace/events/sched.h | 44 ++++++++++++++++++++++--------------
 1 file changed, 27 insertions(+), 17 deletions(-)

Comments

Steven Rostedt Aug. 2, 2023, 3:07 p.m. UTC | #1
On Wed,  2 Aug 2023 08:10:00 -0400
Ze Gao <zegao2021@gmail.com> wrote:

> From: Ze Gao <zegao2021@gmail.com>
> 
> Internal representations of task state are likely to be changed
> or ordered, and reporting them to userspace without exporting
> them as part of API is basically wrong, which can easily break
> a userspace observability tool as kernel evolves. For example,
> perf suffers from this and still reports wrong states as of this
> writing.
> 
> OTOH, some masqueraded states like TASK_REPORT_IDLE and
> TASK_REPORT_MAX are also reported inadvertently, which confuses
> things even more and most userspace tools do not even take them
> into consideration.
> 
> So add a new variable in company with the old raw value to
> report task state in symbolic chars, which are self-explaining
> and no further translation is needed. Of course this does not
> break any userspace tool.
> 
> Note for PREEMPT_ACTIVE, we introduce 'p' to report it and use
> the old conventions for the rest.
> 
> Signed-off-by: Ze Gao <zegao@tencent.com>
> Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> Acked-by: Ian Rogers <irogers@google.com>
> ---
>  include/trace/events/sched.h | 44 ++++++++++++++++++++++--------------
>  1 file changed, 27 insertions(+), 17 deletions(-)
> 
> diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
> index 7d34db20b2c6..1c7b94793495 100644
> --- a/include/trace/events/sched.h
> +++ b/include/trace/events/sched.h
> @@ -6,6 +6,7 @@
>  #define _TRACE_SCHED_H
>  
>  #include <linux/kthread.h>
> +#include <linux/sched.h>
>  #include <linux/sched/numa_balancing.h>
>  #include <linux/tracepoint.h>
>  #include <linux/binfmts.h>
> @@ -214,6 +215,27 @@ static inline int __trace_sched_switch_state(bool preempt,
>  
>  	return state ? (1 << (state - 1)) : state;
>  }
> +
> +static inline char __trace_sched_switch_state_char(bool preempt,
> +						   unsigned int prev_state,
> +						   struct task_struct *p)
> +{
> +	long state;
> +
> +#ifdef CONFIG_SCHED_DEBUG
> +	BUG_ON(p != current);

BUG? Why not WARN_ON()?

-- Steve

> +#endif /* CONFIG_SCHED_DEBUG */
> +
> +	/*
> +	 * For PREEMPT_ACTIVE, we introduce 'p' to report it and use the old
> +	 * conventions for the rest.
> +	 */
> +	if (preempt)
> +		return 'p';
> +
> +	state = __task_state_index(prev_state, p->exit_state);
> +	return task_index_to_char(state);
> +}
>  #endif /* CREATE_TRACE_POINTS */
>  
>  /*
> @@ -236,6 +258,7 @@ TRACE_EVENT(sched_switch,
>  		__field(	int,	prev_state			)
>  		__array(	char,	prev_comm,	TASK_COMM_LEN	)
>  		__array(	char,	next_comm,	TASK_COMM_LEN	)
> +		__field(	char,	prev_state_char			)
>  	),
>  
>  	TP_fast_assign(
> @@ -246,26 +269,13 @@ TRACE_EVENT(sched_switch,
>  		__entry->prev_state		= __trace_sched_switch_state(preempt, prev_state, prev);
>  		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
>  		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
> +		__entry->prev_state_char	= __trace_sched_switch_state_char(preempt, prev_state, prev);
>  		/* XXX SCHED_DEADLINE */
>  	),
>  
> -	TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
> -		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
> -
> -		(__entry->prev_state & (TASK_REPORT_MAX - 1)) ?
> -		  __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|",
> -				{ TASK_INTERRUPTIBLE, "S" },
> -				{ TASK_UNINTERRUPTIBLE, "D" },
> -				{ __TASK_STOPPED, "T" },
> -				{ __TASK_TRACED, "t" },
> -				{ EXIT_DEAD, "X" },
> -				{ EXIT_ZOMBIE, "Z" },
> -				{ TASK_PARKED, "P" },
> -				{ TASK_DEAD, "I" }) :
> -		  "R",
> -
> -		__entry->prev_state & TASK_REPORT_MAX ? "+" : "",
> -		__entry->next_comm, __entry->next_pid, __entry->next_prio)
> +	TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%c ==> next_comm=%s next_pid=%d next_prio=%d",
> +		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio, __entry->prev_state_char, __entry->next_comm,
> +		__entry->next_pid, __entry->next_prio)
>  );
>  
>  /*
Ze Gao Aug. 3, 2023, 2:38 a.m. UTC | #2
On Wed, Aug 2, 2023 at 11:07 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Wed,  2 Aug 2023 08:10:00 -0400
> Ze Gao <zegao2021@gmail.com> wrote:
>
> > From: Ze Gao <zegao2021@gmail.com>
> >
> > Internal representations of task state are likely to be changed
> > or ordered, and reporting them to userspace without exporting
> > them as part of API is basically wrong, which can easily break
> > a userspace observability tool as kernel evolves. For example,
> > perf suffers from this and still reports wrong states as of this
> > writing.
> >
> > OTOH, some masqueraded states like TASK_REPORT_IDLE and
> > TASK_REPORT_MAX are also reported inadvertently, which confuses
> > things even more and most userspace tools do not even take them
> > into consideration.
> >
> > So add a new variable in company with the old raw value to
> > report task state in symbolic chars, which are self-explaining
> > and no further translation is needed. Of course this does not
> > break any userspace tool.
> >
> > Note for PREEMPT_ACTIVE, we introduce 'p' to report it and use
> > the old conventions for the rest.
> >
> > Signed-off-by: Ze Gao <zegao@tencent.com>
> > Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> > Acked-by: Ian Rogers <irogers@google.com>
> > ---
> >  include/trace/events/sched.h | 44 ++++++++++++++++++++++--------------
> >  1 file changed, 27 insertions(+), 17 deletions(-)
> >
> > diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
> > index 7d34db20b2c6..1c7b94793495 100644
> > --- a/include/trace/events/sched.h
> > +++ b/include/trace/events/sched.h
> > @@ -6,6 +6,7 @@
> >  #define _TRACE_SCHED_H
> >
> >  #include <linux/kthread.h>
> > +#include <linux/sched.h>
> >  #include <linux/sched/numa_balancing.h>
> >  #include <linux/tracepoint.h>
> >  #include <linux/binfmts.h>
> > @@ -214,6 +215,27 @@ static inline int __trace_sched_switch_state(bool preempt,
> >
> >       return state ? (1 << (state - 1)) : state;
> >  }
> > +
> > +static inline char __trace_sched_switch_state_char(bool preempt,
> > +                                                unsigned int prev_state,
> > +                                                struct task_struct *p)
> > +{
> > +     long state;
> > +
> > +#ifdef CONFIG_SCHED_DEBUG
> > +     BUG_ON(p != current);
>
> BUG? Why not WARN_ON()?

I directly copied it from __trace_sched_switch_state since they
are very similar. I had doubt on this too but decided to keep it in
case people want to be 100% sure that the current task is exactly
the one that is being switched, otherwise it's a fatal problem for
scheduler at the point where trace_sched_switch is called.

If you think WARN_ON_ONCE is more appropriate, I can fix both
in v6.

Thoughts?

Regards,
Ze
Steven Rostedt Aug. 3, 2023, 2:48 a.m. UTC | #3
On Thu, 3 Aug 2023 10:38:43 +0800
Ze Gao <zegao2021@gmail.com> wrote:

> > > +static inline char __trace_sched_switch_state_char(bool preempt,
> > > +                                                unsigned int prev_state,
> > > +                                                struct task_struct *p)
> > > +{
> > > +     long state;
> > > +
> > > +#ifdef CONFIG_SCHED_DEBUG
> > > +     BUG_ON(p != current);  
> >
> > BUG? Why not WARN_ON()?  
> 
> I directly copied it from __trace_sched_switch_state since they
> are very similar. I had doubt on this too but decided to keep it in
> case people want to be 100% sure that the current task is exactly
> the one that is being switched, otherwise it's a fatal problem for
> scheduler at the point where trace_sched_switch is called.
> 
> If you think WARN_ON_ONCE is more appropriate, I can fix both
> in v6.
> 
> Thoughts?

Yeah, that BUG_ON is from 2014, and really should not BUG.

I'd fix both.

Thanks!

-- Steve
diff mbox series

Patch

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 7d34db20b2c6..1c7b94793495 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -6,6 +6,7 @@ 
 #define _TRACE_SCHED_H
 
 #include <linux/kthread.h>
+#include <linux/sched.h>
 #include <linux/sched/numa_balancing.h>
 #include <linux/tracepoint.h>
 #include <linux/binfmts.h>
@@ -214,6 +215,27 @@  static inline int __trace_sched_switch_state(bool preempt,
 
 	return state ? (1 << (state - 1)) : state;
 }
+
+static inline char __trace_sched_switch_state_char(bool preempt,
+						   unsigned int prev_state,
+						   struct task_struct *p)
+{
+	long state;
+
+#ifdef CONFIG_SCHED_DEBUG
+	BUG_ON(p != current);
+#endif /* CONFIG_SCHED_DEBUG */
+
+	/*
+	 * For PREEMPT_ACTIVE, we introduce 'p' to report it and use the old
+	 * conventions for the rest.
+	 */
+	if (preempt)
+		return 'p';
+
+	state = __task_state_index(prev_state, p->exit_state);
+	return task_index_to_char(state);
+}
 #endif /* CREATE_TRACE_POINTS */
 
 /*
@@ -236,6 +258,7 @@  TRACE_EVENT(sched_switch,
 		__field(	int,	prev_state			)
 		__array(	char,	prev_comm,	TASK_COMM_LEN	)
 		__array(	char,	next_comm,	TASK_COMM_LEN	)
+		__field(	char,	prev_state_char			)
 	),
 
 	TP_fast_assign(
@@ -246,26 +269,13 @@  TRACE_EVENT(sched_switch,
 		__entry->prev_state		= __trace_sched_switch_state(preempt, prev_state, prev);
 		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
 		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
+		__entry->prev_state_char	= __trace_sched_switch_state_char(preempt, prev_state, prev);
 		/* XXX SCHED_DEADLINE */
 	),
 
-	TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
-		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
-
-		(__entry->prev_state & (TASK_REPORT_MAX - 1)) ?
-		  __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|",
-				{ TASK_INTERRUPTIBLE, "S" },
-				{ TASK_UNINTERRUPTIBLE, "D" },
-				{ __TASK_STOPPED, "T" },
-				{ __TASK_TRACED, "t" },
-				{ EXIT_DEAD, "X" },
-				{ EXIT_ZOMBIE, "Z" },
-				{ TASK_PARKED, "P" },
-				{ TASK_DEAD, "I" }) :
-		  "R",
-
-		__entry->prev_state & TASK_REPORT_MAX ? "+" : "",
-		__entry->next_comm, __entry->next_pid, __entry->next_prio)
+	TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%c ==> next_comm=%s next_pid=%d next_prio=%d",
+		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio, __entry->prev_state_char, __entry->next_comm,
+		__entry->next_pid, __entry->next_prio)
 );
 
 /*