Message ID | 20230802121116.324604-6-zegao@tencent.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | fix task state report from sched tracepoint | expand |
On Wed, 2 Aug 2023 08:10:00 -0400 Ze Gao <zegao2021@gmail.com> wrote: > From: Ze Gao <zegao2021@gmail.com> > > Internal representations of task state are likely to be changed > or ordered, and reporting them to userspace without exporting > them as part of API is basically wrong, which can easily break > a userspace observability tool as kernel evolves. For example, > perf suffers from this and still reports wrong states as of this > writing. > > OTOH, some masqueraded states like TASK_REPORT_IDLE and > TASK_REPORT_MAX are also reported inadvertently, which confuses > things even more and most userspace tools do not even take them > into consideration. > > So add a new variable in company with the old raw value to > report task state in symbolic chars, which are self-explaining > and no further translation is needed. Of course this does not > break any userspace tool. > > Note for PREEMPT_ACTIVE, we introduce 'p' to report it and use > the old conventions for the rest. > > Signed-off-by: Ze Gao <zegao@tencent.com> > Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> > Acked-by: Ian Rogers <irogers@google.com> > --- > include/trace/events/sched.h | 44 ++++++++++++++++++++++-------------- > 1 file changed, 27 insertions(+), 17 deletions(-) > > diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h > index 7d34db20b2c6..1c7b94793495 100644 > --- a/include/trace/events/sched.h > +++ b/include/trace/events/sched.h > @@ -6,6 +6,7 @@ > #define _TRACE_SCHED_H > > #include <linux/kthread.h> > +#include <linux/sched.h> > #include <linux/sched/numa_balancing.h> > #include <linux/tracepoint.h> > #include <linux/binfmts.h> > @@ -214,6 +215,27 @@ static inline int __trace_sched_switch_state(bool preempt, > > return state ? (1 << (state - 1)) : state; > } > + > +static inline char __trace_sched_switch_state_char(bool preempt, > + unsigned int prev_state, > + struct task_struct *p) > +{ > + long state; > + > +#ifdef CONFIG_SCHED_DEBUG > + BUG_ON(p != current); BUG? Why not WARN_ON()? -- Steve > +#endif /* CONFIG_SCHED_DEBUG */ > + > + /* > + * For PREEMPT_ACTIVE, we introduce 'p' to report it and use the old > + * conventions for the rest. > + */ > + if (preempt) > + return 'p'; > + > + state = __task_state_index(prev_state, p->exit_state); > + return task_index_to_char(state); > +} > #endif /* CREATE_TRACE_POINTS */ > > /* > @@ -236,6 +258,7 @@ TRACE_EVENT(sched_switch, > __field( int, prev_state ) > __array( char, prev_comm, TASK_COMM_LEN ) > __array( char, next_comm, TASK_COMM_LEN ) > + __field( char, prev_state_char ) > ), > > TP_fast_assign( > @@ -246,26 +269,13 @@ TRACE_EVENT(sched_switch, > __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev); > memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); > memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); > + __entry->prev_state_char = __trace_sched_switch_state_char(preempt, prev_state, prev); > /* XXX SCHED_DEADLINE */ > ), > > - TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", > - __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, > - > - (__entry->prev_state & (TASK_REPORT_MAX - 1)) ? > - __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|", > - { TASK_INTERRUPTIBLE, "S" }, > - { TASK_UNINTERRUPTIBLE, "D" }, > - { __TASK_STOPPED, "T" }, > - { __TASK_TRACED, "t" }, > - { EXIT_DEAD, "X" }, > - { EXIT_ZOMBIE, "Z" }, > - { TASK_PARKED, "P" }, > - { TASK_DEAD, "I" }) : > - "R", > - > - __entry->prev_state & TASK_REPORT_MAX ? "+" : "", > - __entry->next_comm, __entry->next_pid, __entry->next_prio) > + TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%c ==> next_comm=%s next_pid=%d next_prio=%d", > + __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, __entry->prev_state_char, __entry->next_comm, > + __entry->next_pid, __entry->next_prio) > ); > > /*
On Wed, Aug 2, 2023 at 11:07 PM Steven Rostedt <rostedt@goodmis.org> wrote: > > On Wed, 2 Aug 2023 08:10:00 -0400 > Ze Gao <zegao2021@gmail.com> wrote: > > > From: Ze Gao <zegao2021@gmail.com> > > > > Internal representations of task state are likely to be changed > > or ordered, and reporting them to userspace without exporting > > them as part of API is basically wrong, which can easily break > > a userspace observability tool as kernel evolves. For example, > > perf suffers from this and still reports wrong states as of this > > writing. > > > > OTOH, some masqueraded states like TASK_REPORT_IDLE and > > TASK_REPORT_MAX are also reported inadvertently, which confuses > > things even more and most userspace tools do not even take them > > into consideration. > > > > So add a new variable in company with the old raw value to > > report task state in symbolic chars, which are self-explaining > > and no further translation is needed. Of course this does not > > break any userspace tool. > > > > Note for PREEMPT_ACTIVE, we introduce 'p' to report it and use > > the old conventions for the rest. > > > > Signed-off-by: Ze Gao <zegao@tencent.com> > > Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> > > Acked-by: Ian Rogers <irogers@google.com> > > --- > > include/trace/events/sched.h | 44 ++++++++++++++++++++++-------------- > > 1 file changed, 27 insertions(+), 17 deletions(-) > > > > diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h > > index 7d34db20b2c6..1c7b94793495 100644 > > --- a/include/trace/events/sched.h > > +++ b/include/trace/events/sched.h > > @@ -6,6 +6,7 @@ > > #define _TRACE_SCHED_H > > > > #include <linux/kthread.h> > > +#include <linux/sched.h> > > #include <linux/sched/numa_balancing.h> > > #include <linux/tracepoint.h> > > #include <linux/binfmts.h> > > @@ -214,6 +215,27 @@ static inline int __trace_sched_switch_state(bool preempt, > > > > return state ? (1 << (state - 1)) : state; > > } > > + > > +static inline char __trace_sched_switch_state_char(bool preempt, > > + unsigned int prev_state, > > + struct task_struct *p) > > +{ > > + long state; > > + > > +#ifdef CONFIG_SCHED_DEBUG > > + BUG_ON(p != current); > > BUG? Why not WARN_ON()? I directly copied it from __trace_sched_switch_state since they are very similar. I had doubt on this too but decided to keep it in case people want to be 100% sure that the current task is exactly the one that is being switched, otherwise it's a fatal problem for scheduler at the point where trace_sched_switch is called. If you think WARN_ON_ONCE is more appropriate, I can fix both in v6. Thoughts? Regards, Ze
On Thu, 3 Aug 2023 10:38:43 +0800 Ze Gao <zegao2021@gmail.com> wrote: > > > +static inline char __trace_sched_switch_state_char(bool preempt, > > > + unsigned int prev_state, > > > + struct task_struct *p) > > > +{ > > > + long state; > > > + > > > +#ifdef CONFIG_SCHED_DEBUG > > > + BUG_ON(p != current); > > > > BUG? Why not WARN_ON()? > > I directly copied it from __trace_sched_switch_state since they > are very similar. I had doubt on this too but decided to keep it in > case people want to be 100% sure that the current task is exactly > the one that is being switched, otherwise it's a fatal problem for > scheduler at the point where trace_sched_switch is called. > > If you think WARN_ON_ONCE is more appropriate, I can fix both > in v6. > > Thoughts? Yeah, that BUG_ON is from 2014, and really should not BUG. I'd fix both. Thanks! -- Steve
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 7d34db20b2c6..1c7b94793495 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -6,6 +6,7 @@ #define _TRACE_SCHED_H #include <linux/kthread.h> +#include <linux/sched.h> #include <linux/sched/numa_balancing.h> #include <linux/tracepoint.h> #include <linux/binfmts.h> @@ -214,6 +215,27 @@ static inline int __trace_sched_switch_state(bool preempt, return state ? (1 << (state - 1)) : state; } + +static inline char __trace_sched_switch_state_char(bool preempt, + unsigned int prev_state, + struct task_struct *p) +{ + long state; + +#ifdef CONFIG_SCHED_DEBUG + BUG_ON(p != current); +#endif /* CONFIG_SCHED_DEBUG */ + + /* + * For PREEMPT_ACTIVE, we introduce 'p' to report it and use the old + * conventions for the rest. + */ + if (preempt) + return 'p'; + + state = __task_state_index(prev_state, p->exit_state); + return task_index_to_char(state); +} #endif /* CREATE_TRACE_POINTS */ /* @@ -236,6 +258,7 @@ TRACE_EVENT(sched_switch, __field( int, prev_state ) __array( char, prev_comm, TASK_COMM_LEN ) __array( char, next_comm, TASK_COMM_LEN ) + __field( char, prev_state_char ) ), TP_fast_assign( @@ -246,26 +269,13 @@ TRACE_EVENT(sched_switch, __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev); memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); + __entry->prev_state_char = __trace_sched_switch_state_char(preempt, prev_state, prev); /* XXX SCHED_DEADLINE */ ), - TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", - __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, - - (__entry->prev_state & (TASK_REPORT_MAX - 1)) ? - __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|", - { TASK_INTERRUPTIBLE, "S" }, - { TASK_UNINTERRUPTIBLE, "D" }, - { __TASK_STOPPED, "T" }, - { __TASK_TRACED, "t" }, - { EXIT_DEAD, "X" }, - { EXIT_ZOMBIE, "Z" }, - { TASK_PARKED, "P" }, - { TASK_DEAD, "I" }) : - "R", - - __entry->prev_state & TASK_REPORT_MAX ? "+" : "", - __entry->next_comm, __entry->next_pid, __entry->next_prio) + TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%c ==> next_comm=%s next_pid=%d next_prio=%d", + __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, __entry->prev_state_char, __entry->next_comm, + __entry->next_pid, __entry->next_prio) ); /*