Message ID | 20230803083352.1585-5-zegao@tencent.com (mailing list archive) |
---|---|
State | Handled Elsewhere |
Headers | show |
Series | fix task state report from sched tracepoint | expand |
On Thu, Aug 03, 2023 at 04:33:51AM -0400, Ze Gao wrote: > Internal representations of task state are likely to be changed > or ordered, and reporting them to userspace without exporting > them as part of API is basically wrong, which can easily break > a userspace observability tool as kernel evolves. For example, > perf suffers from this and still reports wrong states as of this > writing. > > OTOH, some masqueraded states like TASK_REPORT_IDLE and > TASK_REPORT_MAX are also reported inadvertently, which confuses > things even more and most userspace tools do not even take them > into consideration. > > So add a new variable in company with the old raw value to > report task state in symbolic chars, which are self-explaining > and no further translation is needed. Of course this does not > break any userspace tool. > > Note for PREEMPT_ACTIVE, we introduce 'p' to report it and use > the old conventions for the rest. > > Signed-off-by: Ze Gao <zegao@tencent.com> > Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> > Acked-by: Ian Rogers <irogers@google.com> I'm not sure you've actually read any of the things I've written. I hate this. Not going to happen.
On Thu, 3 Aug 2023 04:33:51 -0400 Ze Gao <zegao2021@gmail.com> wrote: > Internal representations of task state are likely to be changed > or ordered, and reporting them to userspace without exporting > them as part of API is basically wrong, which can easily break > a userspace observability tool as kernel evolves. For example, > perf suffers from this and still reports wrong states as of this > writing. > > OTOH, some masqueraded states like TASK_REPORT_IDLE and > TASK_REPORT_MAX are also reported inadvertently, which confuses > things even more and most userspace tools do not even take them > into consideration. > > So add a new variable in company with the old raw value to > report task state in symbolic chars, which are self-explaining > and no further translation is needed. Of course this does not > break any userspace tool. > > Note for PREEMPT_ACTIVE, we introduce 'p' to report it and use > the old conventions for the rest. The above is actually good. > > Signed-off-by: Ze Gao <zegao@tencent.com> > Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> > Acked-by: Ian Rogers <irogers@google.com> > --- > include/trace/events/sched.h | 44 ++++++++++++++++++++++-------------- > 1 file changed, 27 insertions(+), 17 deletions(-) > > diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h > index 43492daefa6f..ae5b486cc969 100644 > --- a/include/trace/events/sched.h > +++ b/include/trace/events/sched.h > @@ -6,6 +6,7 @@ > #define _TRACE_SCHED_H > > #include <linux/kthread.h> > +#include <linux/sched.h> > #include <linux/sched/numa_balancing.h> > #include <linux/tracepoint.h> > #include <linux/binfmts.h> > @@ -214,6 +215,27 @@ static inline short __trace_sched_switch_state(bool preempt, > > return state ? (1 << (state - 1)) : state; > } > + > +static inline char __trace_sched_switch_state_char(bool preempt, > + unsigned int prev_state, > + struct task_struct *p) > +{ > + long state; > + > +#ifdef CONFIG_SCHED_DEBUG > + WARN_ON_ONCE(p != current); > +#endif /* CONFIG_SCHED_DEBUG */ > + > + /* > + * For PREEMPT_ACTIVE, we introduce 'p' to report it and use the old > + * conventions for the rest. > + */ > + if (preempt) > + return 'p'; > + > + state = __task_state_index(prev_state, p->exit_state); > + return task_index_to_char(state); > +} > #endif /* CREATE_TRACE_POINTS */ > > /* > @@ -236,6 +258,7 @@ TRACE_EVENT(sched_switch, > __array( char, prev_comm, TASK_COMM_LEN ) > __array( char, next_comm, TASK_COMM_LEN ) > __field( short, prev_state ) > + __field( char, prev_state_char ) > ), > > TP_fast_assign( > @@ -246,26 +269,13 @@ TRACE_EVENT(sched_switch, > memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); > memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); > __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev); > + __entry->prev_state_char = __trace_sched_switch_state_char(preempt, prev_state, prev); > /* XXX SCHED_DEADLINE */ > ), > > - TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", > - __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, > - > - (__entry->prev_state & (TASK_REPORT_MAX - 1)) ? > - __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|", > - { TASK_INTERRUPTIBLE, "S" }, > - { TASK_UNINTERRUPTIBLE, "D" }, > - { __TASK_STOPPED, "T" }, > - { __TASK_TRACED, "t" }, > - { EXIT_DEAD, "X" }, > - { EXIT_ZOMBIE, "Z" }, > - { TASK_PARKED, "P" }, > - { TASK_DEAD, "I" }) : > - "R", I just realized, I have user space code that looks at this. As in the format file we have: print fmt: "prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", REC->prev_comm, REC->prev_pid, REC->prev_prio, (REC->prev_state & ((((0x00000000 | 0x00000001 | 0x00000002 | 0x00000004 | 0x00000008 | 0x00000010 | 0x00000020 | 0x00000040) + 1) << 1) - 1)) ? __print_flags(REC->prev_state & ((((0x00000000 | 0x00000001 | 0x00000002 | 0x00000004 | 0x00000008 | 0x00000010 | 0x00000020 | 0x00000040) + 1) << 1) - 1), "|", { 0x00000001, "S" }, { 0x00000002, "D" }, { 0x00000004, "T" }, { 0x00000008, "t" }, { 0x00000010, "X" }, { 0x00000020, "Z" }, { 0x00000040, "P" }, { 0x00000080, "I" }) : "R", REC->prev_state & (((0x00000000 | 0x00000001 | 0x00000002 | 0x00000004 | 0x00000008 | 0x00000010 | 0x00000020 | 0x00000040) + 1) << 1) ? "+" : "", REC->next_comm, REC->next_pid, REC->next_prio And I have used this in applications to find out what values "S" and "D" are. So, we need to keep that still. But we can add the prev_state_char to the output too. "prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s[%c] ==> next_comm=%s next_pid=%d next_prio=%d" > - > - __entry->prev_state & TASK_REPORT_MAX ? "+" : "", > - __entry->next_comm, __entry->next_pid, __entry->next_prio) > + TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%c ==> next_comm=%s next_pid=%d next_prio=%d", > + __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, __entry->prev_state_char, __entry->next_comm, > + __entry->next_pid, __entry->next_prio) > ); > > /* -- Steve
On Thu, Aug 3, 2023 at 5:29 PM Steven Rostedt <rostedt@goodmis.org> wrote: > > On Thu, 3 Aug 2023 04:33:51 -0400 > Ze Gao <zegao2021@gmail.com> wrote: > > > Internal representations of task state are likely to be changed > > or ordered, and reporting them to userspace without exporting > > them as part of API is basically wrong, which can easily break > > a userspace observability tool as kernel evolves. For example, > > perf suffers from this and still reports wrong states as of this > > writing. > > > > OTOH, some masqueraded states like TASK_REPORT_IDLE and > > TASK_REPORT_MAX are also reported inadvertently, which confuses > > things even more and most userspace tools do not even take them > > into consideration. > > > > So add a new variable in company with the old raw value to > > report task state in symbolic chars, which are self-explaining > > and no further translation is needed. Of course this does not > > break any userspace tool. > > > > Note for PREEMPT_ACTIVE, we introduce 'p' to report it and use > > the old conventions for the rest. > > The above is actually good. > > > > > > Signed-off-by: Ze Gao <zegao@tencent.com> > > Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> > > Acked-by: Ian Rogers <irogers@google.com> > > --- > > include/trace/events/sched.h | 44 ++++++++++++++++++++++-------------- > > 1 file changed, 27 insertions(+), 17 deletions(-) > > > > diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h > > index 43492daefa6f..ae5b486cc969 100644 > > --- a/include/trace/events/sched.h > > +++ b/include/trace/events/sched.h > > @@ -6,6 +6,7 @@ > > #define _TRACE_SCHED_H > > > > #include <linux/kthread.h> > > +#include <linux/sched.h> > > #include <linux/sched/numa_balancing.h> > > #include <linux/tracepoint.h> > > #include <linux/binfmts.h> > > @@ -214,6 +215,27 @@ static inline short __trace_sched_switch_state(bool preempt, > > > > return state ? (1 << (state - 1)) : state; > > } > > + > > +static inline char __trace_sched_switch_state_char(bool preempt, > > + unsigned int prev_state, > > + struct task_struct *p) > > +{ > > + long state; > > + > > +#ifdef CONFIG_SCHED_DEBUG > > + WARN_ON_ONCE(p != current); > > +#endif /* CONFIG_SCHED_DEBUG */ > > + > > + /* > > + * For PREEMPT_ACTIVE, we introduce 'p' to report it and use the old > > + * conventions for the rest. > > + */ > > + if (preempt) > > + return 'p'; > > + > > + state = __task_state_index(prev_state, p->exit_state); > > + return task_index_to_char(state); > > +} > > #endif /* CREATE_TRACE_POINTS */ > > > > /* > > @@ -236,6 +258,7 @@ TRACE_EVENT(sched_switch, > > __array( char, prev_comm, TASK_COMM_LEN ) > > __array( char, next_comm, TASK_COMM_LEN ) > > __field( short, prev_state ) > > + __field( char, prev_state_char ) > > ), > > > > TP_fast_assign( > > @@ -246,26 +269,13 @@ TRACE_EVENT(sched_switch, > > memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); > > memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); > > __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev); > > + __entry->prev_state_char = __trace_sched_switch_state_char(preempt, prev_state, prev); > > /* XXX SCHED_DEADLINE */ > > ), > > > > - TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", > > - __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, > > - > > - (__entry->prev_state & (TASK_REPORT_MAX - 1)) ? > > - __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|", > > - { TASK_INTERRUPTIBLE, "S" }, > > - { TASK_UNINTERRUPTIBLE, "D" }, > > - { __TASK_STOPPED, "T" }, > > - { __TASK_TRACED, "t" }, > > - { EXIT_DEAD, "X" }, > > - { EXIT_ZOMBIE, "Z" }, > > - { TASK_PARKED, "P" }, > > - { TASK_DEAD, "I" }) : > > - "R", > > I just realized, I have user space code that looks at this. As in the format file we have: > > print fmt: "prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", REC->prev_comm, REC->prev_pid, REC->prev_prio, (REC->prev_state & ((((0x00000000 | 0x00000001 | 0x00000002 | 0x00000004 | 0x00000008 | 0x00000010 | 0x00000020 | 0x00000040) + 1) << 1) - 1)) ? __print_flags(REC->prev_state & ((((0x00000000 | 0x00000001 | 0x00000002 | 0x00000004 | 0x00000008 | 0x00000010 | 0x00000020 | 0x00000040) + 1) << 1) - 1), "|", { 0x00000001, "S" }, { 0x00000002, "D" }, { 0x00000004, "T" }, { 0x00000008, "t" }, { 0x00000010, "X" }, { 0x00000020, "Z" }, { 0x00000040, "P" }, { 0x00000080, "I" }) : "R", REC->prev_state & (((0x00000000 | 0x00000001 | 0x00000002 | 0x00000004 | 0x00000008 | 0x00000010 | 0x00000020 | 0x00000040) + 1) << 1) ? "+" : "", REC->next_comm, REC->next_pid, REC->next_prio > > And I have used this in applications to find out what values "S" and "D" > are. > > So, we need to keep that still. But we can add the prev_state_char to the > output too. > > "prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s[%c] ==> next_comm=%s next_pid=%d next_prio=%d" Good point! But I see Peter has strong opinions over this change badly. So I'm not sure if it's worth the effort to push this anymore :/ And apparently We failed to convince each other over this problem. How about we only keep all the fixing patches and throw away this 'prev_state_char' thing? Again, I'm not meant to upset anyone here. But Tons of thanks for your sage reviews on this. Thoughts? Thanks, Ze
On Thu, Aug 3, 2023 at 4:59 PM Peter Zijlstra <peterz@infradead.org> wrote: > > On Thu, Aug 03, 2023 at 04:33:51AM -0400, Ze Gao wrote: > > Internal representations of task state are likely to be changed > > or ordered, and reporting them to userspace without exporting > > them as part of API is basically wrong, which can easily break > > a userspace observability tool as kernel evolves. For example, > > perf suffers from this and still reports wrong states as of this > > writing. > > > > OTOH, some masqueraded states like TASK_REPORT_IDLE and > > TASK_REPORT_MAX are also reported inadvertently, which confuses > > things even more and most userspace tools do not even take them > > into consideration. > > > > So add a new variable in company with the old raw value to > > report task state in symbolic chars, which are self-explaining > > and no further translation is needed. Of course this does not > > break any userspace tool. > > > > Note for PREEMPT_ACTIVE, we introduce 'p' to report it and use > > the old conventions for the rest. > > > > Signed-off-by: Ze Gao <zegao@tencent.com> > > Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> > > Acked-by: Ian Rogers <irogers@google.com> > > I'm not sure you've actually read any of the things I've written. I hate > this. Not going to happen. With all due respect, I'm afraid I've read so much carefully every word you replied, I am just not fully convinced and still believe this is something needs a fix but apparently you strongly refuse. So I'm considering giving up this patch and only fixing the surface problem to make everyone happy. Thanks for taking your time on this topic. Regards, Ze
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 43492daefa6f..ae5b486cc969 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -6,6 +6,7 @@ #define _TRACE_SCHED_H #include <linux/kthread.h> +#include <linux/sched.h> #include <linux/sched/numa_balancing.h> #include <linux/tracepoint.h> #include <linux/binfmts.h> @@ -214,6 +215,27 @@ static inline short __trace_sched_switch_state(bool preempt, return state ? (1 << (state - 1)) : state; } + +static inline char __trace_sched_switch_state_char(bool preempt, + unsigned int prev_state, + struct task_struct *p) +{ + long state; + +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(p != current); +#endif /* CONFIG_SCHED_DEBUG */ + + /* + * For PREEMPT_ACTIVE, we introduce 'p' to report it and use the old + * conventions for the rest. + */ + if (preempt) + return 'p'; + + state = __task_state_index(prev_state, p->exit_state); + return task_index_to_char(state); +} #endif /* CREATE_TRACE_POINTS */ /* @@ -236,6 +258,7 @@ TRACE_EVENT(sched_switch, __array( char, prev_comm, TASK_COMM_LEN ) __array( char, next_comm, TASK_COMM_LEN ) __field( short, prev_state ) + __field( char, prev_state_char ) ), TP_fast_assign( @@ -246,26 +269,13 @@ TRACE_EVENT(sched_switch, memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev); + __entry->prev_state_char = __trace_sched_switch_state_char(preempt, prev_state, prev); /* XXX SCHED_DEADLINE */ ), - TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", - __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, - - (__entry->prev_state & (TASK_REPORT_MAX - 1)) ? - __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|", - { TASK_INTERRUPTIBLE, "S" }, - { TASK_UNINTERRUPTIBLE, "D" }, - { __TASK_STOPPED, "T" }, - { __TASK_TRACED, "t" }, - { EXIT_DEAD, "X" }, - { EXIT_ZOMBIE, "Z" }, - { TASK_PARKED, "P" }, - { TASK_DEAD, "I" }) : - "R", - - __entry->prev_state & TASK_REPORT_MAX ? "+" : "", - __entry->next_comm, __entry->next_pid, __entry->next_prio) + TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%c ==> next_comm=%s next_pid=%d next_prio=%d", + __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, __entry->prev_state_char, __entry->next_comm, + __entry->next_pid, __entry->next_prio) ); /*