
[RFC,v6,3/5] sched, tracing: reorganize fields of switch event struct

Message ID 20230803083352.1585-4-zegao@tencent.com (mailing list archive)
State Handled Elsewhere
Series fix task state report from sched tracepoint

Commit Message

Ze Gao Aug. 3, 2023, 8:33 a.m. UTC
Report priority and prev_state in 'short' to save some buffer
space. And also reorder the fields so that we take struct
alignment into consideration to make the record compact.

Suggested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Ze Gao <zegao@tencent.com>
---
 include/trace/events/sched.h | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

Comments

Peter Zijlstra Aug. 3, 2023, 8:53 a.m. UTC | #1
On Thu, Aug 03, 2023 at 04:33:50AM -0400, Ze Gao wrote:
> Report priority and prev_state in 'short' to save some buffer
> space. And also reorder the fields so that we take struct
> alignment into consideration to make the record compact.
> 
> Suggested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> Signed-off-by: Ze Gao <zegao@tencent.com>

I am really getting fed up with this. This again doesn't list any
reasons why this is a sane thing to do.

Please review past discussions and collate the various things mentioned
into this Changelog.
Steven Rostedt Aug. 3, 2023, 9:18 a.m. UTC | #2
On Thu,  3 Aug 2023 04:33:50 -0400
Ze Gao <zegao2021@gmail.com> wrote:

> Report priority and prev_state in 'short' to save some buffer
> space. And also reorder the fields so that we take struct
> alignment into consideration to make the record compact.

If I were to write this, I would have written:

  The prev_state field in the sched_switch event is assigned by
  __trace_sched_switch_state(). The largest number that function will return
  is TASK_REPORT_MAX which is just 0x100. There's no reason that the
  prev_state field is a full 32 bits when it is using just 9 bits max. In
  order to save space on the ring buffer, shrink the prev_state to 16 bits
  (short).

  Also, change the positions of the other fields to accommodate the short
  value of prev_state to eliminate any holes that were created in the
  structure.

See the difference?
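
For reference, a rough before/after layout sketch (an editor's illustration,
not from the thread), assuming TASK_COMM_LEN == 16, the usual 8-byte
struct trace_entry common header, natural (unpacked) struct layout and a
64-bit build:

#include <sys/types.h>	/* pid_t */

/* Stand-in for the kernel's common 8-byte trace header. */
struct trace_entry {
	unsigned short	type;
	unsigned char	flags;
	unsigned char	preempt_count;
	int		pid;
};

/* Layout before the patch: prev_state is a long, priorities are ints. */
struct sched_switch_entry_old {
	struct trace_entry	ent;		/* offset  0, size  8 */
	char			prev_comm[16];	/* offset  8, size 16 */
	pid_t			prev_pid;	/* offset 24, size  4 */
	int			prev_prio;	/* offset 28, size  4 */
	long			prev_state;	/* offset 32, size  8 */
	char			next_comm[16];	/* offset 40, size 16 */
	pid_t			next_pid;	/* offset 56, size  4 */
	int			next_prio;	/* offset 60, size  4 */
};						/* sizeof == 64       */

/* Layout after the patch: pids first, the two shorts paired, comms next,
 * prev_state at the tail, so no interior padding holes appear. */
struct sched_switch_entry_new {
	struct trace_entry	ent;		/* offset  0, size  8 */
	pid_t			prev_pid;	/* offset  8, size  4 */
	pid_t			next_pid;	/* offset 12, size  4 */
	short			prev_prio;	/* offset 16, size  2 */
	short			next_prio;	/* offset 18, size  2 */
	char			prev_comm[16];	/* offset 20, size 16 */
	char			next_comm[16];	/* offset 36, size 16 */
	short			prev_state;	/* offset 52, size  2 */
};						/* 54 bytes of data, sizeof == 56 */

Under these assumptions the record shrinks from 64 to 56 bytes, i.e. 8 bytes
saved per sched_switch event, with no padding holes between fields.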

>  
>  #ifdef CREATE_TRACE_POINTS
> -static inline long __trace_sched_switch_state(bool preempt,
> +static inline short __trace_sched_switch_state(bool preempt,
>  					      unsigned int prev_state,
>  					      struct task_struct *p)
>  {
>  	unsigned int state;
>  
>  #ifdef CONFIG_SCHED_DEBUG
> -	BUG_ON(p != current);
> +	WARN_ON_ONCE(p != current);
>  #endif /* CONFIG_SCHED_DEBUG */

The above needs to be a separate patch.

-- Steve

>  
>  	/*
> @@ -229,23 +229,23 @@ TRACE_EVENT(sched_switch,
>  	TP_ARGS(preempt, prev, next, prev_state),
>  
>  	TP_STRUCT__entry(
> -		__array(	char,	prev_comm,	TASK_COMM_LEN	)
>  		__field(	pid_t,	prev_pid			)
> -		__field(	int,	prev_prio			)
> -		__field(	long,	prev_state			)
> -		__array(	char,	next_comm,	TASK_COMM_LEN	)
>  		__field(	pid_t,	next_pid			)
> -		__field(	int,	next_prio			)
> +		__field(	short,	prev_prio			)
> +		__field(	short,	next_prio			)
> +		__array(	char,	prev_comm,	TASK_COMM_LEN	)
> +		__array(	char,	next_comm,	TASK_COMM_LEN	)
> +		__field(	short,	prev_state			)
>  	),
>  
>  	TP_fast_assign(
> -		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
> -		__entry->prev_pid	= prev->pid;
> -		__entry->prev_prio	= prev->prio;
> -		__entry->prev_state	= __trace_sched_switch_state(preempt, prev_state, prev);
> +		__entry->prev_pid		= prev->pid;
> +		__entry->next_pid		= next->pid;
> +		__entry->prev_prio		= (short) prev->prio;
> +		__entry->next_prio		= (short) next->prio;
>  		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
> -		__entry->next_pid	= next->pid;
> -		__entry->next_prio	= next->prio;
> +		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
> +		__entry->prev_state		= __trace_sched_switch_state(preempt, prev_state, prev);
>  		/* XXX SCHED_DEADLINE */
>  	),
>
Peter Zijlstra Aug. 3, 2023, 9:51 a.m. UTC | #3
On Thu, Aug 03, 2023 at 05:18:26AM -0400, Steven Rostedt wrote:
> On Thu,  3 Aug 2023 04:33:50 -0400
> Ze Gao <zegao2021@gmail.com> wrote:
> 
> > Report priority and prev_state in 'short' to save some buffer
> > space. And also reorder the fields so that we take struct
> > alignment into consideration to make the record compact.
> 
> If I were to write this, I would have written:
> 
>   The prev_state field in the sched_switch event is assigned by
>   __trace_sched_switch_state(). The largest number that function will return
>   is TASK_REPORT_MAX which is just 0x100. There's no reason that the
>   prev_state field is a full 32 bits when it is using just 9 bits max. In
>   order to save space on the ring buffer, shrink the prev_state to 16 bits
>   (short).
> 
>   Also, change the positions of the other fields to accommodate the short
>   value of prev_state to eliminate any holes that were created in the
>   structure.
> 
> See the difference?

This also doesn't mention you broke the data format for all trace events
a while back to ensure people are using libtracefs and are thus
confident this won't break things.
Ze Gao Aug. 3, 2023, 11:06 a.m. UTC | #4
On Thu, Aug 3, 2023 at 4:54 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, Aug 03, 2023 at 04:33:50AM -0400, Ze Gao wrote:
> > Report priority and prev_state in 'short' to save some buffer
> > space. And also reorder the fields so that we take struct
> > alignment into consideration to make the record compact.
> >
> > Suggested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> > Signed-off-by: Ze Gao <zegao@tencent.com>
>
> I am really getting fed up with this. This again doesn't list any
> reasons why this is a sane thing to do.

Lesson learned from Steven's reply; I will reorganize the changelog.

Thanks for pointing this out.

Regards,
Ze
Ze Gao Aug. 3, 2023, 12:54 p.m. UTC | #5
On Thu, Aug 3, 2023 at 5:18 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Thu,  3 Aug 2023 04:33:50 -0400
> Ze Gao <zegao2021@gmail.com> wrote:
>
> > Report priority and prev_state in 'short' to save some buffer
> > space. And also reorder the fields so that we take struct
> > alignment into consideration to make the record compact.
>
> If I were to write this, I would have written:
>
>   The prev_state field in the sched_switch event is assigned by
>   __trace_sched_switch_state(). The largest number that function will return
>   is TASK_REPORT_MAX which is just 0x100. There's no reason that the
>   prev_state field is a full 32 bits when it is using just 9 bits max. In
>   order to save space on the ring buffer, shrink the prev_state to 16 bits
>   (short).
>
>   Also, change the positions of the other fields to accommodate the short
>   value of prev_state to eliminate any holes that were created in the
>   structure.
>
> See the difference?
>
> >
> >  #ifdef CREATE_TRACE_POINTS
> > -static inline long __trace_sched_switch_state(bool preempt,
> > +static inline short __trace_sched_switch_state(bool preempt,
> >                                             unsigned int prev_state,
> >                                             struct task_struct *p)
> >  {
> >       unsigned int state;
> >
> >  #ifdef CONFIG_SCHED_DEBUG
> > -     BUG_ON(p != current);
> > +     WARN_ON_ONCE(p != current);
> >  #endif /* CONFIG_SCHED_DEBUG */
>
> The above needs to be a separate patch.

I've moved this to a new patch, and this is the changelog:

    sched, tracing: change BUG_ON to WARN_ON_ONCE in __trace_sched_switch_state

    This BUG_ON() was introduced back in 2014 and is rather old. Switch
    it to WARN_ON_ONCE() so that the kernel does not crash when the
    sched-out task is unexpectedly not the current task, as suggested
    by Steven.

    Signed-off-by: Ze Gao <zegao@tencent.com>

Regards,
Ze

> >
> >       /*
> > @@ -229,23 +229,23 @@ TRACE_EVENT(sched_switch,
> >       TP_ARGS(preempt, prev, next, prev_state),
> >
> >       TP_STRUCT__entry(
> > -             __array(        char,   prev_comm,      TASK_COMM_LEN   )
> >               __field(        pid_t,  prev_pid                        )
> > -             __field(        int,    prev_prio                       )
> > -             __field(        long,   prev_state                      )
> > -             __array(        char,   next_comm,      TASK_COMM_LEN   )
> >               __field(        pid_t,  next_pid                        )
> > -             __field(        int,    next_prio                       )
> > +             __field(        short,  prev_prio                       )
> > +             __field(        short,  next_prio                       )
> > +             __array(        char,   prev_comm,      TASK_COMM_LEN   )
> > +             __array(        char,   next_comm,      TASK_COMM_LEN   )
> > +             __field(        short,  prev_state                      )
> >       ),
> >
> >       TP_fast_assign(
> > -             memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
> > -             __entry->prev_pid       = prev->pid;
> > -             __entry->prev_prio      = prev->prio;
> > -             __entry->prev_state     = __trace_sched_switch_state(preempt, prev_state, prev);
> > +             __entry->prev_pid               = prev->pid;
> > +             __entry->next_pid               = next->pid;
> > +             __entry->prev_prio              = (short) prev->prio;
> > +             __entry->next_prio              = (short) next->prio;
> >               memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
> > -             __entry->next_pid       = next->pid;
> > -             __entry->next_prio      = next->prio;
> > +             memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
> > +             __entry->prev_state             = __trace_sched_switch_state(preempt, prev_state, prev);
> >               /* XXX SCHED_DEADLINE */
> >       ),
> >
>
Ze Gao Aug. 3, 2023, 12:57 p.m. UTC | #6
On Thu, Aug 3, 2023 at 5:18 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Thu,  3 Aug 2023 04:33:50 -0400
> Ze Gao <zegao2021@gmail.com> wrote:
>
> > Report priority and prev_state in 'short' to save some buffer
> > space. And also reorder the fields so that we take struct
> > alignment into consideration to make the record compact.
>
> If I were to write this, I would have written:
>
>   The prev_state field in the sched_switch event is assigned by
>   __trace_sched_switch_state(). The largest number that function will return
>   is TASK_REPORT_MAX which is just 0x100. There's no reason that the
>   prev_state field is a full 32 bits when it is using just 9 bits max. In
>   order to save space on the ring buffer, shrink the prev_state to 16 bits
>   (short).
>
>   Also, change the positions of the other fields to accommodate the short
>   value of prev_state to eliminate any holes that were created in the
>   structure.

Thanks for this elaboration! But I see Peter has comments on this, so I'll
wait until those are resolved and then send my new changelog.

Thanks,
Ze

> >
> >  #ifdef CREATE_TRACE_POINTS
> > -static inline long __trace_sched_switch_state(bool preempt,
> > +static inline short __trace_sched_switch_state(bool preempt,
> >                                             unsigned int prev_state,
> >                                             struct task_struct *p)
> >  {
> >       unsigned int state;
> >
> >  #ifdef CONFIG_SCHED_DEBUG
> > -     BUG_ON(p != current);
> > +     WARN_ON_ONCE(p != current);
> >  #endif /* CONFIG_SCHED_DEBUG */
>
> The above needs to be a separate patch.
>
> -- Steve
>
> >
> >       /*
> > @@ -229,23 +229,23 @@ TRACE_EVENT(sched_switch,
> >       TP_ARGS(preempt, prev, next, prev_state),
> >
> >       TP_STRUCT__entry(
> > -             __array(        char,   prev_comm,      TASK_COMM_LEN   )
> >               __field(        pid_t,  prev_pid                        )
> > -             __field(        int,    prev_prio                       )
> > -             __field(        long,   prev_state                      )
> > -             __array(        char,   next_comm,      TASK_COMM_LEN   )
> >               __field(        pid_t,  next_pid                        )
> > -             __field(        int,    next_prio                       )
> > +             __field(        short,  prev_prio                       )
> > +             __field(        short,  next_prio                       )
> > +             __array(        char,   prev_comm,      TASK_COMM_LEN   )
> > +             __array(        char,   next_comm,      TASK_COMM_LEN   )
> > +             __field(        short,  prev_state                      )
> >       ),
> >
> >       TP_fast_assign(
> > -             memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
> > -             __entry->prev_pid       = prev->pid;
> > -             __entry->prev_prio      = prev->prio;
> > -             __entry->prev_state     = __trace_sched_switch_state(preempt, prev_state, prev);
> > +             __entry->prev_pid               = prev->pid;
> > +             __entry->next_pid               = next->pid;
> > +             __entry->prev_prio              = (short) prev->prio;
> > +             __entry->next_prio              = (short) next->prio;
> >               memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
> > -             __entry->next_pid       = next->pid;
> > -             __entry->next_prio      = next->prio;
> > +             memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
> > +             __entry->prev_state             = __trace_sched_switch_state(preempt, prev_state, prev);
> >               /* XXX SCHED_DEADLINE */
> >       ),
> >
>
Steven Rostedt Aug. 3, 2023, 2:45 p.m. UTC | #7
On Thu, 3 Aug 2023 11:51:32 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> > 
> > See the difference?  
> 
> This also doesn't mention you broke the data format for all trace events
> a while back to ensure people are using libtracefs and are thus
> confident this won't break things.


It was the metadata change that happened a while ago. Since other events
change all the time under this restriction (keeping field names around), I
didn't realize it needed to be mentioned in this changelog. It doesn't hurt
to add it.

More details on that:

 commit b000c8065 ("tracing: Remove the extra 4 bytes of padding in events")

This was to get rid of the padding that powertop relied on.

Nit: it's libtraceevent, not libtracefs; libtracefs gets you to the format
files, but it's libtraceevent that parses them.
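
As a rough illustration of why name-based parsing keeps such layout changes
safe, here is a minimal libtraceevent sketch (an editor's addition, not from
the thread; error handling is trimmed, and the format-file text plus the raw
record are assumed to be obtained elsewhere, e.g. via libtracefs):

#include <stdio.h>
#include <event-parse.h>	/* libtraceevent; may be traceevent/event-parse.h on some distros */

/* Look up sched_switch fields by name instead of by hard-coded offset. */
static void print_prev_state(const char *fmt, unsigned long fmt_size,
			     struct tep_record *rec)
{
	struct tep_handle *tep = tep_alloc();
	struct tep_event *event = NULL;
	struct tep_format_field *field;
	unsigned long long val;

	if (!tep)
		return;

	/* Parse the text of events/sched/sched_switch/format. */
	if (tep_parse_format(&event, tep, fmt, fmt_size, "sched") == 0) {
		field = tep_find_field(event, "prev_state");
		if (field && tep_read_number_field(field, rec->data, &val) == 0)
			printf("prev_state=%llu (%d-byte field)\n",
			       val, field->size);
	}

	tep_free(tep);
}

Because the field's offset and size come from the format file at run time,
shrinking prev_state to a short or moving it to the end of the record does
not affect a consumer written this way.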


-- Steve
kernel test robot Aug. 23, 2023, 2:52 a.m. UTC | #8
Hello,

kernel test robot noticed "perf-sanity-tests.Parse_sched_tracepoints_fields.fail" on:

commit: 029aadfe946d99c4b11f1dd52f9ff76a09b21f69 ("[RFC PATCH v6 3/5] sched, tracing: reorganize fields of switch event struct")
url: https://github.com/intel-lab-lkp/linux/commits/Ze-Gao/perf-sched-sync-state-char-array-with-the-kernel/20230803-163946
base: https://git.kernel.org/cgit/linux/kernel/git/acme/linux.git perf/core
patch link: https://lore.kernel.org/all/20230803083352.1585-4-zegao@tencent.com/
patch subject: [RFC PATCH v6 3/5] sched, tracing: reorganize fields of switch event struct

in testcase: perf-sanity-tests
version: perf-x86_64-00c7b5f4ddc5-1_20230402
with the following parameters:

	perf_compiler: gcc



compiler: gcc-12
test machine: 224 threads 2 sockets Intel(R) Xeon(R) Platinum 8480+ (Sapphire Rapids) with 256G memory

(please refer to attached dmesg/kmsg for entire log/backtrace)



If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202308231018.8ddf15b7-oliver.sang@intel.com



 14: Roundtrip evsel->name                                           : Ok
 15: Parse sched tracepoints fields                                  : FAILED!
 16: syscalls:sys_enter_openat event fields                          : Ok



The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20230823/202308231018.8ddf15b7-oliver.sang@intel.com
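
The failure above is consistent with a consumer that hard-codes expected
field sizes instead of reading them from the format file. A purely
hypothetical sketch of such a check (an editor's illustration, not the
actual perf test source) might look like:

#include <stdio.h>
#include <string.h>

/* Hypothetical size assumptions -- illustration only. A consumer that pins
 * sizes like this breaks as soon as a field is shrunk or retyped, even
 * though name-based parsers keep working. */
struct expected_field {
	const char	*name;
	int		size;	/* bytes the consumer assumes */
};

static const struct expected_field sched_switch_expected[] = {
	{ "prev_comm",	16 },
	{ "prev_pid",	 4 },
	{ "prev_prio",	 4 },	/* the patch shrinks this to 2 */
	{ "prev_state",	 8 },	/* the patch shrinks this to 2 */
	{ "next_comm",	16 },
	{ "next_pid",	 4 },
	{ "next_prio",	 4 },	/* the patch shrinks this to 2 */
};

/* Return 0 if the size parsed from tracefs matches the assumption. */
static int check_field(const char *name, int parsed_size)
{
	unsigned int i;

	for (i = 0; i < sizeof(sched_switch_expected) /
			sizeof(sched_switch_expected[0]); i++) {
		if (strcmp(sched_switch_expected[i].name, name) != 0)
			continue;
		if (sched_switch_expected[i].size == parsed_size)
			return 0;
		fprintf(stderr, "%s: expected %d bytes, got %d\n",
			name, sched_switch_expected[i].size, parsed_size);
		return -1;	/* surfaces as "FAILED!" in a test harness */
	}
	return -1;	/* unknown field */
}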

Patch

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index fbb99a61f714..43492daefa6f 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -187,14 +187,14 @@  DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
 	     TP_ARGS(p));
 
 #ifdef CREATE_TRACE_POINTS
-static inline long __trace_sched_switch_state(bool preempt,
+static inline short __trace_sched_switch_state(bool preempt,
 					      unsigned int prev_state,
 					      struct task_struct *p)
 {
 	unsigned int state;
 
 #ifdef CONFIG_SCHED_DEBUG
-	BUG_ON(p != current);
+	WARN_ON_ONCE(p != current);
 #endif /* CONFIG_SCHED_DEBUG */
 
 	/*
@@ -229,23 +229,23 @@  TRACE_EVENT(sched_switch,
 	TP_ARGS(preempt, prev, next, prev_state),
 
 	TP_STRUCT__entry(
-		__array(	char,	prev_comm,	TASK_COMM_LEN	)
 		__field(	pid_t,	prev_pid			)
-		__field(	int,	prev_prio			)
-		__field(	long,	prev_state			)
-		__array(	char,	next_comm,	TASK_COMM_LEN	)
 		__field(	pid_t,	next_pid			)
-		__field(	int,	next_prio			)
+		__field(	short,	prev_prio			)
+		__field(	short,	next_prio			)
+		__array(	char,	prev_comm,	TASK_COMM_LEN	)
+		__array(	char,	next_comm,	TASK_COMM_LEN	)
+		__field(	short,	prev_state			)
 	),
 
 	TP_fast_assign(
-		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
-		__entry->prev_pid	= prev->pid;
-		__entry->prev_prio	= prev->prio;
-		__entry->prev_state	= __trace_sched_switch_state(preempt, prev_state, prev);
+		__entry->prev_pid		= prev->pid;
+		__entry->next_pid		= next->pid;
+		__entry->prev_prio		= (short) prev->prio;
+		__entry->next_prio		= (short) next->prio;
 		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
-		__entry->next_pid	= next->pid;
-		__entry->next_prio	= next->prio;
+		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
+		__entry->prev_state		= __trace_sched_switch_state(preempt, prev_state, prev);
 		/* XXX SCHED_DEADLINE */
 	),