diff mbox series

[v2] libtraceevent plugins: Parse sched_switch "prev_state" field for state info

Message ID 20231206185759.7792f272@gandalf.local.home (mailing list archive)
State Superseded
Headers show
Series [v2] libtraceevent plugins: Parse sched_switch "prev_state" field for state info | expand

Commit Message

Steven Rostedt Dec. 6, 2023, 11:57 p.m. UTC
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>

The write_state() function uses a hard coded string "SDTtXZPI" to index
the sched_switch prev_state field bitmask. This is fine, except for when
the kernel changes this string, in which case this will break again.

Worse yet, there can be various saved trace files that have various
versions of this string, and updating the string may work for one trace
file, it will likely break another trace file.

Instead, look into the event itself, and how it parsed the "print fmt".
Using the tep_print_args, the mapping between the bits and the output that
the kernel uses is exposed to user space. Walk the print arguments until
the __print_flags() for the "prev_state" field is found, and use that to
build the states string for future parsing.

Save the "prev_state_field" pointer, as it should be the same for later
occurrences, but if more than one trace data (more than one tep handler)
is being parsed, the string will need to be updated each time a new field
is passed in, as this is not saved in the tep handle itself.

Link: https://lore.kernel.org/linux-trace-devel/20231206175506.14f6081d@gandalf.local.home/

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
Changes since v1: https://lore.kernel.org/linux-trace-devel/20231206180750.68fc8648@gandalf.local.home/

- Added check for field is NULL if the prev_state does not exist.

 plugins/plugin_sched_switch.c | 133 ++++++++++++++++++++++++++++++++--
 1 file changed, 128 insertions(+), 5 deletions(-)

Comments

Ze Gao Dec. 8, 2023, 7:41 a.m. UTC | #1
On Thu, Dec 7, 2023 at 7:57 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
>
> The write_state() function uses a hard coded string "SDTtXZPI" to index
> the sched_switch prev_state field bitmask. This is fine, except for when
> the kernel changes this string, in which case this will break again.
>
> Worse yet, there can be various saved trace files that have various
> versions of this string, and updating the string may work for one trace
> file, it will likely break another trace file.
>
> Instead, look into the event itself, and how it parsed the "print fmt".
> Using the tep_print_args, the mapping between the bits and the output that
> the kernel uses is exposed to user space. Walk the print arguments until
> the __print_flags() for the "prev_state" field is found, and use that to
> build the states string for future parsing.
>
> Save the "prev_state_field" pointer, as it should be the same for later
> occurrences, but if more than one trace data (more than one tep handler)
> is being parsed, the string will need to be updated each time a new field
> is passed in, as this is not saved in the tep handle itself.
>
> Link: https://lore.kernel.org/linux-trace-devel/20231206175506.14f6081d@gandalf.local.home/
>
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> ---
> Changes since v1: https://lore.kernel.org/linux-trace-devel/20231206180750.68fc8648@gandalf.local.home/
>
> - Added check for field is NULL if the prev_state does not exist.
>
>  plugins/plugin_sched_switch.c | 133 ++++++++++++++++++++++++++++++++--
>  1 file changed, 128 insertions(+), 5 deletions(-)
>
> diff --git a/plugins/plugin_sched_switch.c b/plugins/plugin_sched_switch.c
> index e0986ac9cc3d..bde68c4b602d 100644
> --- a/plugins/plugin_sched_switch.c
> +++ b/plugins/plugin_sched_switch.c
> @@ -9,13 +9,136 @@
>  #include "event-parse.h"
>  #include "trace-seq.h"
>
> -static void write_state(struct trace_seq *s, int val)
> +static const char *convert_sym(struct tep_print_flag_sym *sym)
>  {
> -       const char states[] = "SDTtXZPI";
> +       static char save_states[33];
> +
> +       memset(save_states, 0, sizeof(save_states));
> +
> +       /* This is the flags for the prev_state_field, now make them into a string */
> +       for (; sym; sym = sym->next) {
> +               long bitmask = strtoul(sym->value, NULL, 0);
> +               int i;
> +
> +               for (i = 0; !(bitmask & 1); i++)
> +                       bitmask >>= 1;

It's no big deal but I think glibc has a faster version of ffs here
we can directly call that one instead.

> +               if (i > 32)
> +                       continue; // warn?
> +
> +               save_states[i] = sym->str[0];
> +       }
> +
> +       return save_states;
> +}
> +
> +static struct tep_print_arg_field *
> +find_arg_field(struct tep_format_field *prev_state_field, struct tep_print_arg *arg)
> +{
> +       struct tep_print_arg_field *field;
> +
> +       if (!arg)
> +               return NULL;
> +
> +       if (arg->type == TEP_PRINT_FIELD)
> +               return &arg->field;
> +
> +       if (arg->type == TEP_PRINT_OP) {
> +               field = find_arg_field(prev_state_field, arg->op.left);
> +               if (field && field->field == prev_state_field)
> +                       return field;
> +               field = find_arg_field(prev_state_field, arg->op.right);
> +               if (field && field->field == prev_state_field)
> +                       return field;
> +       }
> +       return NULL;
> +}
> +
> +static struct tep_print_flag_sym *
> +test_flags(struct tep_format_field *prev_state_field, struct tep_print_arg *arg)
> +{
> +       struct tep_print_arg_field *field;
> +
> +       field = find_arg_field(prev_state_field, arg->flags.field);
> +       if (!field)
> +               return NULL;
> +
> +       return arg->flags.flags;
> +}
> +
> +static struct tep_print_flag_sym *
> +search_op(struct tep_format_field *prev_state_field, struct tep_print_arg *arg)
> +{
> +       struct tep_print_flag_sym *sym = NULL;
> +
> +       if (!arg)
> +               return NULL;
> +
> +       if (arg->type == TEP_PRINT_OP) {
> +               sym = search_op(prev_state_field, arg->op.left);
> +               if (sym)
> +                       return sym;
> +
> +               sym = search_op(prev_state_field, arg->op.right);
> +               if (sym)
> +                       return sym;
> +       } else if (arg->type == TEP_PRINT_FLAGS) {
> +               sym = test_flags(prev_state_field, arg);
> +       }
> +
> +       return sym;
> +}
> +
> +static const char *get_states(struct tep_format_field *prev_state_field)
> +{
> +       struct tep_print_flag_sym *sym;
> +       struct tep_print_arg *arg;
> +       struct tep_event *event;
> +
> +       event = prev_state_field->event;
> +
> +       /*
> +        * Look at the event format fields, and search for where
> +        * the prev_state is parsed via the format flags.
> +        */
> +       for (arg = event->print_fmt.args; arg; arg = arg->next) {
> +               /*
> +                * Currently, the __print_flags() for the prev_state
> +                * is embedded in operations, so they too must be
> +                * searched.
> +                */
> +               sym = search_op(prev_state_field, arg);
> +               if (sym)
> +                       return convert_sym(sym);
> +       }
> +       return NULL;
> +}
> +
> +static void write_state(struct trace_seq *s, struct tep_format_field *field,
> +                       struct tep_record *record)
> +{
> +       static struct tep_format_field *prev_state_field;
> +       static const char *states;
> +       unsigned long long val;
>         int found = 0;
> +       int len;
>         int i;
>
> -       for (i = 0; i < (sizeof(states) - 1); i++) {
> +       if (!field)
> +               return;
> +
> +       if (!states || field != prev_state_field) {
> +               states = get_states(field);
> +               if (!states)
> +                       states = "SDTtXZPI";
> +               printf("states = '%s'\n", states);

printf here introduces some noises when I use perf script to inspect
the raw events,
and it always shows this "out-of-nowhere" message in the first
sched_switch record.
Still, no big deal, and better without this one.

> +               prev_state_field = field;
> +       }
> +
> +       tep_read_number_field(field, record->data, &val);
> +
> +       len = strlen(states);
> +       for (i = 0; i < len; i++) {
>                 if (!(val & (1 << i)))
>                         continue;

I think here is another case to consider, the preemption state. it's
exposed as well IIRC.
and the kernel denotes this as "R+", maybe we can follow that
convention by adding
checks like:

        if (!found) {
                trace_seq_putc(s, 'R');
                if (val)
                     trace_seq_putc(s,  ‘+’);
        }

Just in case, if you were not intentional to make it as it is now.

> @@ -99,8 +222,8 @@ static int sched_switch_handler(struct trace_seq *s,
>         if (tep_get_field_val(s, event, "prev_prio", record, &val, 1) == 0)
>                 trace_seq_printf(s, "[%d] ", (int) val);
>
> -       if (tep_get_field_val(s,  event, "prev_state", record, &val, 1) == 0)
> -               write_state(s, val);
> +       field = tep_find_any_field(event, "prev_state");
> +       write_state(s, field, record);
>
>         trace_seq_puts(s, " ==> ");
>
> --
> 2.42.0
>

Overall, I did the tests on my settings, and the patch worked. It's much more
robust in finding and validating TEP_PRINT_FLAG than the one I posted and
This is the first time I know we are capable of doing analyses on multiple data
records simultaneously. Learned a lot!

Last but the least,  I had trouble applying the diff to the branch libtraceevent
initially but had to copy and paste on my own.  Just FYI. Still no big deal :)

Thanks,
Ze
Steven Rostedt Dec. 8, 2023, 1:52 p.m. UTC | #2
On Fri, 8 Dec 2023 15:41:01 +0800
Ze Gao <zegao2021@gmail.com> wrote:

> > diff --git a/plugins/plugin_sched_switch.c b/plugins/plugin_sched_switch.c
> > index e0986ac9cc3d..bde68c4b602d 100644
> > --- a/plugins/plugin_sched_switch.c
> > +++ b/plugins/plugin_sched_switch.c
> > @@ -9,13 +9,136 @@
> >  #include "event-parse.h"
> >  #include "trace-seq.h"
> >
> > -static void write_state(struct trace_seq *s, int val)
> > +static const char *convert_sym(struct tep_print_flag_sym *sym)
> >  {
> > -       const char states[] = "SDTtXZPI";
> > +       static char save_states[33];
> > +
> > +       memset(save_states, 0, sizeof(save_states));
> > +
> > +       /* This is the flags for the prev_state_field, now make them into a string */
> > +       for (; sym; sym = sym->next) {
> > +               long bitmask = strtoul(sym->value, NULL, 0);
> > +               int i;
> > +
> > +               for (i = 0; !(bitmask & 1); i++)
> > +                       bitmask >>= 1;  
> 
> It's no big deal but I think glibc has a faster version of ffs here
> we can directly call that one instead.

I'm use to using ffs in the kernel, but I'm always weary of glibc. No real
reason, just a personal reaction. Anyway, this is far from a fast path if
it's only calculated when hitting a new prev_state field. Although this can
happen often with host/guest tracing.

I figured I'd optimize that in another patch.

> 
> > +               if (i > 32)
> > +                       continue; // warn?
> > +
> > +               save_states[i] = sym->str[0];
> > +       }
> > +
> > +       return save_states;
> > +}
> > +


> > +static void write_state(struct trace_seq *s, struct tep_format_field *field,
> > +                       struct tep_record *record)
> > +{
> > +       static struct tep_format_field *prev_state_field;
> > +       static const char *states;
> > +       unsigned long long val;
> >         int found = 0;
> > +       int len;
> >         int i;
> >
> > -       for (i = 0; i < (sizeof(states) - 1); i++) {
> > +       if (!field)
> > +               return;
> > +
> > +       if (!states || field != prev_state_field) {
> > +               states = get_states(field);
> > +               if (!states)
> > +                       states = "SDTtXZPI";
> > +               printf("states = '%s'\n", states);  
> 
> printf here introduces some noises when I use perf script to inspect
> the raw events,
> and it always shows this "out-of-nowhere" message in the first
> sched_switch record.
> Still, no big deal, and better without this one.

Bah, that was for debugging. I forgot to nuke it. That's what I get when I
work on user space tooling while compiling the kernel for updates I'm doing
there. :-p

> 
> > +               prev_state_field = field;
> > +       }
> > +
> > +       tep_read_number_field(field, record->data, &val);
> > +
> > +       len = strlen(states);
> > +       for (i = 0; i < len; i++) {
> >                 if (!(val & (1 << i)))
> >                         continue;  
> 
> I think here is another case to consider, the preemption state. it's
> exposed as well IIRC.
> and the kernel denotes this as "R+", maybe we can follow that
> convention by adding
> checks like:
> 
>         if (!found) {
>                 trace_seq_putc(s, 'R');
>                 if (val)
>                      trace_seq_putc(s,  ‘+’);
>         }
> 
> Just in case, if you were not intentional to make it as it is now.

Sounds good. I'd do that in a separate patch too.

> 
> > @@ -99,8 +222,8 @@ static int sched_switch_handler(struct trace_seq *s,
> >         if (tep_get_field_val(s, event, "prev_prio", record, &val, 1) == 0)
> >                 trace_seq_printf(s, "[%d] ", (int) val);
> >
> > -       if (tep_get_field_val(s,  event, "prev_state", record, &val, 1) == 0)
> > -               write_state(s, val);
> > +       field = tep_find_any_field(event, "prev_state");
> > +       write_state(s, field, record);
> >
> >         trace_seq_puts(s, " ==> ");
> >
> > --
> > 2.42.0
> >  
> 
> Overall, I did the tests on my settings, and the patch worked. It's much more
> robust in finding and validating TEP_PRINT_FLAG than the one I posted and
> This is the first time I know we are capable of doing analyses on multiple data
> records simultaneously. Learned a lot!
> 
> Last but the least,  I had trouble applying the diff to the branch libtraceevent
> initially but had to copy and paste on my own.  Just FYI. Still no big deal :)

I have three patches I have yet to push on top of this one. I'm way behind
in my user space work. As mentioned above, I tend to screw up when I try to
focus both on the kernel and user space libraries (unless I'm working on
both because they are related, but that's not currently the case).

Hopefully this will get done before the end of the year.

Thanks!

-- Steve
diff mbox series

Patch

diff --git a/plugins/plugin_sched_switch.c b/plugins/plugin_sched_switch.c
index e0986ac9cc3d..bde68c4b602d 100644
--- a/plugins/plugin_sched_switch.c
+++ b/plugins/plugin_sched_switch.c
@@ -9,13 +9,136 @@ 
 #include "event-parse.h"
 #include "trace-seq.h"
 
-static void write_state(struct trace_seq *s, int val)
+static const char *convert_sym(struct tep_print_flag_sym *sym)
 {
-	const char states[] = "SDTtXZPI";
+	static char save_states[33];
+
+	memset(save_states, 0, sizeof(save_states));
+
+	/* This is the flags for the prev_state_field, now make them into a string */
+	for (; sym; sym = sym->next) {
+		long bitmask = strtoul(sym->value, NULL, 0);
+		int i;
+
+		for (i = 0; !(bitmask & 1); i++)
+			bitmask >>= 1;
+
+		if (i > 32)
+			continue; // warn?
+
+		save_states[i] = sym->str[0];
+	}
+
+	return save_states;
+}
+
+static struct tep_print_arg_field *
+find_arg_field(struct tep_format_field *prev_state_field, struct tep_print_arg *arg)
+{
+	struct tep_print_arg_field *field;
+
+	if (!arg)
+		return NULL;
+
+	if (arg->type == TEP_PRINT_FIELD)
+		return &arg->field;
+
+	if (arg->type == TEP_PRINT_OP) {
+		field = find_arg_field(prev_state_field, arg->op.left);
+		if (field && field->field == prev_state_field)
+			return field;
+		field = find_arg_field(prev_state_field, arg->op.right);
+		if (field && field->field == prev_state_field)
+			return field;
+	}
+	return NULL;
+}
+
+static struct tep_print_flag_sym *
+test_flags(struct tep_format_field *prev_state_field, struct tep_print_arg *arg)
+{
+	struct tep_print_arg_field *field;
+
+	field = find_arg_field(prev_state_field, arg->flags.field);
+	if (!field)
+		return NULL;
+
+	return arg->flags.flags;
+}
+
+static struct tep_print_flag_sym *
+search_op(struct tep_format_field *prev_state_field, struct tep_print_arg *arg)
+{
+	struct tep_print_flag_sym *sym = NULL;
+
+	if (!arg)
+		return NULL;
+
+	if (arg->type == TEP_PRINT_OP) {
+		sym = search_op(prev_state_field, arg->op.left);
+		if (sym)
+			return sym;
+
+		sym = search_op(prev_state_field, arg->op.right);
+		if (sym)
+			return sym;
+	} else if (arg->type == TEP_PRINT_FLAGS) {
+		sym = test_flags(prev_state_field, arg);
+	}
+
+	return sym;
+}
+
+static const char *get_states(struct tep_format_field *prev_state_field)
+{
+	struct tep_print_flag_sym *sym;
+	struct tep_print_arg *arg;
+	struct tep_event *event;
+
+	event = prev_state_field->event;
+
+	/*
+	 * Look at the event format fields, and search for where
+	 * the prev_state is parsed via the format flags.
+	 */
+	for (arg = event->print_fmt.args; arg; arg = arg->next) {
+		/*
+		 * Currently, the __print_flags() for the prev_state
+		 * is embedded in operations, so they too must be
+		 * searched.
+		 */
+		sym = search_op(prev_state_field, arg);
+		if (sym)
+			return convert_sym(sym);
+	}
+	return NULL;
+}
+
+static void write_state(struct trace_seq *s, struct tep_format_field *field,
+			struct tep_record *record)
+{
+	static struct tep_format_field *prev_state_field;
+	static const char *states;
+	unsigned long long val;
 	int found = 0;
+	int len;
 	int i;
 
-	for (i = 0; i < (sizeof(states) - 1); i++) {
+	if (!field)
+		return;
+
+	if (!states || field != prev_state_field) {
+		states = get_states(field);
+		if (!states)
+			states = "SDTtXZPI";
+		printf("states = '%s'\n", states);
+		prev_state_field = field;
+	}
+
+	tep_read_number_field(field, record->data, &val);
+
+	len = strlen(states);
+	for (i = 0; i < len; i++) {
 		if (!(val & (1 << i)))
 			continue;
 
@@ -99,8 +222,8 @@  static int sched_switch_handler(struct trace_seq *s,
 	if (tep_get_field_val(s, event, "prev_prio", record, &val, 1) == 0)
 		trace_seq_printf(s, "[%d] ", (int) val);
 
-	if (tep_get_field_val(s,  event, "prev_state", record, &val, 1) == 0)
-		write_state(s, val);
+	field = tep_find_any_field(event, "prev_state");
+	write_state(s, field, record);
 
 	trace_seq_puts(s, " ==> ");