diff mbox series

[v1,1/4] perf record: Dump off-cpu samples directly

Message ID 20240422083645.1930939-1-howardchu95@gmail.com (mailing list archive)
State Superseded
Headers show
Series Dump off-cpu samples directly | expand

Checks

Context Check Description
netdev/tree_selection success Not a local patch

Commit Message

Howard Chu April 22, 2024, 8:36 a.m. UTC
Parse off-cpu events using parse_event(). Move the call to
record__config_off_cpu() to after record__open(), because we need to write
the mmapped fds into BPF's perf_event_array map and also write
sample_id/sample_type into BPF. In record__pushfn() and record__aio_pushfn(),
handle off-cpu samples using off_cpu_strip(). This is needed because the
sample data that we want to write to perf.data is located in the off-cpu
samples' raw_data section:

regular samples:
[sample: sample_data]

off-cpu samples:
[sample: [raw_data: sample_data]]

We need to extract the actual sample data from raw_data before writing it.

Invoke the record_done hook just before evlist__disable() to stop the BPF
program from outputting; otherwise, we lose some samples.

After the samples are collected, change the sample_type of the off-cpu event
to OFFCPU_SAMPLE_TYPES so that it is parsed correctly. During recording it
was PERF_SAMPLE_RAW and some others, because BPF can only output to a
specific type of perf_event, which is why
`evsel->core.attr.sample_type &= OFFCPU_SAMPLE_TYPES;` is deleted in
util/evsel.c.

Signed-off-by: Howard Chu <howardchu95@gmail.com>
---
 tools/perf/builtin-record.c | 98 ++++++++++++++++++++++++++++++++++---
 tools/perf/util/evsel.c     |  8 ---
 2 files changed, 91 insertions(+), 15 deletions(-)

Comments

Arnaldo Carvalho de Melo April 29, 2024, 6:53 p.m. UTC | #1
On Mon, Apr 22, 2024 at 04:36:45PM +0800, Howard Chu wrote:
> Parse off-cpu events using parse_event(). Change the placement of
> record__config_off_cpu to after record__open because we need to write
> mmapped fds into BPF's perf_event_array map, also, write 
> sample_id/sample_type into BPF. In record__pushfn and record__aio_pushfn, 
> handle off-cpu samples using off_cpu_strip. This is because the off-cpu 
> samples that we want to write to perf.data is in off-cpu samples' raw_data 
> section:

Hey,

	This lacks a cover letter and the chaining of patches so that b4
can fetch the series.

	Also all 5 patches have the same summary and different
descriptions and contents, can you please rework the patch series, using
'git format-patch', and make the description reflect what each patch is
doing?

Thanks,

- Arnaldo
 
> regular samples:
> [sample: sample_data]
> 
> off-cpu samples:
> [sample: [raw_data: sample_data]]
> 
> We need to extract the real useful sample data out before writing.
> 
> Hooks record_done just before evlist__disable to stop BPF program from
> outputting, otherwise, we lose some samples.
> 
> After samples are collected, change sample_type of off-cpu event to
> the OFFCPU_SAMPLE_TYPES for parsing correctly, it was PERF_SAMPLE_RAW and
> some others, because BPF can only output to a specific type of perf_event,
> which is why `evsel->core.attr.sample_type &= OFFCPU_SAMPLE_TYPES;` is
> deleted in util/evsel.c. 
> 
> Signed-off-by: Howard Chu <howardchu95@gmail.com>
> ---
>  tools/perf/builtin-record.c | 98 ++++++++++++++++++++++++++++++++++---
>  tools/perf/util/evsel.c     |  8 ---
>  2 files changed, 91 insertions(+), 15 deletions(-)
> 
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index 2ff718d3e202..c31b23905f1b 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -389,6 +389,8 @@ struct record_aio {
>  static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
>  {
>  	struct record_aio *aio = to;
> +	char *bf_stripped = NULL;
> +	size_t stripped;
>  
>  	/*
>  	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
> @@ -404,6 +406,31 @@ static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size
>  	 * from the beginning of the kernel buffer till the end of the data chunk.
>  	 */
>  
> +	if (aio->rec->off_cpu) {
> +		if (size == 0)
> +			return 0;
> +
> +		map->core.start -= size;
> +		size = map->core.end - map->core.start;
> +
> +		bf_stripped = malloc(size);
> +
> +		if (bf_stripped == NULL) {
> +			pr_err("Failed to allocate off-cpu strip buffer\n");
> +			return -ENOMEM;
> +		}
> +
> +		stripped = off_cpu_strip(aio->rec->evlist, map, bf_stripped, size);
> +
> +		if (stripped < 0) {
> +			size = (int)stripped;
> +			goto out;
> +		}
> +
> +		size = stripped;
> +		buf = bf_stripped;
> +	}
> +
>  	if (record__comp_enabled(aio->rec)) {
>  		ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
>  						   mmap__mmap_len(map) - aio->size,
> @@ -432,6 +459,9 @@ static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size
>  
>  	aio->size += size;
>  
> +out:
> +	free(bf_stripped);
> +
>  	return size;
>  }
>  
> @@ -635,6 +665,38 @@ static int process_locked_synthesized_event(struct perf_tool *tool,
>  static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
>  {
>  	struct record *rec = to;
> +	int err;
> +	char *bf_stripped = NULL;
> +	size_t stripped;
> +
> +	if (rec->off_cpu) {
> +		/*
> +		 * We'll read all the events at once without masking.
> +		 * When reading the remainder from a map, the size is 0, because
> +		 * start is shifted to the end so no more data is to be read.
> +		 */
> +		if (size == 0)
> +			return 0;
> +
> +		map->core.start -= size;
> +		/* get the total size */
> +		size = map->core.end - map->core.start;
> +
> +		bf_stripped = malloc(size);
> +
> +		if (bf_stripped == NULL) {
> +			pr_err("Failed to allocate off-cpu strip buffer\n");
> +			return -ENOMEM;
> +		}
> +
> +		stripped = off_cpu_strip(rec->evlist, map, bf_stripped, size);
> +
> +		if (stripped < 0)
> +			return (int)stripped;
> +
> +		size = stripped;
> +		bf = bf_stripped;
> +	}
>  
>  	if (record__comp_enabled(rec)) {
>  		ssize_t compressed = zstd_compress(rec->session, map, map->data,
> @@ -648,7 +710,11 @@ static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
>  	}
>  
>  	thread->samples++;
> -	return record__write(rec, map, bf, size);
> +	err = record__write(rec, map, bf, size);
> +
> +	free(bf_stripped);
> +
> +	return err;
>  }
>  
>  static volatile sig_atomic_t signr = -1;
> @@ -1790,6 +1856,7 @@ record__finish_output(struct record *rec)
>  		if (rec->buildid_all)
>  			perf_session__dsos_hit_all(rec->session);
>  	}
> +
>  	perf_session__write_header(rec->session, rec->evlist, fd, true);
>  
>  	return;
> @@ -2501,6 +2568,14 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
>  		}
>  	}
>  
> +	if (rec->off_cpu) {
> +		err = record__config_off_cpu(rec);
> +		if (err) {
> +			pr_err("record__config_off_cpu failed, error %d\n", err);
> +			goto out_free_threads;
> +		}
> +	}
> +
>  	/*
>  	 * Normally perf_session__new would do this, but it doesn't have the
>  	 * evlist.
> @@ -2764,6 +2839,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
>  		 * disable events in this case.
>  		 */
>  		if (done && !disabled && !target__none(&opts->target)) {
> +			perf_hooks__invoke_record_done();
>  			trigger_off(&auxtrace_snapshot_trigger);
>  			evlist__disable(rec->evlist);
>  			disabled = true;
> @@ -2827,14 +2903,17 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
>  	} else
>  		status = err;
>  
> -	if (rec->off_cpu)
> -		rec->bytes_written += off_cpu_write(rec->session);
> -
>  	record__read_lost_samples(rec);
>  	record__synthesize(rec, true);
>  	/* this will be recalculated during process_buildids() */
>  	rec->samples = 0;
>  
> +	/* change to the correct sample type for parsing */
> +	if (rec->off_cpu && off_cpu_change_type(rec->evlist)) {
> +		pr_err("ERROR: Failed to change sample type for off-cpu event\n");
> +		goto out_delete_session;
> +	}
> +
>  	if (!err) {
>  		if (!rec->timestamp_filename) {
>  			record__finish_output(rec);
> @@ -3198,7 +3277,7 @@ static int switch_output_setup(struct record *rec)
>  	unsigned long val;
>  
>  	/*
> -	 * If we're using --switch-output-events, then we imply its 
> +	 * If we're using --switch-output-events, then we imply its
>  	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
>  	 *  thread to its parent.
>  	 */
> @@ -4221,9 +4300,14 @@ int cmd_record(int argc, const char **argv)
>  	}
>  
>  	if (rec->off_cpu) {
> -		err = record__config_off_cpu(rec);
> +		char off_cpu_event[64];
> +
> +		snprintf(off_cpu_event, sizeof(off_cpu_event),
> +			 "bpf-output/no-inherit=1,name=%s/", OFFCPU_EVENT);
> +
> +		err = parse_event(rec->evlist, off_cpu_event);
>  		if (err) {
> -			pr_err("record__config_off_cpu failed, error %d\n", err);
> +			pr_err("Failed to open off-cpu event\n");
>  			goto out;
>  		}
>  	}
> diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> index 3536404e9447..c08ae6a3c8d6 100644
> --- a/tools/perf/util/evsel.c
> +++ b/tools/perf/util/evsel.c
> @@ -1092,11 +1092,6 @@ static void evsel__set_default_freq_period(struct record_opts *opts,
>  	}
>  }
>  
> -static bool evsel__is_offcpu_event(struct evsel *evsel)
> -{
> -	return evsel__is_bpf_output(evsel) && evsel__name_is(evsel, OFFCPU_EVENT);
> -}
> -
>  /*
>   * The enable_on_exec/disabled value strategy:
>   *
> @@ -1363,9 +1358,6 @@ void evsel__config(struct evsel *evsel, struct record_opts *opts,
>  	if (evsel__is_dummy_event(evsel))
>  		evsel__reset_sample_bit(evsel, BRANCH_STACK);
>  
> -	if (evsel__is_offcpu_event(evsel))
> -		evsel->core.attr.sample_type &= OFFCPU_SAMPLE_TYPES;
> -
>  	arch__post_evsel_config(evsel, attr);
>  }
>  
> -- 
> 2.44.0
Namhyung Kim April 29, 2024, 10:06 p.m. UTC | #2
On Mon, Apr 29, 2024 at 11:53 AM Arnaldo Carvalho de Melo
<acme@kernel.org> wrote:
>
> On Mon, Apr 22, 2024 at 04:36:45PM +0800, Howard Chu wrote:
> > Parse off-cpu events using parse_event(). Change the placement of
> > record__config_off_cpu to after record__open because we need to write
> > mmapped fds into BPF's perf_event_array map, also, write
> > sample_id/sample_type into BPF. In record__pushfn and record__aio_pushfn,
> > handle off-cpu samples using off_cpu_strip. This is because the off-cpu
> > samples that we want to write to perf.data is in off-cpu samples' raw_data
> > section:
>
> Hey,
>
>         This lacks a cover letter and the chaining of patches so that b4
> can fetch the series.
>
>         Also all 5 patches have the same summary and different
> descriptions and contents, can you please rework the patch series, using
> 'git format-patch', and make the description reflect what each patch is
> doing?

He already sent out v2.

Thanks,
Namhyung

https://lore.kernel.org/r/20240424024805.144759-1-howardchu95@gmail.com/
diff mbox series

Patch

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 2ff718d3e202..c31b23905f1b 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -389,6 +389,8 @@  struct record_aio {
 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
 {
 	struct record_aio *aio = to;
+	char *bf_stripped = NULL;
+	size_t stripped;
 
 	/*
 	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
@@ -404,6 +406,31 @@  static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size
 	 * from the beginning of the kernel buffer till the end of the data chunk.
 	 */
 
+	if (aio->rec->off_cpu) {
+		if (size == 0)
+			return 0;
+
+		map->core.start -= size;
+		size = map->core.end - map->core.start;
+
+		bf_stripped = malloc(size);
+
+		if (bf_stripped == NULL) {
+			pr_err("Failed to allocate off-cpu strip buffer\n");
+			return -ENOMEM;
+		}
+
+		stripped = off_cpu_strip(aio->rec->evlist, map, bf_stripped, size);
+
+		if (stripped < 0) {
+			size = (int)stripped;
+			goto out;
+		}
+
+		size = stripped;
+		buf = bf_stripped;
+	}
+
 	if (record__comp_enabled(aio->rec)) {
 		ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
 						   mmap__mmap_len(map) - aio->size,
@@ -432,6 +459,9 @@  static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size
 
 	aio->size += size;
 
+out:
+	free(bf_stripped);
+
 	return size;
 }
 
@@ -635,6 +665,38 @@  static int process_locked_synthesized_event(struct perf_tool *tool,
 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
 {
 	struct record *rec = to;
+	int err;
+	char *bf_stripped = NULL;
+	size_t stripped;
+
+	if (rec->off_cpu) {
+		/*
+		 * We'll read all the events at once without masking.
+		 * When reading the remainder from a map, the size is 0, because
+		 * start is shifted to the end so no more data is to be read.
+		 */
+		if (size == 0)
+			return 0;
+
+		map->core.start -= size;
+		/* get the total size */
+		size = map->core.end - map->core.start;
+
+		bf_stripped = malloc(size);
+
+		if (bf_stripped == NULL) {
+			pr_err("Failed to allocate off-cpu strip buffer\n");
+			return -ENOMEM;
+		}
+
+		stripped = off_cpu_strip(rec->evlist, map, bf_stripped, size);
+
+		if (stripped < 0)
+			return (int)stripped;
+
+		size = stripped;
+		bf = bf_stripped;
+	}
 
 	if (record__comp_enabled(rec)) {
 		ssize_t compressed = zstd_compress(rec->session, map, map->data,
@@ -648,7 +710,11 @@  static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
 	}
 
 	thread->samples++;
-	return record__write(rec, map, bf, size);
+	err = record__write(rec, map, bf, size);
+
+	free(bf_stripped);
+
+	return err;
 }
 
 static volatile sig_atomic_t signr = -1;
@@ -1790,6 +1856,7 @@  record__finish_output(struct record *rec)
 		if (rec->buildid_all)
 			perf_session__dsos_hit_all(rec->session);
 	}
+
 	perf_session__write_header(rec->session, rec->evlist, fd, true);
 
 	return;
@@ -2501,6 +2568,14 @@  static int __cmd_record(struct record *rec, int argc, const char **argv)
 		}
 	}
 
+	if (rec->off_cpu) {
+		err = record__config_off_cpu(rec);
+		if (err) {
+			pr_err("record__config_off_cpu failed, error %d\n", err);
+			goto out_free_threads;
+		}
+	}
+
 	/*
 	 * Normally perf_session__new would do this, but it doesn't have the
 	 * evlist.
@@ -2764,6 +2839,7 @@  static int __cmd_record(struct record *rec, int argc, const char **argv)
 		 * disable events in this case.
 		 */
 		if (done && !disabled && !target__none(&opts->target)) {
+			perf_hooks__invoke_record_done();
 			trigger_off(&auxtrace_snapshot_trigger);
 			evlist__disable(rec->evlist);
 			disabled = true;
@@ -2827,14 +2903,17 @@  static int __cmd_record(struct record *rec, int argc, const char **argv)
 	} else
 		status = err;
 
-	if (rec->off_cpu)
-		rec->bytes_written += off_cpu_write(rec->session);
-
 	record__read_lost_samples(rec);
 	record__synthesize(rec, true);
 	/* this will be recalculated during process_buildids() */
 	rec->samples = 0;
 
+	/* change to the correct sample type for parsing */
+	if (rec->off_cpu && off_cpu_change_type(rec->evlist)) {
+		pr_err("ERROR: Failed to change sample type for off-cpu event\n");
+		goto out_delete_session;
+	}
+
 	if (!err) {
 		if (!rec->timestamp_filename) {
 			record__finish_output(rec);
@@ -3198,7 +3277,7 @@  static int switch_output_setup(struct record *rec)
 	unsigned long val;
 
 	/*
-	 * If we're using --switch-output-events, then we imply its 
+	 * If we're using --switch-output-events, then we imply its
 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
 	 *  thread to its parent.
 	 */
@@ -4221,9 +4300,14 @@  int cmd_record(int argc, const char **argv)
 	}
 
 	if (rec->off_cpu) {
-		err = record__config_off_cpu(rec);
+		char off_cpu_event[64];
+
+		snprintf(off_cpu_event, sizeof(off_cpu_event),
+			 "bpf-output/no-inherit=1,name=%s/", OFFCPU_EVENT);
+
+		err = parse_event(rec->evlist, off_cpu_event);
 		if (err) {
-			pr_err("record__config_off_cpu failed, error %d\n", err);
+			pr_err("Failed to open off-cpu event\n");
 			goto out;
 		}
 	}
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 3536404e9447..c08ae6a3c8d6 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1092,11 +1092,6 @@  static void evsel__set_default_freq_period(struct record_opts *opts,
 	}
 }
 
-static bool evsel__is_offcpu_event(struct evsel *evsel)
-{
-	return evsel__is_bpf_output(evsel) && evsel__name_is(evsel, OFFCPU_EVENT);
-}
-
 /*
  * The enable_on_exec/disabled value strategy:
  *
@@ -1363,9 +1358,6 @@  void evsel__config(struct evsel *evsel, struct record_opts *opts,
 	if (evsel__is_dummy_event(evsel))
 		evsel__reset_sample_bit(evsel, BRANCH_STACK);
 
-	if (evsel__is_offcpu_event(evsel))
-		evsel->core.attr.sample_type &= OFFCPU_SAMPLE_TYPES;
-
 	arch__post_evsel_config(evsel, attr);
 }