diff mbox series

libtracefs: Add ring buffer memory mapping APIs

Message ID 20240105152906.743d7e03@gandalf.local.home (mailing list archive)
State Superseded
Headers show
Series libtracefs: Add ring buffer memory mapping APIs | expand

Commit Message

Steven Rostedt Jan. 5, 2024, 8:29 p.m. UTC
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>

Add the following APIs:

 tracefs_cpu_open_mapped()
 tracefs_cpu_is_mapped()
 tracefs_cpu_map()
 tracefs_cpu_unmap()

This will allow applications to choose to memory map the tracing ring buffer
if it is supported. This will improve the performance of tracefs_cpu_read()
and tracefs_cpu_read_buf(), but it is not done by default because it will
also hurt the performance of tracefs_cpu_buffered_read() and
tracefs_cpu_buffered_read_buf() as those use splicing, and with the ring
buffer memory mapped, the splice has to do a copy instead of a copyless
subbuffer move.

Since this change relies on the libtraceevent APIs:

   kbuffer_dup()
   kbuffer_subbuffer()
   kbuffer_refresh()
   kbuffer_read_buffer()

Which are available after version 1.8, up the minimum version to 1.8.

Note, the samples and utest rely on:

   tep_get_sub_buffer_data_size()

which is in 1.8.1.

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
Changes since v1: https://lore.kernel.org/linux-trace-devel/20231228201100.78aae259@rorschach.local.home

- Update the kbuf current position to the reader.read on initial mapping (Vincent Donnefort)

- Change man page example to use "read_subbuf()" instead of "read_page()" (Vincent Donnefort)

 Documentation/libtracefs-cpu-map.txt | 194 ++++++++++++++++++++++++++
 Documentation/libtracefs.txt         |   7 +
 Makefile                             |   3 +-
 include/tracefs-local.h              |   6 +
 include/tracefs.h                    |   7 +
 samples/Makefile                     |   1 +
 src/Makefile                         |   1 +
 src/tracefs-mmap.c                   | 201 +++++++++++++++++++++++++++
 src/tracefs-record.c                 |  61 ++++++++
 9 files changed, 480 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/libtracefs-cpu-map.txt
 create mode 100644 src/tracefs-mmap.c

Comments

Steven Rostedt Jan. 6, 2024, 1:08 p.m. UTC | #1
On Fri, 5 Jan 2024 15:29:06 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
> 
> Add the following APIs:
> 
>  tracefs_cpu_open_mapped()
>  tracefs_cpu_is_mapped()
>  tracefs_cpu_map()
>  tracefs_cpu_unmap()
> 
> This will allow applications to choose to memory map the tracing ring buffer
> if it is supported. This will improve the performance of tracefs_cpu_read()
> and tracefs_cpu_read_buf(), but it is not done by default because it will
> also hurt the performance of tracefs_cpu_buffered_read() and
> tracefs_cpu_buffered_read_buf() as those use splicing, and with the ring
> buffer memory mapped, the splice has to do a copy instead of a copyless
> subbuffer move.
> 
> Since this change relies on the libtraceevent APIs:
> 
>    kbuffer_dup()
>    kbuffer_subbuffer()
>    kbuffer_refresh()
>    kbuffer_read_buffer()
> 
> Which are available after version 1.8, up the minimum version to 1.8.
> 
> Note, the samples and utest rely on:
> 
>    tep_get_sub_buffer_data_size()
> 
> which is in 1.8.1.
> 
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> ---
> Changes since v1: https://lore.kernel.org/linux-trace-devel/20231228201100.78aae259@rorschach.local.home

Updated the subject to say this is v2 :-p

-- Steve
Vincent Donnefort Jan. 8, 2024, 2:25 p.m. UTC | #2
[...]

> +/**
> + * trace_mmap - try to mmap the ring buffer
> + * @fd: The file descriptor to the trace_pipe_raw file
> + * @kbuf: The kbuffer to load the subbuffer to
> + *
> + * Will try to mmap the ring buffer if it is supported, and
> + * if not, will return NULL, otherwise it returns a descriptor
> + * to handle the mapping.
> + */
> +__hidden void *trace_mmap(int fd, struct kbuffer *kbuf)
> +{
> +	struct trace_mmap *tmap;
> +	int page_size;
> +	void *meta;
> +	void *data;
> +
> +	page_size = getpagesize();
> +	meta = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
> +	if (meta == MAP_FAILED)
> +		return NULL;
> +
> +	tmap = calloc(1, sizeof(*tmap));
> +	if (!tmap) {
> +		munmap(meta, page_size);
> +		return NULL;
> +	}
> +
> +	tmap->kbuf = kbuffer_dup(kbuf);
> +	if (!tmap->kbuf) {
> +		munmap(meta, page_size);
> +		free(tmap);
> +	}
> +
> +	tmap->fd = fd;
> +
> +	tmap->map = meta;
> +	tmap->meta_len = tmap->map->meta_page_size;
> +
> +	if (tmap->meta_len > page_size) {
> +		munmap(meta, page_size);
> +		meta = mmap(NULL, tmap->meta_len, PROT_READ, MAP_SHARED, fd, 0);
> +		if (meta == MAP_FAILED) {
> +			kbuffer_free(tmap->kbuf);
> +			free(tmap);
> +			return NULL;
> +		}
> +		tmap->map = meta;
> +	}
> +
> +	tmap->data_pages = meta + tmap->meta_len;
> +
> +	tmap->data_len = tmap->map->subbuf_size * tmap->map->nr_subbufs;
> +
> +	tmap->data = mmap(NULL, tmap->data_len, PROT_READ, MAP_SHARED,
> +			  fd, tmap->meta_len);
> +	if (tmap->data == MAP_FAILED) {
> +		munmap(meta, tmap->meta_len);
> +		kbuffer_free(tmap->kbuf);
> +		free(tmap);
> +		return NULL;
> +	}
> +
> +	tmap->last_idx = tmap->map->reader.id;
> +
> +	data = tmap->data + tmap->map->subbuf_size * tmap->last_idx;
> +	kbuffer_load_subbuffer(kbuf, data);
> +
> +	/*
> +	 * The page could have left over data on it that was already
> +	 * consumed. Move the "read" forward in that case.
> +	 */
> +	if (tmap->map->reader.read) {
> +		int size = kbuffer_start_of_data(kbuf) + tmap->map->reader.read;
> +		char tmpbuf[size];
> +		kbuffer_read_buffer(kbuf, tmpbuf, size);

It does not seem to update the kbuf timestamp. To observe the problem I did:

### Create few events on the page

$ echo 0 > /sys/kernel/tracing/trace
$ <read ring-buffer>
$ cat /proc/uptime | awk '{print $1}' > /sys/kernel/debug/tracing/trace_marker
<...>-2305 279515.453542096     print: tracing_mark_write: 279515.33
<...>-2307 279522.090413680     print: tracing_mark_write: 279521.97
<...>-2309 279522.960932976     print: tracing_mark_write: 279522.85
$ <unmap and close ring-buffer>


### Re-map again the ring-buffer to trigger the fast-forward

$ <read ring-buffer>
  before fast-forward kbuf->timestamp=279515453542096
  after fast-forward kbuf->timestamp=279515453542096
$ cat /proc/uptime | awk '{print $1}' > /sys/kernel/debug/tracing/trace_marker
<...>-2312 279549.725524688     print: tracing_mark_write: 279557.12

The timestamp above is a few seconds off, which I believe might be due to an
outdated kbuf->timestamp.

> +	}
> +
> +	return tmap;
> +}
> +
> +__hidden void trace_unmap(void *mapping)
> +{
> +	struct trace_mmap *tmap = mapping;
> +
> +	munmap(tmap->data, tmap->data_len);
> +	munmap(tmap->map, tmap->meta_len);
> +	kbuffer_free(tmap->kbuf);
> +	free(tmap);
> +}
> +
Steven Rostedt Jan. 8, 2024, 5:16 p.m. UTC | #3
On Mon, 8 Jan 2024 14:25:03 +0000
Vincent Donnefort <vdonnefort@google.com> wrote:

> [...]
> 
> > +/**
> > + * trace_mmap - try to mmap the ring buffer
> > + * @fd: The file descriptor to the trace_pipe_raw file
> > + * @kbuf: The kbuffer to load the subbuffer to
> > + *
> > + * Will try to mmap the ring buffer if it is supported, and
> > + * if not, will return NULL, otherwise it returns a descriptor
> > + * to handle the mapping.
> > + */
> > +__hidden void *trace_mmap(int fd, struct kbuffer *kbuf)
> > +{
> > +	struct trace_mmap *tmap;
> > +	int page_size;
> > +	void *meta;
> > +	void *data;
> > +
> > +	page_size = getpagesize();
> > +	meta = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
> > +	if (meta == MAP_FAILED)
> > +		return NULL;
> > +
> > +	tmap = calloc(1, sizeof(*tmap));
> > +	if (!tmap) {
> > +		munmap(meta, page_size);
> > +		return NULL;
> > +	}
> > +
> > +	tmap->kbuf = kbuffer_dup(kbuf);
> > +	if (!tmap->kbuf) {
> > +		munmap(meta, page_size);
> > +		free(tmap);
> > +	}
> > +
> > +	tmap->fd = fd;
> > +
> > +	tmap->map = meta;
> > +	tmap->meta_len = tmap->map->meta_page_size;
> > +
> > +	if (tmap->meta_len > page_size) {
> > +		munmap(meta, page_size);
> > +		meta = mmap(NULL, tmap->meta_len, PROT_READ, MAP_SHARED, fd, 0);
> > +		if (meta == MAP_FAILED) {
> > +			kbuffer_free(tmap->kbuf);
> > +			free(tmap);
> > +			return NULL;
> > +		}
> > +		tmap->map = meta;
> > +	}
> > +
> > +	tmap->data_pages = meta + tmap->meta_len;
> > +
> > +	tmap->data_len = tmap->map->subbuf_size * tmap->map->nr_subbufs;
> > +
> > +	tmap->data = mmap(NULL, tmap->data_len, PROT_READ, MAP_SHARED,
> > +			  fd, tmap->meta_len);
> > +	if (tmap->data == MAP_FAILED) {
> > +		munmap(meta, tmap->meta_len);
> > +		kbuffer_free(tmap->kbuf);
> > +		free(tmap);
> > +		return NULL;
> > +	}
> > +
> > +	tmap->last_idx = tmap->map->reader.id;
> > +
> > +	data = tmap->data + tmap->map->subbuf_size * tmap->last_idx;
> > +	kbuffer_load_subbuffer(kbuf, data);
> > +
> > +	/*
> > +	 * The page could have left over data on it that was already
> > +	 * consumed. Move the "read" forward in that case.
> > +	 */
> > +	if (tmap->map->reader.read) {
> > +		int size = kbuffer_start_of_data(kbuf) + tmap->map->reader.read;
> > +		char tmpbuf[size];
> > +		kbuffer_read_buffer(kbuf, tmpbuf, size);  
> 
> It does not seem to update the kbuf timestamp. To observe the problem I did:
> 
> ### Create few events on the page
> 
> $ echo 0 > /sys/kernel/tracing/trace
> $ <read ring-buffer>
> $ cat /proc/uptime | awk '{print $1}' > /sys/kernel/debug/tracing/trace_marker
> <...>-2305 279515.453542096     print: tracing_mark_write: 279515.33
> <...>-2307 279522.090413680     print: tracing_mark_write: 279521.97
> <...>-2309 279522.960932976     print: tracing_mark_write: 279522.85
> $ <unmap and close ring-buffer>
> 
> 
> ### Re-map again the ring-buffer to trigger the fast-forward
> 
> $ <read ring-buffer>
>   before fast-forward kbuf->timestamp=279515453542096
>   after fast-forward kbuf->timestamp=279515453542096
> $ cat /proc/uptime | awk '{print $1}' > /sys/kernel/debug/tracing/trace_marker
> <...>-2312 279549.725524688     print: tracing_mark_write: 279557.12
> 
> The timestamp above is a few seconds off, which I believe might be due to an
> outdated kbuf->timestamp.
> 

Bah, it looks like we can't just simply copy. Can you try this patch?

-- Steve

diff --git a/src/kbuffer-parse.c b/src/kbuffer-parse.c
index 1e1d168..192925a 100644
--- a/src/kbuffer-parse.c
+++ b/src/kbuffer-parse.c
@@ -961,19 +961,12 @@ kbuffer_raw_get(struct kbuffer *kbuf, void *subbuf, struct kbuffer_raw_info *inf
  */
 int kbuffer_read_buffer(struct kbuffer *kbuf, void *buffer, int len)
 {
-	int subbuf_size = kbuf->start + kbuf->size;
 	unsigned long long ts;
 	unsigned int type_len_ts;
 	bool do_swap = false;
 	int last_next;
 	int save_curr;
 
-	if (!kbuf->curr && len >= subbuf_size) {
-		memcpy(buffer, kbuf->subbuffer, subbuf_size);
-		set_curr_to_end(kbuf);
-		return kbuf->size;
-	}
-
 	/* Are we at the end of the buffer */
 	if (kbuf->curr >= kbuf->size)
 		return 0;
@@ -996,24 +989,13 @@ int kbuffer_read_buffer(struct kbuffer *kbuf, void *buffer, int len)
 
 	save_curr = kbuf->curr;
 
-	/* Copy the rest of the buffer if it fits */
-	if (len >= kbuf->size - kbuf->curr) {
-		set_curr_to_end(kbuf);
-		last_next = kbuf->size;
-	} else {
-		/*
-		 * The length doesn't hold the rest,
-		 * need to find the last that fits
-		 */
+	/* Due to timestamps, we must save the current next to use */
+	last_next = kbuf->next;
 
-		/* Due to timestamps, we must save the current next to use */
+	while (len >= kbuf->next - save_curr) {
 		last_next = kbuf->next;
-
-		while (len >= kbuf->next - save_curr) {
-			last_next = kbuf->next;
-			if (!kbuffer_next_event(kbuf, &ts))
-				break;
-		}
+		if (!kbuffer_next_event(kbuf, &ts))
+			break;
 	}
 
 	len = last_next - save_curr;
Vincent Donnefort Jan. 8, 2024, 5:34 p.m. UTC | #4
On Mon, Jan 08, 2024 at 12:16:25PM -0500, Steven Rostedt wrote:
> On Mon, 8 Jan 2024 14:25:03 +0000
> Vincent Donnefort <vdonnefort@google.com> wrote:
> 
> > [...]
> > 
> > > +/**
> > > + * trace_mmap - try to mmap the ring buffer
> > > + * @fd: The file descriptor to the trace_pipe_raw file
> > > + * @kbuf: The kbuffer to load the subbuffer to
> > > + *
> > > + * Will try to mmap the ring buffer if it is supported, and
> > > + * if not, will return NULL, otherwise it returns a descriptor
> > > + * to handle the mapping.
> > > + */
> > > +__hidden void *trace_mmap(int fd, struct kbuffer *kbuf)
> > > +{
> > > +	struct trace_mmap *tmap;
> > > +	int page_size;
> > > +	void *meta;
> > > +	void *data;
> > > +
> > > +	page_size = getpagesize();
> > > +	meta = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
> > > +	if (meta == MAP_FAILED)
> > > +		return NULL;
> > > +
> > > +	tmap = calloc(1, sizeof(*tmap));
> > > +	if (!tmap) {
> > > +		munmap(meta, page_size);
> > > +		return NULL;
> > > +	}
> > > +
> > > +	tmap->kbuf = kbuffer_dup(kbuf);
> > > +	if (!tmap->kbuf) {
> > > +		munmap(meta, page_size);
> > > +		free(tmap);
> > > +	}
> > > +
> > > +	tmap->fd = fd;
> > > +
> > > +	tmap->map = meta;
> > > +	tmap->meta_len = tmap->map->meta_page_size;
> > > +
> > > +	if (tmap->meta_len > page_size) {
> > > +		munmap(meta, page_size);
> > > +		meta = mmap(NULL, tmap->meta_len, PROT_READ, MAP_SHARED, fd, 0);
> > > +		if (meta == MAP_FAILED) {
> > > +			kbuffer_free(tmap->kbuf);
> > > +			free(tmap);
> > > +			return NULL;
> > > +		}
> > > +		tmap->map = meta;
> > > +	}
> > > +
> > > +	tmap->data_pages = meta + tmap->meta_len;
> > > +
> > > +	tmap->data_len = tmap->map->subbuf_size * tmap->map->nr_subbufs;
> > > +
> > > +	tmap->data = mmap(NULL, tmap->data_len, PROT_READ, MAP_SHARED,
> > > +			  fd, tmap->meta_len);
> > > +	if (tmap->data == MAP_FAILED) {
> > > +		munmap(meta, tmap->meta_len);
> > > +		kbuffer_free(tmap->kbuf);
> > > +		free(tmap);
> > > +		return NULL;
> > > +	}
> > > +
> > > +	tmap->last_idx = tmap->map->reader.id;
> > > +
> > > +	data = tmap->data + tmap->map->subbuf_size * tmap->last_idx;
> > > +	kbuffer_load_subbuffer(kbuf, data);
> > > +
> > > +	/*
> > > +	 * The page could have left over data on it that was already
> > > +	 * consumed. Move the "read" forward in that case.
> > > +	 */
> > > +	if (tmap->map->reader.read) {
> > > +		int size = kbuffer_start_of_data(kbuf) + tmap->map->reader.read;
> > > +		char tmpbuf[size];
> > > +		kbuffer_read_buffer(kbuf, tmpbuf, size);  
> > 
> > It does not seem to update the kbuf timestamp. To observe the problem I did:
> > 
> > ### Create few events on the page
> > 
> > $ echo 0 > /sys/kernel/tracing/trace
> > $ <read ring-buffer>
> > $ cat /proc/uptime | awk '{print $1}' > /sys/kernel/debug/tracing/trace_marker
> > <...>-2305 279515.453542096     print: tracing_mark_write: 279515.33
> > <...>-2307 279522.090413680     print: tracing_mark_write: 279521.97
> > <...>-2309 279522.960932976     print: tracing_mark_write: 279522.85
> > $ <unmap and close ring-buffer>
> > 
> > 
> > ### Re-map again the ring-buffer to trigger the fast-forward
> > 
> > $ <read ring-buffer>
> >   before fast-forward kbuf->timestamp=279515453542096
> >   after fast-forward kbuf->timestamp=279515453542096
> > $ cat /proc/uptime | awk '{print $1}' > /sys/kernel/debug/tracing/trace_marker
> > <...>-2312 279549.725524688     print: tracing_mark_write: 279557.12
> > 
> > The timestamp above is a few seconds off, which I believe might be due to an
> > outdated kbuf->timestamp.
> > 
> 
> Bah, it looks like we can't just simply copy. Can you try this patch?

That works!

> 
> -- Steve
> 
> diff --git a/src/kbuffer-parse.c b/src/kbuffer-parse.c
> index 1e1d168..192925a 100644
> --- a/src/kbuffer-parse.c
> +++ b/src/kbuffer-parse.c
> @@ -961,19 +961,12 @@ kbuffer_raw_get(struct kbuffer *kbuf, void *subbuf, struct kbuffer_raw_info *inf
>   */
>  int kbuffer_read_buffer(struct kbuffer *kbuf, void *buffer, int len)
>  {
> -	int subbuf_size = kbuf->start + kbuf->size;
>  	unsigned long long ts;
>  	unsigned int type_len_ts;
>  	bool do_swap = false;
>  	int last_next;
>  	int save_curr;
>  
> -	if (!kbuf->curr && len >= subbuf_size) {
> -		memcpy(buffer, kbuf->subbuffer, subbuf_size);
> -		set_curr_to_end(kbuf);
> -		return kbuf->size;
> -	}
> -
>  	/* Are we at the end of the buffer */
>  	if (kbuf->curr >= kbuf->size)
>  		return 0;
> @@ -996,24 +989,13 @@ int kbuffer_read_buffer(struct kbuffer *kbuf, void *buffer, int len)
>  
>  	save_curr = kbuf->curr;
>  
> -	/* Copy the rest of the buffer if it fits */
> -	if (len >= kbuf->size - kbuf->curr) {
> -		set_curr_to_end(kbuf);
> -		last_next = kbuf->size;
> -	} else {
> -		/*
> -		 * The length doesn't hold the rest,
> -		 * need to find the last that fits
> -		 */
> +	/* Due to timestamps, we must save the current next to use */
> +	last_next = kbuf->next;
>  
> -		/* Due to timestamps, we must save the current next to use */
> +	while (len >= kbuf->next - save_curr) {
>  		last_next = kbuf->next;
> -
> -		while (len >= kbuf->next - save_curr) {
> -			last_next = kbuf->next;
> -			if (!kbuffer_next_event(kbuf, &ts))
> -				break;
> -		}
> +		if (!kbuffer_next_event(kbuf, &ts))
> +			break;
>  	}
>  
>  	len = last_next - save_curr;
Vincent Donnefort Jan. 23, 2024, 9:52 a.m. UTC | #5
[...]

> +/**
> + * trace_mmap - try to mmap the ring buffer
> + * @fd: The file descriptor to the trace_pipe_raw file
> + * @kbuf: The kbuffer to load the subbuffer to
> + *
> + * Will try to mmap the ring buffer if it is supported, and
> + * if not, will return NULL, otherwise it returns a descriptor
> + * to handle the mapping.
> + */
> +__hidden void *trace_mmap(int fd, struct kbuffer *kbuf)
> +{
> +	struct trace_mmap *tmap;
> +	int page_size;
> +	void *meta;
> +	void *data;
> +
> +	page_size = getpagesize();
> +	meta = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
> +	if (meta == MAP_FAILED)
> +		return NULL;
> +
> +	tmap = calloc(1, sizeof(*tmap));
> +	if (!tmap) {
> +		munmap(meta, page_size);
> +		return NULL;
> +	}
> +
> +	tmap->kbuf = kbuffer_dup(kbuf);
> +	if (!tmap->kbuf) {
> +		munmap(meta, page_size);
> +		free(tmap);
> +	}
> +
> +	tmap->fd = fd;
> +
> +	tmap->map = meta;
> +	tmap->meta_len = tmap->map->meta_page_size;
> +
> +	if (tmap->meta_len > page_size) {
> +		munmap(meta, page_size);
> +		meta = mmap(NULL, tmap->meta_len, PROT_READ, MAP_SHARED, fd, 0);
> +		if (meta == MAP_FAILED) {
> +			kbuffer_free(tmap->kbuf);
> +			free(tmap);
> +			return NULL;
> +		}
> +		tmap->map = meta;
> +	}
> +
> +	tmap->data_pages = meta + tmap->meta_len;
> +
> +	tmap->data_len = tmap->map->subbuf_size * tmap->map->nr_subbufs;
> +
> +	tmap->data = mmap(NULL, tmap->data_len, PROT_READ, MAP_SHARED,
> +			  fd, tmap->meta_len);
> +	if (tmap->data == MAP_FAILED) {
> +		munmap(meta, tmap->meta_len);
> +		kbuffer_free(tmap->kbuf);
> +		free(tmap);
> +		return NULL;
> +	}
> +
> +	tmap->last_idx = tmap->map->reader.id;
> +
> +	data = tmap->data + tmap->map->subbuf_size * tmap->last_idx;
> +	kbuffer_load_subbuffer(kbuf, data);
> +
> +	/*
> +	 * The page could have left over data on it that was already
> +	 * consumed. Move the "read" forward in that case.
> +	 */
> +	if (tmap->map->reader.read) {
> +		int size = kbuffer_start_of_data(kbuf) + tmap->map->reader.read;
> +		char tmpbuf[size];
> +		kbuffer_read_buffer(kbuf, tmpbuf, size);
> +	}

We're fast-forwarding tmap->kbuf here. But in tracefs_cpu_read_buf(), we are
using tcpu->kbuf. So overall, it seems this has no effect on what will be read
later.

Sorry, not sure how I missed that when I worked with those changes before.

[...]
Steven Rostedt Jan. 23, 2024, 3:15 p.m. UTC | #6
On Tue, 23 Jan 2024 09:52:55 +0000
Vincent Donnefort <vdonnefort@google.com> wrote:

> [...]
> 

> > +	data = tmap->data + tmap->map->subbuf_size * tmap->last_idx;
> > +	kbuffer_load_subbuffer(kbuf, data);

Only the tmap->kbuf is loaded above.


> > +
> > +	/*
> > +	 * The page could have left over data on it that was already
> > +	 * consumed. Move the "read" forward in that case.
> > +	 */
> > +	if (tmap->map->reader.read) {
> > +		int size = kbuffer_start_of_data(kbuf) + tmap->map->reader.read;
> > +		char tmpbuf[size];
> > +		kbuffer_read_buffer(kbuf, tmpbuf, size);
> > +	}  
> 
> We're fast-forwarding tmap->kbuf here. But in tracefs_cpu_read_buf(), we are
> using tcpu->kbuf. So overall, it seems this has no effect on what will be read
> later.
> 
> Sorry, not sure how I missed that when I worked with those changes before.

So we would need to add this in trace_mmap_load_subbuf()?


	if (data != kbuffer_subbuffer(kbuf)) {
		kbuffer_load_subbuffer(kbuf, data);
+		/* Move the read pointer forward if need be */
+		if (kbuffer_curr_index(tmap->kbuf)) {
+			int size = kbuffer_curr_index(tmap->kbuf);
+			char tmpbuf[size];
+			kbuffer_read_buffer(kbuf, tmpbuf, size);
+		}
		
		return 1;
	}

I think that could solve that.

-- Steve
diff mbox series

Patch

diff --git a/Documentation/libtracefs-cpu-map.txt b/Documentation/libtracefs-cpu-map.txt
new file mode 100644
index 000000000000..dd5981f32daf
--- /dev/null
+++ b/Documentation/libtracefs-cpu-map.txt
@@ -0,0 +1,194 @@ 
+libtracefs(3)
+=============
+
+NAME
+----
+tracefs_cpu_open_mapped, tracefs_cpu_is_mapped, tracefs_cpu_map, tracefs_cpu_unmap - Memory mapping of the ring buffer
+
+SYNOPSIS
+--------
+[verse]
+--
+*#include <tracefs.h>*
+
+bool *tracefs_cpu_is_mapped*(struct tracefs_cpu pass:[*]tcpu);
+int *tracefs_cpu_map*(struct tracefs_cpu pass:[*]tcpu);
+void *tracefs_cpu_unmap*(struct tracefs_cpu pass:[*]tcpu);
+struct tracefs_cpu pass:[*]*tracefs_cpu_open_mapped*(struct tracefs_instance pass:[*]instance,
+					    int cpu, bool nonblock);
+--
+
+DESCRIPTION
+-----------
+If the trace_pipe_raw supports memory mapping, this is usually a more efficient
+method to stream data from the kernel ring buffer than by reading it, as it does
+not require copying the memory that is being read.
+
+If memory mapping is supported by the kernel and the application asks to use the
+memory mapping via either *tracefs_cpu_map()* or by *tracefs_cpu_open_mapped()*
+then the functions *tracefs_cpu_read*(3) and *tracefs_cpu_read_buf*(3) will use
+the mapping directly instead of calling the read system call.
+
+Note, mapping can also slow down *tracefs_cpu_buffered_read*(3) and
+*tracefs_cpu_buffered_read_buf*(3), as those use splice piping and when the
+kernel ring buffer is memory mapped, splice does a copy instead of using the
+ring buffer directly. Thus care must be taken when deciding whether to map the
+ring buffer, which is also why it does not get mapped by default.
+
+The *tracefs_cpu_is_mapped()* function will return true if _tcpu_ currently has
+its ring buffer memory mapped and false otherwise. This does not return whether or
+not the kernel supports memory mapping, but that can usually be determined
+by calling *tracefs_cpu_map()*.
+
+The *tracefs_cpu_map()* function will attempt to map the ring buffer associated
+to _tcpu_ if it is not already mapped.
+
+The *tracefs_cpu_unmap()* function will unmap the ring buffer associated to
+_tcpu_ if it is mapped.
+
+The *tracefs_cpu_open_mapped()* is equivalent to calling *tracefs_cpu_open*(3) followed
+by *tracefs_cpu_map()* on the returned _tcpu_ of *tracefs_cpu_open*(3). Note, this
+will still succeed if the mapping fails, in which case it acts the same as
+*tracefs_cpu_open*(3). To know whether the mapping succeeded or not, *tracefs_cpu_is_mapped()*
+should be called on the returned _tcpu_.
+
+RETURN VALUE
+------------
+*tracefs_cpu_is_mapped()* returns true if the given _tcpu_ has its ring buffer
+memory mapped or false otherwise.
+
+*tracefs_cpu_map()* returns 0 on success and -1 on error in mapping. If 0 is
+returned then *tracefs_cpu_is_mapped()* will return true afterward; it will
+return false if the mapping failed.
+
+*tracefs_cpu_open_mapped()* returns an allocated tracefs_cpu on success of creation
+regardless of whether it succeeded in mapping the ring buffer or not. It returns NULL for
+the same reasons *tracefs_cpu_open*(3) returns NULL. If success of mapping is
+to be known, then calling *tracefs_cpu_is_mapped()* afterward is required.
+
+EXAMPLE
+-------
+[source,c]
+--
+#include <stdlib.h>
+#include <ctype.h>
+#include <tracefs.h>
+
+static void read_subbuf(struct tep_handle *tep, struct kbuffer *kbuf)
+{
+	static struct trace_seq seq;
+	struct tep_record record;
+
+	if (seq.buffer)
+		trace_seq_reset(&seq);
+	else
+		trace_seq_init(&seq);
+
+	while ((record.data = kbuffer_read_event(kbuf, &record.ts))) {
+		record.size = kbuffer_event_size(kbuf);
+		kbuffer_next_event(kbuf, NULL);
+		tep_print_event(tep, &seq, &record,
+				"%s-%d %9d\t%s: %s\n",
+				TEP_PRINT_COMM,
+				TEP_PRINT_PID,
+				TEP_PRINT_TIME,
+				TEP_PRINT_NAME,
+				TEP_PRINT_INFO);
+		trace_seq_do_printf(&seq);
+		trace_seq_reset(&seq);
+	}
+}
+
+int main (int argc, char **argv)
+{
+	struct tracefs_cpu *tcpu;
+	struct tep_handle *tep;
+	struct kbuffer *kbuf;
+	bool mapped;
+	int cpu;
+
+	if (argc < 2 || !isdigit(argv[1][0])) {
+		printf("usage: %s cpu\n\n", argv[0]);
+		exit(-1);
+	}
+
+	cpu = atoi(argv[1]);
+
+	tep = tracefs_local_events(NULL);
+	if (!tep) {
+		perror("Reading trace event formats");
+		exit(-1);
+	}
+
+	tcpu = tracefs_cpu_open_mapped(NULL, cpu, 0);
+	if (!tcpu) {
+		perror("Open CPU 0 file");
+		exit(-1);
+	}
+
+	/*
+	 * If this kernel supports mapping, use normal read,
+	 * otherwise use the piped buffer read.
+	 */
+	mapped = tracefs_cpu_is_mapped(tcpu);
+	if (!mapped)
+		printf("Was not able to map, falling back to buffered read\n");
+	while ((kbuf = mapped ? tracefs_cpu_read_buf(tcpu, true) :
+			tracefs_cpu_buffered_read_buf(tcpu, true))) {
+		read_subbuf(tep, kbuf);
+	}
+
+	kbuf = tracefs_cpu_flush_buf(tcpu);
+	if (kbuf)
+		read_subbuf(tep, kbuf);
+
+	tracefs_cpu_close(tcpu);
+	tep_free(tep);
+
+	return 0;
+}
+--
+
+FILES
+-----
+[verse]
+--
+*tracefs.h*
+	Header file to include in order to have access to the library APIs.
+*-ltracefs*
+	Linker switch to add when building a program that uses the library.
+--
+
+SEE ALSO
+--------
+*tracefs_cpu_open*(3),
+*tracefs_cpu_read*(3),
+*tracefs_cpu_read_buf*(3),
+*tracefs_cpu_buffered_read*(3),
+*tracefs_cpu_buffered_read_buf*(3),
+*libtracefs*(3),
+*libtraceevent*(3),
+*trace-cmd*(1)
+
+AUTHOR
+------
+[verse]
+--
+*Steven Rostedt* <rostedt@goodmis.org>
+--
+REPORTING BUGS
+--------------
+Report bugs to  <linux-trace-devel@vger.kernel.org>
+
+LICENSE
+-------
+libtracefs is Free Software licensed under the GNU LGPL 2.1
+
+RESOURCES
+---------
+https://git.kernel.org/pub/scm/libs/libtrace/libtracefs.git/
+
+COPYING
+-------
+Copyright \(C) 2022 Google, Inc. Free use of this software is granted under
+the terms of the GNU Public License (GPL).
diff --git a/Documentation/libtracefs.txt b/Documentation/libtracefs.txt
index b0aaa6222ec7..8dc3ba7386e3 100644
--- a/Documentation/libtracefs.txt
+++ b/Documentation/libtracefs.txt
@@ -138,6 +138,13 @@  Trace stream:
 	ssize_t *tracefs_trace_pipe_print*(struct tracefs_instance pass:[*]_instance_, int _flags_);
 	void *tracefs_trace_pipe_stop*(struct tracefs_instance pass:[*]_instance_);
 
+Memory mapping the ring buffer:
+	bool *tracefs_cpu_is_mapped*(struct tracefs_cpu pass:[*]tcpu);
+	int *tracefs_cpu_map*(struct tracefs_cpu pass:[*]tcpu);
+	void *tracefs_cpu_unmap*(struct tracefs_cpu pass:[*]tcpu);
+	struct tracefs_cpu pass:[*]*tracefs_cpu_open_mapped*(struct tracefs_instance pass:[*]instance,
+						int cpu, bool nonblock);
+
 Trace options:
 	const struct tracefs_options_mask pass:[*]*tracefs_options_get_supported*(struct tracefs_instance pass:[*]_instance_);
 	bool *tracefs_option_is_supported*(struct tracefs_instance pass:[*]_instance_, enum tracefs_option_id _id_);
diff --git a/Makefile b/Makefile
index 80076aa2cff3..e915e14b74e6 100644
--- a/Makefile
+++ b/Makefile
@@ -10,7 +10,8 @@  export TFS_PATCHLEVEL
 export TFS_EXTRAVERSION
 export TRACEFS_VERSION
 
-LIBTRACEEVENT_MIN_VERSION = 1.3
+# Note, samples and utests need 1.8.1
+LIBTRACEEVENT_MIN_VERSION = 1.8
 
 # taken from trace-cmd
 MAKEFLAGS += --no-print-directory
diff --git a/include/tracefs-local.h b/include/tracefs-local.h
index 9cae73c8b806..ffc9d33b1796 100644
--- a/include/tracefs-local.h
+++ b/include/tracefs-local.h
@@ -6,6 +6,7 @@ 
 #ifndef _TRACE_FS_LOCAL_H
 #define _TRACE_FS_LOCAL_H
 
+#include <tracefs.h>
 #include <pthread.h>
 
 #define __hidden __attribute__((visibility ("hidden")))
@@ -116,6 +117,11 @@  int trace_append_filter(char **filter, unsigned int *state,
 			enum tracefs_compare compare,
 			 const char *val);
 
+void *trace_mmap(int fd, struct kbuffer *kbuf);
+void trace_unmap(void *mapping);
+int trace_mmap_load_subbuf(void *mapping, struct kbuffer *kbuf);
+int trace_mmap_read(void *mapping, void *buffer);
+
 struct tracefs_synth *synth_init_from(struct tep_handle *tep,
 				      const char *start_system,
 				      const char *start_event);
diff --git a/include/tracefs.h b/include/tracefs.h
index 989112c851c8..8569171247b7 100644
--- a/include/tracefs.h
+++ b/include/tracefs.h
@@ -693,6 +693,13 @@  int tracefs_snapshot_snap(struct tracefs_instance *instance);
 int tracefs_snapshot_clear(struct tracefs_instance *instance);
 int tracefs_snapshot_free(struct tracefs_instance *instance);
 
+/* Memory mapping of ring buffer */
+bool tracefs_cpu_is_mapped(struct tracefs_cpu *tcpu);
+int tracefs_cpu_map(struct tracefs_cpu *tcpu);
+void tracefs_cpu_unmap(struct tracefs_cpu *tcpu);
+struct tracefs_cpu *tracefs_cpu_open_mapped(struct tracefs_instance *instance,
+					    int cpu, bool nonblock);
+
 /* Mapping vsocket cids to pids using tracing */
 int tracefs_instance_find_cid_pid(struct tracefs_instance *instance, int cid);
 int tracefs_find_cid_pid(int cid);
diff --git a/samples/Makefile b/samples/Makefile
index 77739c8b0aa7..81c8006f823e 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -26,6 +26,7 @@  EXAMPLES += guest
 EXAMPLES += cpu-buf
 EXAMPLES += instances-stat
 EXAMPLES += instances-subbuf
+EXAMPLES += cpu-map
 
 TARGETS :=
 TARGETS += sqlhist
diff --git a/src/Makefile b/src/Makefile
index faa3b25c4002..be81059ce10a 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -16,6 +16,7 @@  OBJS += tracefs-dynevents.o
 OBJS += tracefs-eprobes.o
 OBJS += tracefs-uprobes.o
 OBJS += tracefs-record.o
+OBJS += tracefs-mmap.o
 ifeq ($(VSOCK_DEFINED), 1)
 OBJS += tracefs-vsock.o
 endif
diff --git a/src/tracefs-mmap.c b/src/tracefs-mmap.c
new file mode 100644
index 000000000000..5bed234189bf
--- /dev/null
+++ b/src/tracefs-mmap.c
@@ -0,0 +1,201 @@ 
+// SPDX-License-Identifier: LGPL-2.1
+/*
+ * Copyright (C) 2023 Google Inc, Steven Rostedt <rostedt@goodmis.org>
+ */
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <asm/types.h>
+#include "tracefs-local.h"
+
+/*
+ * Layout of the meta page that trace_mmap() maps read-only at offset 0 of
+ * the trace_pipe_raw file descriptor. The fields are read directly from
+ * that mapping, so the layout must not be changed here.
+ * NOTE(review): presumably this mirrors the kernel's definition of the
+ * ring buffer meta page - confirm against the kernel headers.
+ */
+struct trace_buffer_meta {
+	unsigned long	entries;
+	unsigned long	overrun;
+	unsigned long	read;
+
+	unsigned long	subbufs_touched;
+	unsigned long	subbufs_lost;
+	unsigned long	subbufs_read;
+
+	struct {
+		unsigned long	lost_events;	/* Events lost at the time of the reader swap */
+		__u32		id;		/* Reader subbuf ID from 0 to nr_subbufs - 1 */
+		__u32		read;		/* Number of bytes read on the reader subbuf */
+	} reader;
+
+	__u32		subbuf_size;		/* Size of each subbuf including the header */
+	__u32		nr_subbufs;		/* Number of subbufs in the ring-buffer */
+
+	/* Also used by trace_mmap() as the file offset of the data mapping */
+	__u32		meta_page_size;		/* Size of the meta-page */
+	__u32		meta_struct_len;	/* Len of this struct */
+};
+
+/*
+ * ioctl request issued on the trace_pipe_raw fd by trace_mmap_load_subbuf()
+ * once the current sub-buffer has nothing left to read; reader.id is
+ * re-read from the meta page afterwards.
+ * NOTE(review): presumably this asks the kernel to swap in a new reader
+ * sub-buffer - confirm against the kernel ring buffer mapping docs.
+ */
+#define TRACE_MMAP_IOCTL_GET_READER		_IO('T', 0x1)
+
+/*
+ * Descriptor created by trace_mmap() and released by trace_unmap().
+ * It is handed to callers as an opaque "void *mapping".
+ */
+struct trace_mmap {
+	struct trace_buffer_meta	*map;		/* Mapped meta page */
+	struct kbuffer			*kbuf;		/* kbuffer_dup() of the kbuf passed to trace_mmap() */
+	void				*data;		/* Mapping of all the sub-buffers */
+	int				*data_pages;	/* Set to meta + meta_len; not read in this file */
+	int				fd;		/* The trace_pipe_raw file descriptor */
+	int				last_idx;	/* reader.id at the time of mapping */
+	int				last_read;	/* Not used by the code in this file */
+	int				meta_len;	/* Length of the meta page mapping */
+	int				data_len;	/* Length of the data mapping (subbuf_size * nr_subbufs) */
+};
+
+/**
+ * trace_mmap - try to mmap the ring buffer
+ * @fd: The file descriptor to the trace_pipe_raw file
+ * @kbuf: The kbuffer to load the subbuffer to
+ *
+ * Will try to mmap the ring buffer if it is supported, and
+ * if not, will return NULL, otherwise it returns a descriptor
+ * to handle the mapping.
+ *
+ * On success, @kbuf is loaded with the current reader sub-buffer and
+ * moved past the bytes the meta page reports as already read. The
+ * descriptor keeps its own duplicate of @kbuf and must be released
+ * with trace_unmap().
+ *
+ * Returns NULL if any mapping or allocation fails; everything that was
+ * set up before the failure is released.
+ */
+__hidden void *trace_mmap(int fd, struct kbuffer *kbuf)
+{
+	struct trace_mmap *tmap;
+	int page_size;
+	void *meta;
+	void *data;
+
+	/* Map one page first, the meta page tells how big it really is */
+	page_size = getpagesize();
+	meta = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
+	if (meta == MAP_FAILED)
+		return NULL;
+
+	tmap = calloc(1, sizeof(*tmap));
+	if (!tmap) {
+		munmap(meta, page_size);
+		return NULL;
+	}
+
+	tmap->kbuf = kbuffer_dup(kbuf);
+	if (!tmap->kbuf) {
+		munmap(meta, page_size);
+		free(tmap);
+		/* Must bail out here: both tmap and meta are gone */
+		return NULL;
+	}
+
+	tmap->fd = fd;
+
+	tmap->map = meta;
+	tmap->meta_len = tmap->map->meta_page_size;
+
+	/* The meta page is bigger than one page, map all of it */
+	if (tmap->meta_len > page_size) {
+		munmap(meta, page_size);
+		meta = mmap(NULL, tmap->meta_len, PROT_READ, MAP_SHARED, fd, 0);
+		if (meta == MAP_FAILED) {
+			kbuffer_free(tmap->kbuf);
+			free(tmap);
+			return NULL;
+		}
+		tmap->map = meta;
+	}
+
+	tmap->data_pages = meta + tmap->meta_len;
+
+	tmap->data_len = tmap->map->subbuf_size * tmap->map->nr_subbufs;
+
+	/* The sub-buffers are mapped from the file right after the meta page */
+	tmap->data = mmap(NULL, tmap->data_len, PROT_READ, MAP_SHARED,
+			  fd, tmap->meta_len);
+	if (tmap->data == MAP_FAILED) {
+		munmap(meta, tmap->meta_len);
+		kbuffer_free(tmap->kbuf);
+		free(tmap);
+		return NULL;
+	}
+
+	tmap->last_idx = tmap->map->reader.id;
+
+	data = tmap->data + tmap->map->subbuf_size * tmap->last_idx;
+	kbuffer_load_subbuffer(kbuf, data);
+
+	/*
+	 * The page could have left over data on it that was already
+	 * consumed. Move the "read" forward in that case.
+	 */
+	if (tmap->map->reader.read) {
+		int size = kbuffer_start_of_data(kbuf) + tmap->map->reader.read;
+		char tmpbuf[size];
+		kbuffer_read_buffer(kbuf, tmpbuf, size);
+	}
+
+	return tmap;
+}
+
+/**
+ * trace_unmap - release a descriptor returned by trace_mmap()
+ * @mapping: The descriptor returned by trace_mmap()
+ *
+ * Unmaps the data and the meta page, frees the descriptor's kbuffer
+ * and then the descriptor itself. @mapping must not be used afterward.
+ */
+__hidden void trace_unmap(void *mapping)
+{
+	struct trace_mmap *desc = mapping;
+
+	/* Drop both mappings before freeing what describes them */
+	munmap(desc->data, desc->data_len);
+	munmap(desc->map, desc->meta_len);
+
+	kbuffer_free(desc->kbuf);
+	free(desc);
+}
+
+/* Address in the data mapping of the sub-buffer the meta page names as reader */
+static void *reader_subbuf_addr(struct trace_mmap *tmap)
+{
+	int idx = tmap->map->reader.id;
+
+	return tmap->data + tmap->map->subbuf_size * idx;
+}
+
+/**
+ * trace_mmap_load_subbuf - point a kbuffer at readable mapped data
+ * @mapping: The descriptor returned by trace_mmap()
+ * @kbuf: The kbuffer to load the reader sub-buffer into
+ *
+ * Returns 1 if @kbuf has data to read, 0 if there are no more events
+ * to read, and -1 if the ioctl to get a new reader sub-buffer failed.
+ */
+__hidden int trace_mmap_load_subbuf(void *mapping, struct kbuffer *kbuf)
+{
+	struct trace_mmap *tmap = mapping;
+	void *subbuf = reader_subbuf_addr(tmap);
+
+	/* Only when kbuf already sits on the reader sub-buffer is more work needed */
+	if (subbuf == kbuffer_subbuffer(kbuf)) {
+		/* A write may have added more data to the reader page */
+		kbuffer_refresh(kbuf);
+
+		/* Still something left on this sub-buffer */
+		if (kbuffer_curr_size(kbuf))
+			return 1;
+
+		/* Ask for a new reader page */
+		if (ioctl(tmap->fd, TRACE_MMAP_IOCTL_GET_READER) < 0)
+			return -1;
+
+		subbuf = reader_subbuf_addr(tmap);
+
+		/* Same sub-buffer as before means nothing more to read */
+		if (subbuf == kbuffer_subbuffer(kbuf))
+			return 0;
+	}
+
+	kbuffer_load_subbuffer(kbuf, subbuf);
+	return 1;
+}
+
+/**
+ * trace_mmap_read - copy data from the mapped ring buffer into a buffer
+ * @mapping: The descriptor returned by trace_mmap() (may be NULL)
+ * @buffer: Where to copy the data; up to subbuf_size bytes are requested
+ *
+ * Uses the descriptor's own kbuffer. Returns -1 if @mapping is NULL or
+ * loading the sub-buffer failed, 0 if there are no more events, otherwise
+ * the return value of kbuffer_read_buffer().
+ */
+__hidden int trace_mmap_read(void *mapping, void *buffer)
+{
+	struct trace_mmap *tmap = mapping;
+	int loaded;
+
+	if (!tmap)
+		return -1;
+
+	loaded = trace_mmap_load_subbuf(mapping, tmap->kbuf);
+
+	/* Data is available, hand it over to the caller's buffer */
+	if (loaded > 0)
+		return kbuffer_read_buffer(tmap->kbuf, buffer,
+					   tmap->map->subbuf_size);
+
+	/* Error or nothing more to read */
+	return loaded;
+}
diff --git a/src/tracefs-record.c b/src/tracefs-record.c
index e8be3335070b..f51e18420bc7 100644
--- a/src/tracefs-record.c
+++ b/src/tracefs-record.c
@@ -36,6 +36,7 @@  struct tracefs_cpu {
 	int		splice_read_flags;
 	struct kbuffer	*kbuf;
 	void		*buffer;
+	void		*mapping;
 };
 
 /**
@@ -229,6 +230,31 @@  int tracefs_snapshot_free(struct tracefs_instance *instance)
 	return ret < 0 ? -1 : 0;
 }
 
+/**
+ * tracefs_cpu_open_mapped - open an instance raw trace file and map it
+ * @instance: the instance (NULL for toplevel) of the cpu raw file to open
+ * @cpu: The CPU that the raw trace file is associated with
+ * @nonblock: If true, the file will be opened in O_NONBLOCK mode
+ *
+ * Opens the descriptor with tracefs_cpu_open() and then attempts to
+ * memory map the ring buffer with tracefs_cpu_map(). A failure to map
+ * is not treated as an error; tracefs_cpu_is_mapped() can be used to
+ * see if the mapping took place.
+ *
+ * Returns the descriptor, or NULL if the open itself failed.
+ */
+struct tracefs_cpu *
+tracefs_cpu_open_mapped(struct tracefs_instance *instance, int cpu, bool nonblock)
+{
+	struct tracefs_cpu *tcpu = tracefs_cpu_open(instance, cpu, nonblock);
+
+	/* The result of the mapping attempt is intentionally not checked */
+	if (tcpu)
+		tracefs_cpu_map(tcpu);
+
+	return tcpu;
+}
+
+
 static void close_fd(int fd)
 {
 	if (fd < 0)
@@ -285,6 +311,28 @@  int tracefs_cpu_read_size(struct tracefs_cpu *tcpu)
 	return tcpu->subbuf_size;
 }
 
+/**
+ * tracefs_cpu_is_mapped - check if the ring buffer is memory mapped
+ * @tcpu: The descriptor to check
+ *
+ * Returns true if @tcpu currently holds a mapping, false otherwise.
+ */
+bool tracefs_cpu_is_mapped(struct tracefs_cpu *tcpu)
+{
+	if (tcpu->mapping)
+		return true;
+
+	return false;
+}
+
+/**
+ * tracefs_cpu_map - memory map the ring buffer of a descriptor
+ * @tcpu: The descriptor to map
+ *
+ * Does nothing if @tcpu is already mapped.
+ *
+ * Returns 0 if @tcpu is mapped on return, -1 if the mapping failed.
+ */
+int tracefs_cpu_map(struct tracefs_cpu *tcpu)
+{
+	if (!tcpu->mapping) {
+		tcpu->mapping = trace_mmap(tcpu->fd, tcpu->kbuf);
+		if (!tcpu->mapping)
+			return -1;
+	}
+
+	return 0;
+}
+
+/**
+ * tracefs_cpu_unmap - remove the memory mapping of the ring buffer
+ * @tcpu: The descriptor to unmap
+ *
+ * Does nothing if @tcpu is not mapped. Once unmapped,
+ * tracefs_cpu_is_mapped() returns false and @tcpu may be mapped
+ * again with tracefs_cpu_map().
+ */
+void tracefs_cpu_unmap(struct tracefs_cpu *tcpu)
+{
+	if (!tcpu->mapping)
+		return;
+
+	trace_unmap(tcpu->mapping);
+
+	/* trace_unmap() freed the descriptor, do not leave it dangling */
+	tcpu->mapping = NULL;
+}
+
 static void set_nonblock(struct tracefs_cpu *tcpu)
 {
 	long flags;
@@ -383,6 +431,9 @@  int tracefs_cpu_read(struct tracefs_cpu *tcpu, void *buffer, bool nonblock)
 	if (ret <= 0)
 		return ret;
 
+	if (tcpu->mapping)
+		return trace_mmap_read(tcpu->mapping, buffer);
+
 	ret = read(tcpu->fd, buffer, tcpu->subbuf_size);
 
 	/* It's OK if there's no data to read */
@@ -427,6 +478,16 @@  struct kbuffer *tracefs_cpu_read_buf(struct tracefs_cpu *tcpu, bool nonblock)
 {
 	int ret;
 
+	/* If mapping is enabled, just use it directly */
+	if (tcpu->mapping) {
+		ret = wait_on_input(tcpu, nonblock);
+		if (ret <= 0)
+			return NULL;
+
+		ret = trace_mmap_load_subbuf(tcpu->mapping, tcpu->kbuf);
+		return ret > 0 ? tcpu->kbuf : NULL;
+	}
+
 	if (!get_buffer(tcpu))
 		return NULL;