
libtracefs: Add ring buffer memory mapping APIs

Message ID 20231228201100.78aae259@rorschach.local.home (mailing list archive)
State Superseded
Series libtracefs: Add ring buffer memory mapping APIs

Commit Message

Steven Rostedt Dec. 29, 2023, 1:11 a.m. UTC
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>

Add the following APIs:

 tracefs_cpu_open_mapped()
 tracefs_cpu_is_mapped()
 tracefs_cpu_map()
 tracefs_cpu_unmap()

This allows applications to choose to memory map the tracing ring buffer
when it is supported. Mapping improves the performance of tracefs_cpu_read()
and tracefs_cpu_read_buf(), but it is not enabled by default because it
hurts the performance of tracefs_cpu_buffered_read() and
tracefs_cpu_buffered_read_buf(), as those use splicing, and with the ring
buffer memory mapped, the splice has to do a copy instead of a copyless
sub-buffer move.
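
A minimal sketch of the intended usage (it mirrors the sample program added
by this patch; error handling is trimmed):

	#include <tracefs.h>

	static void read_cpu(int cpu)
	{
		struct tracefs_cpu *tcpu;
		struct kbuffer *kbuf;

		/* Open the per-CPU raw file and try to map the ring buffer */
		tcpu = tracefs_cpu_open_mapped(NULL, cpu, false);
		if (!tcpu)
			return;

		/* Mapped: read directly; unmapped: fall back to splice reads */
		while ((kbuf = tracefs_cpu_is_mapped(tcpu) ?
				tracefs_cpu_read_buf(tcpu, true) :
				tracefs_cpu_buffered_read_buf(tcpu, true))) {
			/* consume the events in kbuf with libtraceevent */
		}

		tracefs_cpu_close(tcpu);
	}

Because tracefs_cpu_open_mapped() still succeeds when the kernel cannot map
the buffer, the tracefs_cpu_is_mapped() check above is what selects the
fallback path.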

Since this change relies on the libtraceevent APIs:

   kbuffer_dup()
   kbuffer_subbuffer()
   kbuffer_refresh()
   kbuffer_read_buffer()

which are only available since version 1.8, raise the minimum required
version to 1.8.

Note, the samples and utest rely on:

   tep_get_sub_buffer_data_size()

which is in 1.8.1.

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 Documentation/libtracefs-cpu-map.txt | 194 +++++++++++++++++++++++++++
 Documentation/libtracefs.txt         |   7 +
 Makefile                             |   3 +-
 include/tracefs-local.h              |   6 +
 include/tracefs.h                    |   7 +
 samples/Makefile                     |   1 +
 src/Makefile                         |   1 +
 src/tracefs-mmap.c                   | 190 ++++++++++++++++++++++++++
 src/tracefs-record.c                 |  61 +++++++++
 9 files changed, 469 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/libtracefs-cpu-map.txt
 create mode 100644 src/tracefs-mmap.c

Comments

Vincent Donnefort Jan. 5, 2024, 9:17 a.m. UTC | #1
[...]

> +EXAMPLE
> +-------
> +[source,c]
> +--
> +#include <stdlib.h>
> +#include <ctype.h>
> +#include <tracefs.h>
> +
> +static void read_page(struct tep_handle *tep, struct kbuffer *kbuf)

read_subbuf?

> +{
> +	static struct trace_seq seq;
> +	struct tep_record record;
> +
> +	if (seq.buffer)
> +		trace_seq_reset(&seq);
> +	else
> +		trace_seq_init(&seq);
> +
> +	while ((record.data = kbuffer_read_event(kbuf, &record.ts))) {
> +		record.size = kbuffer_event_size(kbuf);
> +		kbuffer_next_event(kbuf, NULL);
> +		tep_print_event(tep, &seq, &record,
> +				"%s-%d %9d\t%s: %s\n",
> +				TEP_PRINT_COMM,
> +				TEP_PRINT_PID,
> +				TEP_PRINT_TIME,
> +				TEP_PRINT_NAME,
> +				TEP_PRINT_INFO);
> +		trace_seq_do_printf(&seq);
> +		trace_seq_reset(&seq);
> +	}
> +}
> +

[...]

> +__hidden void *trace_mmap(int fd, struct kbuffer *kbuf)
> +{
> +	struct trace_mmap *tmap;
> +	int page_size;
> +	void *meta;
> +	void *data;
> +
> +	page_size = getpagesize();
> +	meta = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
> +	if (meta == MAP_FAILED)
> +		return NULL;
> +
> +	tmap = calloc(1, sizeof(*tmap));
> +	if (!tmap) {
> +		munmap(meta, page_size);
> +		return NULL;
> +	}
> +
> +	tmap->kbuf = kbuffer_dup(kbuf);
> +	if (!tmap->kbuf) {
> +		munmap(meta, page_size);
> +		free(tmap);
> +	}
> +
> +	tmap->fd = fd;
> +
> +	tmap->map = meta;
> +	tmap->meta_len = tmap->map->meta_page_size;
> +
> +	if (tmap->meta_len > page_size) {
> +		munmap(meta, page_size);
> +		meta = mmap(NULL, tmap->meta_len, PROT_READ, MAP_SHARED, fd, 0);
> +		if (meta == MAP_FAILED) {
> +			kbuffer_free(tmap->kbuf);
> +			free(tmap);
> +			return NULL;
> +		}
> +		tmap->map = meta;
> +	}
> +
> +	tmap->data_pages = meta + tmap->meta_len;
> +
> +	tmap->data_len = tmap->map->subbuf_size * tmap->map->nr_subbufs;
> +
> +	tmap->data = mmap(NULL, tmap->data_len, PROT_READ, MAP_SHARED,
> +			  fd, tmap->meta_len);
> +	if (tmap->data == MAP_FAILED) {
> +		munmap(meta, tmap->meta_len);
> +		kbuffer_free(tmap->kbuf);
> +		free(tmap);
> +		return NULL;
> +	}
> +
> +	tmap->last_idx = tmap->map->reader.id;
> +
> +	data = tmap->data + tmap->map->subbuf_size * tmap->last_idx;
> +	kbuffer_load_subbuffer(kbuf, data);

Could it fast-forward through the events up to tmap->map->reader.read? So we
don't read the same events again.

Something like

  while (kbuf->curr < tmap->map->reader.read)
  	kbuffer_next_event(kbuf, NULL);

> +
> +	return tmap;
> +}
> +
> +__hidden void trace_unmap(void *mapping)
> +{
> +	struct trace_mmap *tmap = mapping;
> +
> +	munmap(tmap->data, tmap->data_len);
> +	munmap(tmap->map, tmap->meta_len);
> +	kbuffer_free(tmap->kbuf);
> +	free(tmap);
> +}
> +
> +__hidden int trace_mmap_load_subbuf(void *mapping, struct kbuffer *kbuf)
> +{
> +	struct trace_mmap *tmap = mapping;
> +	void *data;
> +	int id;
> +
> +	id = tmap->map->reader.id;
> +	data = tmap->data + tmap->map->subbuf_size * id;
> +
> +	/*
> +	 * If kbuf doesn't point to the current sub-buffer
> +	 * just load it and return.
> +	 */
> +	if (data != kbuffer_subbuffer(kbuf)) {
> +		kbuffer_load_subbuffer(kbuf, data);
> +		return 1;
> +	}
> +
> +	/*
> +	 * Perhaps the reader page had a write that added
> +	 * more data.
> +	 */
> +	kbuffer_refresh(kbuf);
> +
> +	/* Are there still events to read? */
> +	if (kbuffer_curr_size(kbuf))
> +		return 1;

That does not seem to be enough: only kbuf->size is updated in
kbuffer_refresh(), while kbuffer_curr_size() is next - curr.

> +
> +	/* See if a new page is ready? */
> +	if (ioctl(tmap->fd, TRACE_MMAP_IOCTL_GET_READER) < 0)
> +		return -1;

Maybe this ioctl should be called regardless of whether events are found on the
current reader page. This would at least update the reader->read field and make
sure subsequent readers do not get the same events we already had here?

> +	id = tmap->map->reader.id;
> +	data = tmap->data + tmap->map->subbuf_size * id;
> +

[...]
Steven Rostedt Jan. 5, 2024, 1:41 p.m. UTC | #2
On Fri, 5 Jan 2024 09:17:53 +0000
Vincent Donnefort <vdonnefort@google.com> wrote:

> [...]
> 
> > +EXAMPLE
> > +-------
> > +[source,c]
> > +--
> > +#include <stdlib.h>
> > +#include <ctype.h>
> > +#include <tracefs.h>
> > +
> > +static void read_page(struct tep_handle *tep, struct kbuffer *kbuf)  
> 
> read_subbuf?

Heh, sure. That was copied from the original test I had.

> 
> > +{
> > +	static struct trace_seq seq;
> > +	struct tep_record record;
> > +
> > +	if (seq.buffer)
> > +		trace_seq_reset(&seq);
> > +	else
> > +		trace_seq_init(&seq);
> > +
> > +	while ((record.data = kbuffer_read_event(kbuf, &record.ts))) {
> > +		record.size = kbuffer_event_size(kbuf);
> > +		kbuffer_next_event(kbuf, NULL);
> > +		tep_print_event(tep, &seq, &record,
> > +				"%s-%d %9d\t%s: %s\n",
> > +				TEP_PRINT_COMM,
> > +				TEP_PRINT_PID,
> > +				TEP_PRINT_TIME,
> > +				TEP_PRINT_NAME,
> > +				TEP_PRINT_INFO);
> > +		trace_seq_do_printf(&seq);
> > +		trace_seq_reset(&seq);
> > +	}
> > +}
> > +  
> 
> [...]
> 
> > +__hidden void *trace_mmap(int fd, struct kbuffer *kbuf)
> > +{
> > +	struct trace_mmap *tmap;
> > +	int page_size;
> > +	void *meta;
> > +	void *data;
> > +
> > +	page_size = getpagesize();
> > +	meta = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
> > +	if (meta == MAP_FAILED)
> > +		return NULL;
> > +
> > +	tmap = calloc(1, sizeof(*tmap));
> > +	if (!tmap) {
> > +		munmap(meta, page_size);
> > +		return NULL;
> > +	}
> > +
> > +	tmap->kbuf = kbuffer_dup(kbuf);
> > +	if (!tmap->kbuf) {
> > +		munmap(meta, page_size);
> > +		free(tmap);
> > +	}
> > +
> > +	tmap->fd = fd;
> > +
> > +	tmap->map = meta;
> > +	tmap->meta_len = tmap->map->meta_page_size;
> > +
> > +	if (tmap->meta_len > page_size) {
> > +		munmap(meta, page_size);
> > +		meta = mmap(NULL, tmap->meta_len, PROT_READ, MAP_SHARED, fd, 0);
> > +		if (meta == MAP_FAILED) {
> > +			kbuffer_free(tmap->kbuf);
> > +			free(tmap);
> > +			return NULL;
> > +		}
> > +		tmap->map = meta;
> > +	}
> > +
> > +	tmap->data_pages = meta + tmap->meta_len;
> > +
> > +	tmap->data_len = tmap->map->subbuf_size * tmap->map->nr_subbufs;
> > +
> > +	tmap->data = mmap(NULL, tmap->data_len, PROT_READ, MAP_SHARED,
> > +			  fd, tmap->meta_len);
> > +	if (tmap->data == MAP_FAILED) {
> > +		munmap(meta, tmap->meta_len);
> > +		kbuffer_free(tmap->kbuf);
> > +		free(tmap);
> > +		return NULL;
> > +	}
> > +
> > +	tmap->last_idx = tmap->map->reader.id;
> > +
> > +	data = tmap->data + tmap->map->subbuf_size * tmap->last_idx;
> > +	kbuffer_load_subbuffer(kbuf, data);  
> 
> Could it fast-forward to the event until tmap->map->reader.read? So we don't
> read again the same events.
> 
> Something like
> 
>   while (kbuf->curr < tmap->map->reader.read)
>   	kbuffer_next_event(kbuf, NULL);

Note that kbuf->curr is not available to libtracefs. That's internal to
libtraceevent.

But we could have something like:

	kbuffer_load_subbuffer(kbuf, data);

	/* Update to kbuf index to the next read */
	if (tmap->map->reader.read) {
		char tmpbuf[tmap->map->reader.read];
		kbuffer_read_buffer(kbuf, tmpbuf, tmap->map->reader.read);
	}

Which should move the kbuf->curr to reader.read.


> 
> > +
> > +	return tmap;
> > +}
> > +
> > +__hidden void trace_unmap(void *mapping)
> > +{
> > +	struct trace_mmap *tmap = mapping;
> > +
> > +	munmap(tmap->data, tmap->data_len);
> > +	munmap(tmap->map, tmap->meta_len);
> > +	kbuffer_free(tmap->kbuf);
> > +	free(tmap);
> > +}
> > +
> > +__hidden int trace_mmap_load_subbuf(void *mapping, struct kbuffer *kbuf)
> > +{
> > +	struct trace_mmap *tmap = mapping;
> > +	void *data;
> > +	int id;
> > +
> > +	id = tmap->map->reader.id;
> > +	data = tmap->data + tmap->map->subbuf_size * id;
> > +
> > +	/*
> > +	 * If kbuf doesn't point to the current sub-buffer
> > +	 * just load it and return.
> > +	 */
> > +	if (data != kbuffer_subbuffer(kbuf)) {
> > +		kbuffer_load_subbuffer(kbuf, data);
> > +		return 1;
> > +	}
> > +
> > +	/*
> > +	 * Perhaps the reader page had a write that added
> > +	 * more data.
> > +	 */
> > +	kbuffer_refresh(kbuf);
> > +
> > +	/* Are there still events to read? */
> > +	if (kbuffer_curr_size(kbuf))
> > +		return 1;  
> 
> It does not seem to be enough, only kbuf->size is updated in kbuffer_refresh()
> while kbuffer_curr_size is next - cur.

That's the size of the event, not what's on the buffer. Next just points to
the end of the current event. Hmm, or you mean that we read the last event
and something else was added? Yeah, should check if curr == size, then next
would need to be updated. That's a bug in kbuffer_refresh().

> 
> > +
> > +	/* See if a new page is ready? */
> > +	if (ioctl(tmap->fd, TRACE_MMAP_IOCTL_GET_READER) < 0)
> > +		return -1;  
> 
> Maybe this ioctl should be called regardless if events are found on the current
> reader page. This would at least update the reader->read field and make sure
> subsequent readers are not getting the same events we already had here?

If we call the ioctl() before we are finished reading events, the events on
the reader page will be discarded and added back to the writer buffer if
the writer is not on the reader page.

And there should only be one reader accessing the map. This is not thread
safe. Once you load the subbuffer into kbuf, kbuf handles what was read. We
don't want to use the reader page for that.

If something is reading the buffer outside this application, it's fine if
we read the same events. Multiple readers of the same buffer already screw
things up today. That's why I created instances.

-- Steve


> 
> > +	id = tmap->map->reader.id;
> > +	data = tmap->data + tmap->map->subbuf_size * id;
> > +  
> 
> [...]
Vincent Donnefort Jan. 5, 2024, 2:25 p.m. UTC | #3
[...]

> > > +	/*
> > > +	 * Perhaps the reader page had a write that added
> > > +	 * more data.
> > > +	 */
> > > +	kbuffer_refresh(kbuf);
> > > +
> > > +	/* Are there still events to read? */
> > > +	if (kbuffer_curr_size(kbuf))
> > > +		return 1;  
> > 
> > It does not seem to be enough, only kbuf->size is updated in kbuffer_refresh()
> > while kbuffer_curr_size is next - cur.
> 
> That's the size of the event, not what's on the buffer. Next just points to
> the end of the current event. Hmm, or you mean that we read the last event
> and something else was added? Yeah, should check if curr == size, then next
> would need to be updated. That's a bug in kbuffer_refresh().

Yeah, probably kbuffer_refresh should also update kbuf->next and not only
kbuf->size.
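
A sketch of the failure mode under discussion, using only the public kbuffer
API (the helper below is hypothetical and not part of the patch; the actual
fix belongs inside libtraceevent's kbuffer_refresh(), and the include path
assumes the usual traceevent/ install prefix):

	#include <traceevent/kbuffer.h>

	/* Returns nonzero if new events are visible after a refresh. */
	static int has_new_events_after_refresh(struct kbuffer *kbuf)
	{
		unsigned long long ts;

		/* Drain everything parsed so far: curr == next == size. */
		while (kbuffer_read_event(kbuf, &ts))
			kbuffer_next_event(kbuf, NULL);

		/* The writer may have appended to the same reader page. */
		kbuffer_refresh(kbuf);

		/*
		 * With the bug, this can still report 0: only kbuf->size
		 * was updated, and "next" was not re-parsed even though
		 * curr had reached the old size.
		 */
		return kbuffer_curr_size(kbuf) != 0;
	}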

> 
> > 
> > > +
> > > +	/* See if a new page is ready? */
> > > +	if (ioctl(tmap->fd, TRACE_MMAP_IOCTL_GET_READER) < 0)
> > > +		return -1;  
> > 
> > Maybe this ioctl should be called regardless if events are found on the current
> > reader page. This would at least update the reader->read field and make sure
> > subsequent readers are not getting the same events we already had here?
> 
> If we call the ioctl() before we are finished reading events, the events on
> the reader page will be discarded and added back to the writer buffer if
> the writer is not on the reader page.

Hum, I am a bit confused here. If the writer is not on the reader page, then
there's no way a writer could overwrite these events while we access them?

> 
> And there should only be one reader accessing the map. This is not thread
> safe. Once you load the subbuffer into kbuf, kbuf handles what was read. We
> don't want to use the reader page for that.

I had in mind a scenario with two sequential reads. In that case, only the
reader page read could help to "save" what has been read so far.

Currently, we have:

 <event A>
 ./cpu-map
   print event A

 <event B>
 ./cpu-map
   print event A 
   print event B

> 
> If something is reading the buffer outside this application, it's fine if
> we read the same events. Multiple readers of the same buffer already screw
> things up today. That's why I created instances.
> 
> -- Steve
> 
> 
> > 
> > > +	id = tmap->map->reader.id;
> > > +	data = tmap->data + tmap->map->subbuf_size * id;
> > > +  
> > 
> > [...]
>
Steven Rostedt Jan. 5, 2024, 5:31 p.m. UTC | #4
On Fri, 5 Jan 2024 14:25:15 +0000
Vincent Donnefort <vdonnefort@google.com> wrote:

> > > Maybe this ioctl should be called regardless if events are found on the current
> > > reader page. This would at least update the reader->read field and make sure
> > > subsequent readers are not getting the same events we already had here?  
> > 
> > If we call the ioctl() before we are finished reading events, the events on
> > the reader page will be discarded and added back to the writer buffer if
> > the writer is not on the reader page.  
> 
> Hum, I am a bit confused here. If the writer is not on the reader page, then
> there's no way a writer could overwrite these events while we access them?

Maybe I'm confused. Where do you want to call the ioctl again? If the
writer is off the reader page and we call the ioctl() when no new events
were added since our last read, the ioctl() will advance the reader page,
will it not? That is, any events that were on the previous reader page that
we did not read yet are lost.

The trace_mmap_load_subbuf() is called to update the reader page. If for
some reason the application hasn't read all the data in the kbuf and calls
tracefs_cpu_read_buf() again, it should still return the same kbuf, but
updated.

 kbuf = tracefs_cpu_read_buf() {
    ret = trace_mmap_load_subbuf(tcpu->mapping, tcpu->kbuf) {
        if (data != kbuffer_subbuffer(kbuf)) {
            kbuffer_load_subbuffer(kbuf, data);
            return 1;
        }
     return ret > 0 ? tcpu->kbuf : NULL;
 }

 record.data = kbuffer_read_event(kbuf, &record.ts);
 kbuffer_next_event(kbuf, NULL);

 kbuf = tracefs_cpu_read_buf() {
    ret = trace_mmap_load_subbuf(tcpu->mapping, tcpu->kbuf) {
        kbuffer_refresh(kbuf);
        /* Are there still events to read? */
        if (kbuffer_curr_size(kbuf))
            return 1;

The above should return because the application has not read all of the
kbuf yet. This is perfectly fine for the application to do this.

If we don't return here, but instead do:

        ioctl(tmap->fd, TRACE_MMAP_IOCTL_GET_READER);

The buffer that kbuf points to will be swapped out with a new page. And the
data on the buffer is lost.

Hmm, but looking at how the code currently works without mapping, it looks
like it would do the read again anyway assuming that the kbuf was
completely read before reading again. Perhaps it would make the logic
simpler to assume that the buffer was completely read before this gets
called again. That is, we could have it do:

	if (data != kbuffer_subbuffer(kbuf)) {
		kbuffer_load_subbuffer(kbuf, data);
		return 1;
	}

	if (ioctl(tmap->fd, TRACE_MMAP_IOCTL_GET_READER) < 0)
		return -1;

	if (id != tmap->map->reader.id) {
		/* New page, just reload it */
		data = tmap->data + tmap->map->subbuf_size * id;
		kbuffer_load_subbuffer(kbuf, data);
		return 1;
	}

	/*
	 * Perhaps the reader page had a write that added
	 * more data.
	 */
	kbuffer_refresh(kbuf);

	/* Are there still events to read? */
	return kbuffer_curr_size(kbuf) ? 1 : 0;


> 
> > 
> > And there should only be one reader accessing the map. This is not thread
> > safe. Once you load the subbuffer into kbuf, kbuf handles what was read. We
> > don't want to use the reader page for that.  
> 
> I had in mind a scenario with two sequential read. In that case only the reader
> page read could help to "save" what has been read so far.
> 
> Currently, we have:
> 
>  <event A>
>  ./cpu-map
>    print event A
> 
>  <event B>
>  ./cpu-map
>    print event A 
>    print event B

The kbuffer keeps track of what was read from it, so I'm still confused by
what you mean.

-- Steve
Vincent Donnefort Jan. 5, 2024, 6:23 p.m. UTC | #5
On Fri, Jan 05, 2024 at 12:31:11PM -0500, Steven Rostedt wrote:
> On Fri, 5 Jan 2024 14:25:15 +0000
> Vincent Donnefort <vdonnefort@google.com> wrote:
> 
> > > > Maybe this ioctl should be called regardless if events are found on the current
> > > > reader page. This would at least update the reader->read field and make sure
> > > > subsequent readers are not getting the same events we already had here?  
> > > 
> > > If we call the ioctl() before we are finished reading events, the events on
> > > the reader page will be discarded and added back to the writer buffer if
> > > the writer is not on the reader page.  
> > 
> > Hum, I am a bit confused here. If the writer is not on the reader page, then
> > there's no way a writer could overwrite these events while we access them?
> 
> Maybe I'm confused. Where do you want to call the ioctl again? If the
> writer is off the reader page and we call the ioctl() where no new events
> were added since our last read, the ioctl() will advance the reader page,
> will it not? That is, any events that were on the previous reader page that
> we did not read yet is lost.

I meant like the snippet you shared below.

If there are "unread" events on the reader page, the ioctl will simply return
the same reader page, as long as there is something to read. It is then safe to
call the ioctl unconditionally.

> 
> The trace_mmap_load_subbuf() is called to update the reader page. If for
> some reason the kbuf hasn't read all the data and calls the
> tracefs_cpu_read_buf() again, it should still return the same kbuf, but
> updated.
> 
>  kbuf = tracefs_cpu_read_buf() {
>     ret = trace_mmap_load_subbuf(tcpu->mapping, tcpu->kbuf) {
>         if (data != kbuffer_subbuffer(kbuf)) {
>             kbuffer_load_subbuffer(kbuf, data);
>             return 1;
>         }
>      return ret > 0 ? tcpu->kbuf : NULL;
>  }
> 
>  record.data = kbuffer_read_event(kbuf, &record.ts);
>  kbuffer_next_event(kbuf, NULL);
> 
>  kbuf = tracefs_cpu_read_buf() {
>     ret = trace_mmap_load_subbuf(tcpu->mapping, tcpu->kbuf) {
>         kbuffer_refresh(kbuf);
>         /* Are there still events to read? */
>         if (kbuffer_curr_size(kbuf))
>             return 1;
> 
> The above should return because the application has not read all of the
> kbuf yet. This is perfectly fine for the application to do this.
> 
> If we don't return here, but instead do:
> 
>         ioctl(tmap->fd, TRACE_MMAP_IOCTL_GET_READER);
> 
> The buffer that kbuf points to will be swapped out with a new page. And the
> data on the buffer is lost.

It shouldn't: the same reader page will be returned; only the reader_page->read
pointer would be updated.

> 
> Hmm, but looking at how the code currently works without mapping, it looks
> like it would do the read again anyway assuming that the kbuf was
> completely read before reading again. Perhaps it would make the logic
> simpler to assume that the buffer was completely read before this gets
> called again. That is, we could have it do:
> 
> 	if (data != kbuffer_subbuffer(kbuf)) {
> 		kbuffer_load_subbuffer(kbuf, data);
> 		return 1;
> 	}
> 
> 	if (ioctl(tmap->fd, TRACE_MMAP_IOCTL_GET_READER) < 0)
> 		return -1;
> 
> 	if (id != tmap->map->reader.id) {
> 		/* New page, just reload it */
> 		data = tmap->data + tmap->map->subbuf_size * id;
> 		kbuffer_load_subbuffer(kbuf, data);
> 		return 1;
> 	}
> 
> 	/*
> 	 * Perhaps the reader page had a write that added
> 	 * more data.
> 	 */
> 	kbuffer_refresh(kbuf);
> 
> 	/* Are there still events to read? */
> 	return kbuffer_curr_size(kbuf) ? 1 : 0;
>

That sounds good

> 
> > 
> > > 
> > > And there should only be one reader accessing the map. This is not thread
> > > safe. Once you load the subbuffer into kbuf, kbuf handles what was read. We
> > > don't want to use the reader page for that.  
> > 
> > I had in mind a scenario with two sequential read. In that case only the reader
> > page read could help to "save" what has been read so far.
> > 
> > Currently, we have:
> > 
> >  <event A>
> >  ./cpu-map
> >    print event A
> > 
> >  <event B>
> >  ./cpu-map
> >    print event A 
> >    print event B
> 
> The kbuffer keeps track of what was read from it, so I'm still confused by
> what you mean.

We probably want to keep track of what has already been consumed outside of
the remit of a single libtraceevent execution.

But now with the code snippet above, in addition to the fast-forward when the
kbuf is first loaded (kbuffer_read_buffer(kbuf, tmpbuf, tmap->map->reader.read)), 
we shouldn't have that problem anymore.

  <event A>
  $ ./cpu-map
    print event A
 
  <event B>
  $ ./cpu-map
    print event B

Or even

  <event A>
  $ ./cpu-map
    print event A
 
  <event B>
  $ cat /sys/kernel/tracing/trace_pipe
    print event B
> 
> -- Steve
Steven Rostedt Jan. 5, 2024, 7 p.m. UTC | #6
On Fri, 5 Jan 2024 18:23:57 +0000
Vincent Donnefort <vdonnefort@google.com> wrote:

> On Fri, Jan 05, 2024 at 12:31:11PM -0500, Steven Rostedt wrote:
> > On Fri, 5 Jan 2024 14:25:15 +0000
> > Vincent Donnefort <vdonnefort@google.com> wrote:
> >   
> > > > > Maybe this ioctl should be called regardless if events are found on the current
> > > > > reader page. This would at least update the reader->read field and make sure
> > > > > subsequent readers are not getting the same events we already had here?    
> > > > 
> > > > If we call the ioctl() before we are finished reading events, the events on
> > > > the reader page will be discarded and added back to the writer buffer if
> > > > the writer is not on the reader page.    
> > > 
> > > Hum, I am a bit confused here. If the writer is not on the reader page, then
> > > there's no way a writer could overwrite these events while we access them?  
> > 
> > Maybe I'm confused. Where do you want to call the ioctl again? If the
> > writer is off the reader page and we call the ioctl() where no new events
> > were added since our last read, the ioctl() will advance the reader page,
> > will it not? That is, any events that were on the previous reader page that
> > we did not read yet is lost.  
> 
> I meant like the snippet you shared below.
> 
> If on a reader page, there are "unread" events, the ioctl will simply return the
> same reader page, as long as there is something to read. It is then safe to call
> the ioctl unconditionally.

There are only unread events the first time we read the CPU buffer after
mapping. After that, every ioctl() will give a new page if the writer is not
currently on it.

int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu)
{
	struct ring_buffer_per_cpu *cpu_buffer;
	unsigned long reader_size;
	unsigned long flags;

	cpu_buffer = rb_get_mapped_buffer(buffer, cpu);
	if (IS_ERR(cpu_buffer))
		return (int)PTR_ERR(cpu_buffer);

	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
consume:
	if (rb_per_cpu_empty(cpu_buffer))
		goto out;

	reader_size = rb_page_size(cpu_buffer->reader_page);

	/*
	 * There are data to be read on the current reader page, we can
	 * return to the caller. But before that, we assume the latter will read
	 * everything. Let's update the kernel reader accordingly.
	 */
	if (cpu_buffer->reader_page->read < reader_size) {
		while (cpu_buffer->reader_page->read < reader_size)
			rb_advance_reader(cpu_buffer);

// This updates the reader_page to the size of the page.
// This means that the next time this is called, it will swap the page.

		goto out;
	}

	if (WARN_ON(!rb_get_reader_page(cpu_buffer)))
		goto out;

	goto consume;
out:
	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
	rb_put_mapped_buffer(cpu_buffer);

	return 0;
}


> 
> > 
> > The trace_mmap_load_subbuf() is called to update the reader page. If for
> > some reason the kbuf hasn't read all the data and calls the
> > tracefs_cpu_read_buf() again, it should still return the same kbuf, but
> > updated.
> > 
> >  kbuf = tracefs_cpu_read_buf() {
> >     ret = trace_mmap_load_subbuf(tcpu->mapping, tcpu->kbuf) {
> >         if (data != kbuffer_subbuffer(kbuf)) {
> >             kbuffer_load_subbuffer(kbuf, data);
> >             return 1;
> >         }
> >      return ret > 0 ? tcpu->kbuf : NULL;
> >  }
> > 
> >  record.data = kbuffer_read_event(kbuf, &record.ts);
> >  kbuffer_next_event(kbuf, NULL);
> > 
> >  kbuf = tracefs_cpu_read_buf() {
> >     ret = trace_mmap_load_subbuf(tcpu->mapping, tcpu->kbuf) {
> >         kbuffer_refresh(kbuf);
> >         /* Are there still events to read? */
> >         if (kbuffer_curr_size(kbuf))
> >             return 1;
> > 
> > The above should return because the application has not read all of the
> > kbuf yet. This is perfectly fine for the application to do this.
> > 
> > If we don't return here, but instead do:
> > 
> >         ioctl(tmap->fd, TRACE_MMAP_IOCTL_GET_READER);
> > 
> > The buffer that kbuf points to will be swapped out with a new page. And the
> > data on the buffer is lost.  
> 
> It shouldn't, the same reader page will be returned, only the reader_page->read
> pointer would be updated.

But after the first read of the page, the reader_page->read is at the end,
and the next ioctl() will swap the page.

> 
> > 
> > Hmm, but looking at how the code currently works without mapping, it looks
> > like it would do the read again anyway assuming that the kbuf was
> > completely read before reading again. Perhaps it would make the logic
> > simpler to assume that the buffer was completely read before this gets
> > called again. That is, we could have it do:
> > 
> > 	if (data != kbuffer_subbuffer(kbuf)) {
> > 		kbuffer_load_subbuffer(kbuf, data);
> > 		return 1;
> > 	}
> > 
> > 	if (ioctl(tmap->fd, TRACE_MMAP_IOCTL_GET_READER) < 0)
> > 		return -1;
> > 
> > 	if (id != tmap->map->reader.id) {
> > 		/* New page, just reload it */
> > 		data = tmap->data + tmap->map->subbuf_size * id;
> > 		kbuffer_load_subbuffer(kbuf, data);
> > 		return 1;
> > 	}
> > 
> > 	/*
> > 	 * Perhaps the reader page had a write that added
> > 	 * more data.
> > 	 */
> > 	kbuffer_refresh(kbuf);
> > 
> > 	/* Are there still events to read? */
> > 	return kbuffer_curr_size(kbuf) ? 1 : 0;
> >  
> 
> That sounds good
> 
> >   
> > >   
> > > > 
> > > > And there should only be one reader accessing the map. This is not thread
> > > > safe. Once you load the subbuffer into kbuf, kbuf handles what was read. We
> > > > don't want to use the reader page for that.    
> > > 
> > > I had in mind a scenario with two sequential read. In that case only the reader
> > > page read could help to "save" what has been read so far.
> > > 
> > > Currently, we have:
> > > 
> > >  <event A>
> > >  ./cpu-map
> > >    print event A
> > > 
> > >  <event B>
> > >  ./cpu-map
> > >    print event A 
> > >    print event B  
> > 
> > The kbuffer keeps track of what was read from it, so I'm still confused by
> > what you mean.  
> 
> We probably want to keep track of what has already been consumed outside of
> the remits of a single libtraceevent execution.
> 
> But now with the code snippet above, in addition to the fast-forward when the
> kbuf is first loaded (kbuffer_read_buffer(kbuf, tmpbuf, tmap->map->reader.read)), 
> we shouldn't have that problem anymore.
> 
>   <event A>
>   $ ./cpu-map
>     print event A
>  
>   <event B>
>   $ ./cpu-map
>     print event B

I think that should be on the kernel side. The first mapping should update
the read field of the meta page and then update the reader_page so that the
next mapping will give something different.

> 
> Or even
> 
>   <event A>
>   $ ./cpu-map
>     print event A
>  
>   <event B>
>   $ cat /sys/kernel/tracing/trace_pipe
>     print event B

So this is a kernel change, not a library one.

-- Steve
Steven Rostedt Jan. 5, 2024, 8:11 p.m. UTC | #7
On Fri, 5 Jan 2024 12:31:11 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> Hmm, but looking at how the code currently works without mapping, it looks
> like it would do the read again anyway assuming that the kbuf was
> completely read before reading again. Perhaps it would make the logic
> simpler to assume that the buffer was completely read before this gets
> called again. That is, we could have it do:

Now I remember why I did it the other way. This is called to get the
kbuffer after it is mapped. That is, the first time we call this,
kbuf->curr will be zero, and we do not want to call the ioctl(). If we do,
we would just throw away all the events currently loaded on the kbuf.

That is, the code does this:


        tcpu = tracefs_cpu_open_mapped(NULL, cpu, 0);

// Here the memory is already mapped, and a kbuf is loaded.

        mapped = tracefs_cpu_is_mapped(tcpu);
        if (!mapped)
                printf("Was not able to map, falling back to buffered read\n");
        while ((kbuf = mapped ? tracefs_cpu_read_buf(tcpu, true) :

// The tracefs_cpu_read_buf() will call the trace_mmap_load_subbuf().
// That is, it must check if kbuf has left over data and return that.
// If it does the ioctl() here, it will throw out what it loaded in the
// initial mapping.


                        tracefs_cpu_buffered_read_buf(tcpu, true))) {
                read_page(tep, kbuf);
        }



-- Steve


> 
> 	if (data != kbuffer_subbuffer(kbuf)) {
> 		kbuffer_load_subbuffer(kbuf, data);
> 		return 1;
> 	}
> 
> 	if (ioctl(tmap->fd, TRACE_MMAP_IOCTL_GET_READER) < 0)
> 		return -1;
> 
> 	if (id != tmap->map->reader.id) {
> 		/* New page, just reload it */
> 		data = tmap->data + tmap->map->subbuf_size * id;
> 		kbuffer_load_subbuffer(kbuf, data);
> 		return 1;
> 	}
> 
> 	/*
> 	 * Perhaps the reader page had a write that added
> 	 * more data.
> 	 */
> 	kbuffer_refresh(kbuf);
> 
> 	/* Are there still events to read? */
> 	return kbuffer_curr_size(kbuf) ? 1 : 0;
Steven Rostedt Jan. 5, 2024, 8:22 p.m. UTC | #8
On Fri, 5 Jan 2024 08:41:00 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> > Could it fast-forward to the event until tmap->map->reader.read? So we don't
> > read again the same events.
> > 
> > Something like
> > 
> >   while (kbuf->curr < tmap->map->reader.read)
> >   	kbuffer_next_event(kbuf, NULL);  
> 
> Note that kbuf->curr is not available to libtracefs. That's internal to
> libtraceevent.
> 
> But we could have something like:
> 
> 	kbuffer_load_subbuffer(kbuf, data);
> 
> 	/* Update to kbuf index to the next read */
> 	if (tmap->map->reader.read) {
> 		char tmpbuf[tmap->map->reader.read];
> 		kbuffer_read_buffer(kbuf, tmpbuf, tmap->map->reader.read);
> 	}
> 
> Which should move the kbuf->curr to reader.read.

So, I tried out this:

(compiled the sample code into /tmp/map)

 # taskset -c 0 echo hello > /sys/kernel/tracing/trace_marker
 # /tmp/map 0
<...>-3767  6659.560129846	print: tracing_mark_write: hello

 # taskset -c 0 echo hello 2 > /sys/kernel/tracing/trace_marker
 # /tmp/map 0
<...>-3767  6659.560129846	print: tracing_mark_write: hello

<...>-3769  6669.280279823	print: tracing_mark_write: hello 2


So it reported both "hello" and "hello 2", but it should have reported only "hello 2".

I added this:

@@ -111,6 +112,16 @@ __hidden void *trace_mmap(int fd, struct kbuffer *kbuf)
        data = tmap->data + tmap->map->subbuf_size * tmap->last_idx;
        kbuffer_load_subbuffer(kbuf, data);
 
+       /*
+        * The page could have left over data on it that was already
+        * consumed. Move the "read" forward in that case.
+        */
+       if (tmap->map->reader.read) {
+               int size = kbuffer_start_of_data(kbuf) + tmap->map->reader.read;
+               char tmpbuf[size];
+               kbuffer_read_buffer(kbuf, tmpbuf, size);
+       }
+
        return tmap;
 }

And now I get:

 # taskset -c 0 echo hello > /sys/kernel/tracing/trace_marker
 # /tmp/map 0
<...>-3938  6820.503525176	print: tracing_mark_write: hello

 # taskset -c 0 echo hello 2 > /sys/kernel/tracing/trace_marker
 # /tmp/map 0
<...>-3940  6827.655627086	print: tracing_mark_write: hello 2

The way you were expecting.

[
 Note the above didn't work until I applied to libtraceevent:

  https://lore.kernel.org/all/20240105194015.253165-3-rostedt@goodmis.org/
]

I'll send a v2.

-- Steve.

Patch

diff --git a/Documentation/libtracefs-cpu-map.txt b/Documentation/libtracefs-cpu-map.txt
new file mode 100644
index 000000000000..9bcd29715795
--- /dev/null
+++ b/Documentation/libtracefs-cpu-map.txt
@@ -0,0 +1,194 @@ 
+libtracefs(3)
+=============
+
+NAME
+----
+tracefs_cpu_open_mapped, tracefs_cpu_is_mapped, tracefs_cpu_map, tracefs_cpu_unmap - Memory mapping of the ring buffer
+
+SYNOPSIS
+--------
+[verse]
+--
+*#include <tracefs.h>*
+
+bool *tracefs_cpu_is_mapped*(struct tracefs_cpu pass:[*]tcpu);
+int *tracefs_cpu_map*(struct tracefs_cpu pass:[*]tcpu);
+void *tracefs_cpu_unmap*(struct tracefs_cpu pass:[*]tcpu);
+struct tracefs_cpu pass:[*]*tracefs_cpu_open_mapped*(struct tracefs_instance pass:[*]instance,
+					    int cpu, bool nonblock);
+--
+
+DESCRIPTION
+-----------
+If the trace_pipe_raw file supports memory mapping, this is usually a more
+efficient method to stream data from the kernel ring buffer than reading it,
+as it does not require copying the memory that is being read.
+
+If memory mapping is supported by the kernel and the application asks to use the
+memory mapping via either *tracefs_cpu_map()* or by *tracefs_cpu_open_mapped()*
+then the functions *tracefs_cpu_read*(3) and *tracefs_cpu_read_buf*(3) will use
+the mapping directly instead of calling the read system call.
+
+Note, mapping can also slow down *tracefs_cpu_buffered_read*(3) and
+*tracefs_cpu_buffered_read_buf*(3), as those use splice piping, and when the
+kernel ring buffer is memory mapped, splice does a copy instead of using the
+ring buffer directly. Thus care must be taken when deciding whether or not to
+map the ring buffer, which is why it is not mapped by default.
+
+The *tracefs_cpu_is_mapped()* function will return true if _tcpu_ currently has
+its ring buffer memory mapped and false otherwise. This does not report whether
+the kernel supports memory mapping, but that can usually be determined by
+calling *tracefs_cpu_map()*.
+
+The *tracefs_cpu_map()* function will attempt to map the ring buffer associated
+with _tcpu_ if it is not already mapped.
+
+The *tracefs_cpu_unmap()* function will unmap the ring buffer associated with
+_tcpu_ if it is mapped.
+
+*tracefs_cpu_open_mapped()* is equivalent to calling *tracefs_cpu_open*(3) followed
+by *tracefs_cpu_map()* on the _tcpu_ returned by *tracefs_cpu_open*(3). Note, this
+will still succeed if the mapping fails, in which case it acts the same as
+*tracefs_cpu_open*(3). To know whether or not the mapping succeeded,
+*tracefs_cpu_is_mapped()* should be called on the returned _tcpu_.
+
+RETURN VALUE
+------------
+*tracefs_cpu_is_mapped()* returns true if the given _tcpu_ has its ring buffer
+memory mapped or false otherwise.
+
+*tracefs_cpu_map()* returns 0 on success and -1 on error. If it returns 0,
+then *tracefs_cpu_is_mapped()* will return true afterward; if the mapping
+failed, *tracefs_cpu_is_mapped()* will return false.
+
+*tracefs_cpu_open_mapped()* returns an allocated tracefs_cpu on successful
+creation, regardless of whether it succeeded in mapping the ring buffer. It
+returns NULL for the same reasons *tracefs_cpu_open*(3) returns NULL. To know
+whether the mapping succeeded, call *tracefs_cpu_is_mapped()* afterward.
+
+EXAMPLE
+-------
+[source,c]
+--
+#include <stdlib.h>
+#include <ctype.h>
+#include <tracefs.h>
+
+static void read_page(struct tep_handle *tep, struct kbuffer *kbuf)
+{
+	static struct trace_seq seq;
+	struct tep_record record;
+
+	if (seq.buffer)
+		trace_seq_reset(&seq);
+	else
+		trace_seq_init(&seq);
+
+	while ((record.data = kbuffer_read_event(kbuf, &record.ts))) {
+		record.size = kbuffer_event_size(kbuf);
+		kbuffer_next_event(kbuf, NULL);
+		tep_print_event(tep, &seq, &record,
+				"%s-%d %9d\t%s: %s\n",
+				TEP_PRINT_COMM,
+				TEP_PRINT_PID,
+				TEP_PRINT_TIME,
+				TEP_PRINT_NAME,
+				TEP_PRINT_INFO);
+		trace_seq_do_printf(&seq);
+		trace_seq_reset(&seq);
+	}
+}
+
+int main (int argc, char **argv)
+{
+	struct tracefs_cpu *tcpu;
+	struct tep_handle *tep;
+	struct kbuffer *kbuf;
+	bool mapped;
+	int cpu;
+
+	if (argc < 2 || !isdigit(argv[1][0])) {
+		printf("usage: %s cpu\n\n", argv[0]);
+		exit(-1);
+	}
+
+	cpu = atoi(argv[1]);
+
+	tep = tracefs_local_events(NULL);
+	if (!tep) {
+		perror("Reading trace event formats");
+		exit(-1);
+	}
+
+	tcpu = tracefs_cpu_open_mapped(NULL, cpu, 0);
+	if (!tcpu) {
+		perror("Open CPU 0 file");
+		exit(-1);
+	}
+
+	/*
+	 * If this kernel supports mapping, use normal read,
+	 * otherwise use the piped buffer read.
+	 */
+	mapped = tracefs_cpu_is_mapped(tcpu);
+	if (!mapped)
+		printf("Was not able to map, falling back to buffered read\n");
+	while ((kbuf = mapped ? tracefs_cpu_read_buf(tcpu, true) :
+			tracefs_cpu_buffered_read_buf(tcpu, true))) {
+		read_page(tep, kbuf);
+	}
+
+	kbuf = tracefs_cpu_flush_buf(tcpu);
+	if (kbuf)
+		read_page(tep, kbuf);
+
+	tracefs_cpu_close(tcpu);
+	tep_free(tep);
+
+	return 0;
+}
+--
+
+FILES
+-----
+[verse]
+--
+*tracefs.h*
+	Header file to include in order to have access to the library APIs.
+*-ltracefs*
+	Linker switch to add when building a program that uses the library.
+--
+
+SEE ALSO
+--------
+*tracefs_cpu_open*(3),
+*tracefs_cpu_read*(3),
+*tracefs_cpu_read_buf*(3),
+*tracefs_cpu_buffered_read*(3),
+*tracefs_cpu_buffered_read_buf*(3),
+*libtracefs*(3),
+*libtraceevent*(3),
+*trace-cmd*(1)
+
+AUTHOR
+------
+[verse]
+--
+*Steven Rostedt* <rostedt@goodmis.org>
+--
+REPORTING BUGS
+--------------
+Report bugs to  <linux-trace-devel@vger.kernel.org>
+
+LICENSE
+-------
+libtracefs is Free Software licensed under the GNU LGPL 2.1
+
+RESOURCES
+---------
+https://git.kernel.org/pub/scm/libs/libtrace/libtracefs.git/
+
+COPYING
+-------
+Copyright \(C) 2022 Google, Inc. Free use of this software is granted under
+the terms of the GNU Public License (GPL).
diff --git a/Documentation/libtracefs.txt b/Documentation/libtracefs.txt
index b0aaa6222ec7..8dc3ba7386e3 100644
--- a/Documentation/libtracefs.txt
+++ b/Documentation/libtracefs.txt
@@ -138,6 +138,13 @@  Trace stream:
 	ssize_t *tracefs_trace_pipe_print*(struct tracefs_instance pass:[*]_instance_, int _flags_);
 	void *tracefs_trace_pipe_stop*(struct tracefs_instance pass:[*]_instance_);
 
+Memory mapping the ring buffer:
+	bool *tracefs_cpu_is_mapped*(struct tracefs_cpu pass:[*]tcpu);
+	int *tracefs_cpu_map*(struct tracefs_cpu pass:[*]tcpu);
+	void *tracefs_cpu_unmap*(struct tracefs_cpu pass:[*]tcpu);
+	struct tracefs_cpu pass:[*]*tracefs_cpu_open_mapped*(struct tracefs_instance pass:[*]instance,
+						int cpu, bool nonblock);
+
 Trace options:
 	const struct tracefs_options_mask pass:[*]*tracefs_options_get_supported*(struct tracefs_instance pass:[*]_instance_);
 	bool *tracefs_option_is_supported*(struct tracefs_instance pass:[*]_instance_, enum tracefs_option_id _id_);
diff --git a/Makefile b/Makefile
index 80076aa2cff3..e915e14b74e6 100644
--- a/Makefile
+++ b/Makefile
@@ -10,7 +10,8 @@  export TFS_PATCHLEVEL
 export TFS_EXTRAVERSION
 export TRACEFS_VERSION
 
-LIBTRACEEVENT_MIN_VERSION = 1.3
+# Note, samples and utests need 1.8.1
+LIBTRACEEVENT_MIN_VERSION = 1.8
 
 # taken from trace-cmd
 MAKEFLAGS += --no-print-directory
diff --git a/include/tracefs-local.h b/include/tracefs-local.h
index 9cae73c8b806..ffc9d33b1796 100644
--- a/include/tracefs-local.h
+++ b/include/tracefs-local.h
@@ -6,6 +6,7 @@ 
 #ifndef _TRACE_FS_LOCAL_H
 #define _TRACE_FS_LOCAL_H
 
+#include <tracefs.h>
 #include <pthread.h>
 
 #define __hidden __attribute__((visibility ("hidden")))
@@ -116,6 +117,11 @@  int trace_append_filter(char **filter, unsigned int *state,
 			enum tracefs_compare compare,
 			 const char *val);
 
+void *trace_mmap(int fd, struct kbuffer *kbuf);
+void trace_unmap(void *mapping);
+int trace_mmap_load_subbuf(void *mapping, struct kbuffer *kbuf);
+int trace_mmap_read(void *mapping, void *buffer);
+
 struct tracefs_synth *synth_init_from(struct tep_handle *tep,
 				      const char *start_system,
 				      const char *start_event);
diff --git a/include/tracefs.h b/include/tracefs.h
index 989112c851c8..8569171247b7 100644
--- a/include/tracefs.h
+++ b/include/tracefs.h
@@ -693,6 +693,13 @@  int tracefs_snapshot_snap(struct tracefs_instance *instance);
 int tracefs_snapshot_clear(struct tracefs_instance *instance);
 int tracefs_snapshot_free(struct tracefs_instance *instance);
 
+/* Memory mapping of ring buffer */
+bool tracefs_cpu_is_mapped(struct tracefs_cpu *tcpu);
+int tracefs_cpu_map(struct tracefs_cpu *tcpu);
+void tracefs_cpu_unmap(struct tracefs_cpu *tcpu);
+struct tracefs_cpu *tracefs_cpu_open_mapped(struct tracefs_instance *instance,
+					    int cpu, bool nonblock);
+
 /* Mapping vsocket cids to pids using tracing */
 int tracefs_instance_find_cid_pid(struct tracefs_instance *instance, int cid);
 int tracefs_find_cid_pid(int cid);
diff --git a/samples/Makefile b/samples/Makefile
index 77739c8b0aa7..81c8006f823e 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -26,6 +26,7 @@  EXAMPLES += guest
 EXAMPLES += cpu-buf
 EXAMPLES += instances-stat
 EXAMPLES += instances-subbuf
+EXAMPLES += cpu-map
 
 TARGETS :=
 TARGETS += sqlhist
diff --git a/src/Makefile b/src/Makefile
index faa3b25c4002..be81059ce10a 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -16,6 +16,7 @@  OBJS += tracefs-dynevents.o
 OBJS += tracefs-eprobes.o
 OBJS += tracefs-uprobes.o
 OBJS += tracefs-record.o
+OBJS += tracefs-mmap.o
 ifeq ($(VSOCK_DEFINED), 1)
 OBJS += tracefs-vsock.o
 endif
diff --git a/src/tracefs-mmap.c b/src/tracefs-mmap.c
new file mode 100644
index 000000000000..bc28caff91f3
--- /dev/null
+++ b/src/tracefs-mmap.c
@@ -0,0 +1,190 @@ 
+// SPDX-License-Identifier: LGPL-2.1
+/*
+ * Copyright (C) 2023 Google Inc, Steven Rostedt <rostedt@goodmis.org>
+ */
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <asm/types.h>
+#include "tracefs-local.h"
+
+struct trace_buffer_meta {
+	unsigned long	entries;
+	unsigned long	overrun;
+	unsigned long	read;
+
+	unsigned long	subbufs_touched;
+	unsigned long	subbufs_lost;
+	unsigned long	subbufs_read;
+
+	struct {
+		unsigned long	lost_events;	/* Events lost at the time of the reader swap */
+		__u32		id;		/* Reader subbuf ID from 0 to nr_subbufs - 1 */
+		__u32		read;		/* Number of bytes read on the reader subbuf */
+	} reader;
+
+	__u32		subbuf_size;		/* Size of each subbuf including the header */
+	__u32		nr_subbufs;		/* Number of subbufs in the ring-buffer */
+
+	__u32		meta_page_size;		/* Size of the meta-page */
+	__u32		meta_struct_len;	/* Len of this struct */
+};
+
+#define TRACE_MMAP_IOCTL_GET_READER		_IO('T', 0x1)
+
+struct trace_mmap {
+	struct trace_buffer_meta	*map;
+	struct kbuffer			*kbuf;
+	void				*data;
+	int				*data_pages;
+	int				fd;
+	int				last_idx;
+	int				last_read;
+	int				meta_len;
+	int				data_len;
+};
+
+/**
+ * trace_mmap - try to mmap the ring buffer
+ * @fd: The file descriptor to the trace_pipe_raw file
+ *
+ * Will try to mmap the ring buffer if it is supported, and
+ * if not, will return NULL, otherwise it returns a descriptor
+ * to handle the mapping.
+ */
+__hidden void *trace_mmap(int fd, struct kbuffer *kbuf)
+{
+	struct trace_mmap *tmap;
+	int page_size;
+	void *meta;
+	void *data;
+
+	page_size = getpagesize();
+	meta = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
+	if (meta == MAP_FAILED)
+		return NULL;
+
+	tmap = calloc(1, sizeof(*tmap));
+	if (!tmap) {
+		munmap(meta, page_size);
+		return NULL;
+	}
+
+	tmap->kbuf = kbuffer_dup(kbuf);
+	if (!tmap->kbuf) {
+		munmap(meta, page_size);
+		free(tmap);
+	}
+
+	tmap->fd = fd;
+
+	tmap->map = meta;
+	tmap->meta_len = tmap->map->meta_page_size;
+
+	if (tmap->meta_len > page_size) {
+		munmap(meta, page_size);
+		meta = mmap(NULL, tmap->meta_len, PROT_READ, MAP_SHARED, fd, 0);
+		if (meta == MAP_FAILED) {
+			kbuffer_free(tmap->kbuf);
+			free(tmap);
+			return NULL;
+		}
+		tmap->map = meta;
+	}
+
+	tmap->data_pages = meta + tmap->meta_len;
+
+	tmap->data_len = tmap->map->subbuf_size * tmap->map->nr_subbufs;
+
+	tmap->data = mmap(NULL, tmap->data_len, PROT_READ, MAP_SHARED,
+			  fd, tmap->meta_len);
+	if (tmap->data == MAP_FAILED) {
+		munmap(meta, tmap->meta_len);
+		kbuffer_free(tmap->kbuf);
+		free(tmap);
+		return NULL;
+	}
+
+	tmap->last_idx = tmap->map->reader.id;
+
+	data = tmap->data + tmap->map->subbuf_size * tmap->last_idx;
+	kbuffer_load_subbuffer(kbuf, data);
+
+	return tmap;
+}
+
+__hidden void trace_unmap(void *mapping)
+{
+	struct trace_mmap *tmap = mapping;
+
+	munmap(tmap->data, tmap->data_len);
+	munmap(tmap->map, tmap->meta_len);
+	kbuffer_free(tmap->kbuf);
+	free(tmap);
+}
+
+__hidden int trace_mmap_load_subbuf(void *mapping, struct kbuffer *kbuf)
+{
+	struct trace_mmap *tmap = mapping;
+	void *data;
+	int id;
+
+	id = tmap->map->reader.id;
+	data = tmap->data + tmap->map->subbuf_size * id;
+
+	/*
+	 * If kbuf doesn't point to the current sub-buffer
+	 * just load it and return.
+	 */
+	if (data != kbuffer_subbuffer(kbuf)) {
+		kbuffer_load_subbuffer(kbuf, data);
+		return 1;
+	}
+
+	/*
+	 * Perhaps the reader page had a write that added
+	 * more data.
+	 */
+	kbuffer_refresh(kbuf);
+
+	/* Are there still events to read? */
+	if (kbuffer_curr_size(kbuf))
+		return 1;
+
+	/* See if a new page is ready? */
+	if (ioctl(tmap->fd, TRACE_MMAP_IOCTL_GET_READER) < 0)
+		return -1;
+	id = tmap->map->reader.id;
+	data = tmap->data + tmap->map->subbuf_size * id;
+
+	/*
+	 * If the sub-buffer hasn't changed, then there's no more
+	 * events to read.
+	 */
+	if (data == kbuffer_subbuffer(kbuf))
+		return 0;
+
+	kbuffer_load_subbuffer(kbuf, data);
+	return 1;
+}
+
+__hidden int trace_mmap_read(void *mapping, void *buffer)
+{
+	struct trace_mmap *tmap = mapping;
+	struct kbuffer *kbuf;
+	int ret;
+
+	if (!tmap)
+		return -1;
+
+	kbuf = tmap->kbuf;
+
+	ret = trace_mmap_load_subbuf(mapping, kbuf);
+	/* Return for error or no more events */
+	if (ret <= 0)
+		return ret;
+
+	/* Update the buffer */
+	return kbuffer_read_buffer(kbuf, buffer, tmap->map->subbuf_size);
+}
diff --git a/src/tracefs-record.c b/src/tracefs-record.c
index e8be3335070b..f51e18420bc7 100644
--- a/src/tracefs-record.c
+++ b/src/tracefs-record.c
@@ -36,6 +36,7 @@  struct tracefs_cpu {
 	int		splice_read_flags;
 	struct kbuffer	*kbuf;
 	void		*buffer;
+	void		*mapping;
 };
 
 /**
@@ -229,6 +230,31 @@  int tracefs_snapshot_free(struct tracefs_instance *instance)
 	return ret < 0 ? -1 : 0;
 }
 
+/**
+ * tracefs_cpu_open_mapped - open an instance raw trace file and map it
+ * @instance: the instance (NULL for toplevel) of the cpu raw file to open
+ * @cpu: The CPU that the raw trace file is associated with
+ * @nonblock: If true, the file will be opened in O_NONBLOCK mode
+ *
+ * Return a descriptor that can read the tracefs trace_pipe_raw file
+ * for a given @cpu in a given @instance.
+ *
+ * Returns NULL on error.
+ */
+struct tracefs_cpu *
+tracefs_cpu_open_mapped(struct tracefs_instance *instance, int cpu, bool nonblock)
+{
+	struct tracefs_cpu *tcpu;
+
+	tcpu = tracefs_cpu_open(instance, cpu, nonblock);
+	if (!tcpu)
+		return NULL;
+
+	tracefs_cpu_map(tcpu);
+
+	return tcpu;
+}
+
 static void close_fd(int fd)
 {
 	if (fd < 0)
@@ -285,6 +311,28 @@  int tracefs_cpu_read_size(struct tracefs_cpu *tcpu)
 	return tcpu->subbuf_size;
 }
 
+bool tracefs_cpu_is_mapped(struct tracefs_cpu *tcpu)
+{
+	return tcpu->mapping != NULL;
+}
+
+int tracefs_cpu_map(struct tracefs_cpu *tcpu)
+{
+	if (tcpu->mapping)
+		return 0;
+
+	tcpu->mapping = trace_mmap(tcpu->fd, tcpu->kbuf);
+	return tcpu->mapping ? 0 : -1;
+}
+
+void tracefs_cpu_unmap(struct tracefs_cpu *tcpu)
+{
+	if (!tcpu->mapping)
+		return;
+
+	trace_unmap(tcpu->mapping);
+}
+
 static void set_nonblock(struct tracefs_cpu *tcpu)
 {
 	long flags;
@@ -383,6 +431,9 @@  int tracefs_cpu_read(struct tracefs_cpu *tcpu, void *buffer, bool nonblock)
 	if (ret <= 0)
 		return ret;
 
+	if (tcpu->mapping)
+		return trace_mmap_read(tcpu->mapping, buffer);
+
 	ret = read(tcpu->fd, buffer, tcpu->subbuf_size);
 
 	/* It's OK if there's no data to read */
@@ -427,6 +478,16 @@  struct kbuffer *tracefs_cpu_read_buf(struct tracefs_cpu *tcpu, bool nonblock)
 {
 	int ret;
 
+	/* If mapping is enabled, just use it directly */
+	if (tcpu->mapping) {
+		ret = wait_on_input(tcpu, nonblock);
+		if (ret <= 0)
+			return NULL;
+
+		ret = trace_mmap_load_subbuf(tcpu->mapping, tcpu->kbuf);
+		return ret > 0 ? tcpu->kbuf : NULL;
+	}
+
 	if (!get_buffer(tcpu))
 		return NULL;