ring-buffer: Allow persistent ring buffers to be mmapped

Message ID 20250214130442.3f4fee5d@gandalf.local.home
State New
Series ring-buffer: Allow persistent ring buffers to be mmapped

Commit Message

Steven Rostedt Feb. 14, 2025, 6:04 p.m. UTC
From: Steven Rostedt <rostedt@goodmis.org>

The persistent ring buffer uses vmap()'d memory to map the memory
reserved at boot. But mmap()ing the ring buffer to user space requires
virt_to_page() to return a valid page, and that only works for core
kernel addresses, not for vmap() addresses.

If virt_addr_valid() fails on the address to be mapped, use
vmalloc_to_page() instead.
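
For illustration, a minimal sketch (not part of the patch) of the
fallback applied at each flush/map site below. The helper name
rb_virt_to_page() is made up for this example; the calls themselves
are the standard kernel APIs:

#include <linux/mm.h>		/* virt_addr_valid(), virt_to_page() */
#include <linux/vmalloc.h>	/* vmalloc_to_page() */

/* Hypothetical helper mirroring the open-coded fallback in the patch */
static struct page *rb_virt_to_page(void *addr)
{
	/* Directly mapped (core kernel) address: linear lookup works */
	if (virt_addr_valid(addr))
		return virt_to_page(addr);
	/* vmap()/vmalloc() address: walk the page tables instead */
	return vmalloc_to_page(addr);
}

The result is then fed through page_folio() before flush_dcache_folio(),
which is exactly what the hunks below do.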

Also, the persistent memory uses page->id for its own purpose, whereas
the user mmap buffer currently uses it for the subbuf array mapped to
user space. If the buffer is a persistent buffer, use the page's index
into that buffer as the identifier instead of page->id.

That is, for a persistent buffer, page->id represents the order of the
buffer in the linked list, where ->id == 0 means it is the reader page.
When a reader page is swapped, the new reader page's ->id is set to
zero, and the old reader page gets the ->id of the page it swapped
with.

For the user space mapping, the ->id is the index of where the page
was mapped in user space and does not change while it is mapped.

Since the persistent buffer is fixed in its location, the index of
where a page is in the memory range can be used as the "id" to put in
the meta page array, and the pages can be mapped to user space in the
same order as they are in the persistent memory.
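
As a sketch of that mapping (again not from the patch; the parameters
are assumed stand-ins for what rb_meta_subbuf_idx() derives from the
boot-mapped metadata), the id of a sub-buffer in a fixed, contiguous
range is just its offset divided by the sub-buffer size:

/*
 * 'start' is the address of the first sub-buffer in the range and
 * 'subbuf_size' the size of each sub-buffer; both hypothetical here.
 */
static int subbuf_index(unsigned long subbuf, unsigned long start,
			unsigned int subbuf_size)
{
	return (subbuf - start) / subbuf_size;
}

Because that index is fixed by the page's position in memory, it is
stable across reader-page swaps and can double as the user space
mapping id.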

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 49 ++++++++++++++++++++++++++++++++------
 kernel/trace/trace.c       |  4 ----
 2 files changed, 42 insertions(+), 11 deletions(-)

Comments

Masami Hiramatsu (Google) Feb. 15, 2025, 3:21 p.m. UTC | #1
On Fri, 14 Feb 2025 13:04:42 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> From: Steven Rostedt <rostedt@goodmis.org>
> 
> The persistent ring buffer uses vmap()'d memory to map the memory
> reserved at boot. But mmap()ing the ring buffer to user space requires
> virt_to_page() to return a valid page, and that only works for core
> kernel addresses, not for vmap() addresses.
> 
> If virt_addr_valid() fails on the address to be mapped, use
> vmalloc_to_page() instead.
> 
> Also, the persistent memory uses page->id for its own purpose, whereas
> the user mmap buffer currently uses it for the subbuf array mapped to
> user space. If the buffer is a persistent buffer, use the page's index
> into that buffer as the identifier instead of page->id.
> 
> That is, for a persistent buffer, page->id represents the order of the
> buffer in the linked list, where ->id == 0 means it is the reader page.
> When a reader page is swapped, the new reader page's ->id is set to
> zero, and the old reader page gets the ->id of the page it swapped
> with.
> 
> For the user space mapping, the ->id is the index of where the page
> was mapped in user space and does not change while it is mapped.
> 
> Since the persistent buffer is fixed in its location, the index of
> where a page is in the memory range can be used as the "id" to put in
> the meta page array, and the pages can be mapped to user space in the
> same order as they are in the persistent memory.

Looks good and works for me.

Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Tested-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Thank you,

Patch

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bb6089c2951e..87caf9d48edb 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5950,12 +5950,18 @@ static void rb_clear_buffer_page(struct buffer_page *page)
 static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	struct trace_buffer_meta *meta = cpu_buffer->meta_page;
+	struct page *page;
 
 	if (!meta)
 		return;
 
 	meta->reader.read = cpu_buffer->reader_page->read;
-	meta->reader.id = cpu_buffer->reader_page->id;
+	/* For boot buffers, the id is the index */
+	if (cpu_buffer->ring_meta)
+		meta->reader.id = rb_meta_subbuf_idx(cpu_buffer->ring_meta,
+						     cpu_buffer->reader_page->page);
+	else
+		meta->reader.id = cpu_buffer->reader_page->id;
 	meta->reader.lost_events = cpu_buffer->lost_events;
 
 	meta->entries = local_read(&cpu_buffer->entries);
@@ -5963,7 +5969,12 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
 	meta->read = cpu_buffer->read;
 
 	/* Some archs do not have data cache coherency between kernel and user-space */
-	flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page));
+	if (virt_addr_valid(cpu_buffer->meta_page))
+		page = virt_to_page(cpu_buffer->meta_page);
+	else
+		page = vmalloc_to_page(cpu_buffer->meta_page);
+
+	flush_dcache_folio(page_folio(page));
 }
 
 static void
@@ -6883,23 +6894,38 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
 	struct trace_buffer_meta *meta = cpu_buffer->meta_page;
 	unsigned int nr_subbufs = cpu_buffer->nr_pages + 1;
 	struct buffer_page *first_subbuf, *subbuf;
+	int cnt = 0;
 	int id = 0;
 
-	subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page;
-	cpu_buffer->reader_page->id = id++;
+	if (cpu_buffer->ring_meta)
+		id = rb_meta_subbuf_idx(cpu_buffer->ring_meta,
+					cpu_buffer->reader_page->page);
+	else
+		cpu_buffer->reader_page->id = id;
+
+	subbuf_ids[id++] = (unsigned long)cpu_buffer->reader_page->page;
+	cnt++;
 
 	first_subbuf = subbuf = rb_set_head_page(cpu_buffer);
 	do {
+		if (cpu_buffer->ring_meta)
+			id = rb_meta_subbuf_idx(cpu_buffer->ring_meta,
+						subbuf->page);
+		else
+			subbuf->id = id;
+
 		if (WARN_ON(id >= nr_subbufs))
 			break;
 
 		subbuf_ids[id] = (unsigned long)subbuf->page;
-		subbuf->id = id;
 
 		rb_inc_page(&subbuf);
 		id++;
+		cnt++;
 	} while (subbuf != first_subbuf);
 
+	WARN_ON(cnt != nr_subbufs);
+
 	/* install subbuf ID to kern VA translation */
 	cpu_buffer->subbuf_ids = subbuf_ids;
 
@@ -7064,7 +7090,10 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
 			goto out;
 		}
 
-		page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]);
+		if (virt_addr_valid(cpu_buffer->subbuf_ids[s]))
+			page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]);
+		else
+			page = vmalloc_to_page((void *)cpu_buffer->subbuf_ids[s]);
 
 		for (; off < (1 << (subbuf_order)); off++, page++) {
 			if (p >= nr_pages)
@@ -7210,6 +7239,7 @@ int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu)
 	unsigned long missed_events;
 	unsigned long reader_size;
 	unsigned long flags;
+	struct page *page;
 
 	cpu_buffer = rb_get_mapped_buffer(buffer, cpu);
 	if (IS_ERR(cpu_buffer))
@@ -7278,7 +7308,12 @@ int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu)
 
 out:
 	/* Some archs do not have data cache coherency between kernel and user-space */
-	flush_dcache_folio(virt_to_folio(cpu_buffer->reader_page->page));
+	if (virt_addr_valid(cpu_buffer->reader_page->page))
+		page = virt_to_page(cpu_buffer->reader_page->page);
+	else
+		page = vmalloc_to_page(cpu_buffer->reader_page->page);
+
+	flush_dcache_folio(page_folio(page));
 
 	rb_update_meta_page(cpu_buffer);
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0e6d517e74e0..25ff37aab00f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8279,10 +8279,6 @@ static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma)
 	struct trace_iterator *iter = &info->iter;
 	int ret = 0;
 
-	/* Currently the boot mapped buffer is not supported for mmap */
-	if (iter->tr->flags & TRACE_ARRAY_FL_BOOT)
-		return -ENODEV;
-
 	ret = get_snapshot_map(iter->tr);
 	if (ret)
 		return ret;