diff mbox series

[v2] tracing: Introduce pipe_cpumask to avoid race on trace_pipes

Message ID 20230818022645.1948314-1-zhengyejian1@huawei.com (mailing list archive)
State Accepted
Commit c2489bb7e6be2e8cdced12c16c42fa128403ac03
Headers show
Series [v2] tracing: Introduce pipe_cpumask to avoid race on trace_pipes | expand

Commit Message

Zheng Yejian Aug. 18, 2023, 2:26 a.m. UTC
There is race issue when concurrently splice_read main trace_pipe and
per_cpu trace_pipes which will result in data read out being different
from what actually writen.

As suggested by Steven:
  > I believe we should add a ref count to trace_pipe and the per_cpu
  > trace_pipes, where if they are opened, nothing else can read it.
  >
  > Opening trace_pipe locks all per_cpu ref counts, if any of them are
  > open, then the trace_pipe open will fail (and releases any ref counts
  > it had taken).
  >
  > Opening a per_cpu trace_pipe will up the ref count for just that
  > CPU buffer. This will allow multiple tasks to read different per_cpu
  > trace_pipe files, but will prevent the main trace_pipe file from
  > being opened.

But because we only need to know whether per_cpu trace_pipe is open or
not, using a cpumask instead of using ref count may be easier.

After this patch, users will find that:
 - Main trace_pipe can be opened by only one user, and if it is
   opened, all per_cpu trace_pipes cannot be opened;
 - Per_cpu trace_pipes can be opened by multiple users, but each per_cpu
   trace_pipe can only be opened by one user. And if one of them is
   opened, main trace_pipe cannot be opened.

Suggested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
---

v2:
  - In open_pipe_on_cpu(), clean format of if statements.
  - In tracing_open_pipe(), call tracing_get_cpu() after
    tracing_check_open_get_tr() and within trace_types_lock,
    also keep "int ret;" as the last declaration.
    Link: https://lore.kernel.org/all/20230817101331.21ab6b33@gandalf.local.home/

v1:
  - Link: https://lore.kernel.org/all/20230817115057.1637676-1-zhengyejian1@huawei.com/

 kernel/trace/trace.c | 55 ++++++++++++++++++++++++++++++++++++++------
 kernel/trace/trace.h |  2 ++
 2 files changed, 50 insertions(+), 7 deletions(-)

Comments

Masami Hiramatsu (Google) Aug. 18, 2023, 5:03 a.m. UTC | #1
On Fri, 18 Aug 2023 10:26:45 +0800
Zheng Yejian <zhengyejian1@huawei.com> wrote:

> There is race issue when concurrently splice_read main trace_pipe and
> per_cpu trace_pipes which will result in data read out being different
> from what actually writen.
> 
> As suggested by Steven:
>   > I believe we should add a ref count to trace_pipe and the per_cpu
>   > trace_pipes, where if they are opened, nothing else can read it.
>   >
>   > Opening trace_pipe locks all per_cpu ref counts, if any of them are
>   > open, then the trace_pipe open will fail (and releases any ref counts
>   > it had taken).
>   >
>   > Opening a per_cpu trace_pipe will up the ref count for just that
>   > CPU buffer. This will allow multiple tasks to read different per_cpu
>   > trace_pipe files, but will prevent the main trace_pipe file from
>   > being opened.
> 
> But because we only need to know whether per_cpu trace_pipe is open or
> not, using a cpumask instead of using ref count may be easier.
> 
> After this patch, users will find that:
>  - Main trace_pipe can be opened by only one user, and if it is
>    opened, all per_cpu trace_pipes cannot be opened;
>  - Per_cpu trace_pipes can be opened by multiple users, but each per_cpu
>    trace_pipe can only be opened by one user. And if one of them is
>    opened, main trace_pipe cannot be opened.
> 
> Suggested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>

Looks good to me. 

Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

BTW, this reminds me that splice read support had been introduced for
virtio-trace support. I've tested it with/without this patch, but I hit
a different issue, which seems like a virtio-serial issue (per-cpu buffers
on the guest are spliced correctly, but the data can not be read from host
side). Let me investigate it.

Thank you,

> ---
> 
> v2:
>   - In open_pipe_on_cpu(), clean format of if statements.
>   - In tracing_open_pipe(), call tracing_get_cpu() after
>     tracing_check_open_get_tr() and within trace_types_lock,
>     also keep "int ret;" as the last declaration.
>     Link: https://lore.kernel.org/all/20230817101331.21ab6b33@gandalf.local.home/
> 
> v1:
>   - Link: https://lore.kernel.org/all/20230817115057.1637676-1-zhengyejian1@huawei.com/
> 
>  kernel/trace/trace.c | 55 ++++++++++++++++++++++++++++++++++++++------
>  kernel/trace/trace.h |  2 ++
>  2 files changed, 50 insertions(+), 7 deletions(-)
> 
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index b8870078ef58..c888a0a2c0e2 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -6705,10 +6705,36 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
>  
>  #endif
>  
> +static int open_pipe_on_cpu(struct trace_array *tr, int cpu)
> +{
> +	if (cpu == RING_BUFFER_ALL_CPUS) {
> +		if (cpumask_empty(tr->pipe_cpumask)) {
> +			cpumask_setall(tr->pipe_cpumask);
> +			return 0;
> +		}
> +	} else if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) {
> +		cpumask_set_cpu(cpu, tr->pipe_cpumask);
> +		return 0;
> +	}
> +	return -EBUSY;
> +}
> +
> +static void close_pipe_on_cpu(struct trace_array *tr, int cpu)
> +{
> +	if (cpu == RING_BUFFER_ALL_CPUS) {
> +		WARN_ON(!cpumask_full(tr->pipe_cpumask));
> +		cpumask_clear(tr->pipe_cpumask);
> +	} else {
> +		WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask));
> +		cpumask_clear_cpu(cpu, tr->pipe_cpumask);
> +	}
> +}
> +
>  static int tracing_open_pipe(struct inode *inode, struct file *filp)
>  {
>  	struct trace_array *tr = inode->i_private;
>  	struct trace_iterator *iter;
> +	int cpu;
>  	int ret;
>  
>  	ret = tracing_check_open_get_tr(tr);
> @@ -6716,13 +6742,16 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
>  		return ret;
>  
>  	mutex_lock(&trace_types_lock);
> +	cpu = tracing_get_cpu(inode);
> +	ret = open_pipe_on_cpu(tr, cpu);
> +	if (ret)
> +		goto fail_pipe_on_cpu;
>  
>  	/* create a buffer to store the information to pass to userspace */
>  	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
>  	if (!iter) {
>  		ret = -ENOMEM;
> -		__trace_array_put(tr);
> -		goto out;
> +		goto fail_alloc_iter;
>  	}
>  
>  	trace_seq_init(&iter->seq);
> @@ -6745,7 +6774,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
>  
>  	iter->tr = tr;
>  	iter->array_buffer = &tr->array_buffer;
> -	iter->cpu_file = tracing_get_cpu(inode);
> +	iter->cpu_file = cpu;
>  	mutex_init(&iter->mutex);
>  	filp->private_data = iter;
>  
> @@ -6755,12 +6784,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
>  	nonseekable_open(inode, filp);
>  
>  	tr->trace_ref++;
> -out:
> +
>  	mutex_unlock(&trace_types_lock);
>  	return ret;
>  
>  fail:
>  	kfree(iter);
> +fail_alloc_iter:
> +	close_pipe_on_cpu(tr, cpu);
> +fail_pipe_on_cpu:
>  	__trace_array_put(tr);
>  	mutex_unlock(&trace_types_lock);
>  	return ret;
> @@ -6777,7 +6809,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
>  
>  	if (iter->trace->pipe_close)
>  		iter->trace->pipe_close(iter);
> -
> +	close_pipe_on_cpu(tr, iter->cpu_file);
>  	mutex_unlock(&trace_types_lock);
>  
>  	free_cpumask_var(iter->started);
> @@ -9441,6 +9473,9 @@ static struct trace_array *trace_array_create(const char *name)
>  	if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
>  		goto out_free_tr;
>  
> +	if (!alloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
> +		goto out_free_tr;
> +
>  	tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
>  
>  	cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
> @@ -9482,6 +9517,7 @@ static struct trace_array *trace_array_create(const char *name)
>   out_free_tr:
>  	ftrace_free_ftrace_ops(tr);
>  	free_trace_buffers(tr);
> +	free_cpumask_var(tr->pipe_cpumask);
>  	free_cpumask_var(tr->tracing_cpumask);
>  	kfree(tr->name);
>  	kfree(tr);
> @@ -9584,6 +9620,7 @@ static int __remove_instance(struct trace_array *tr)
>  	}
>  	kfree(tr->topts);
>  
> +	free_cpumask_var(tr->pipe_cpumask);
>  	free_cpumask_var(tr->tracing_cpumask);
>  	kfree(tr->name);
>  	kfree(tr);
> @@ -10381,12 +10418,14 @@ __init static int tracer_alloc_buffers(void)
>  	if (trace_create_savedcmd() < 0)
>  		goto out_free_temp_buffer;
>  
> +	if (!alloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL))
> +		goto out_free_savedcmd;
> +
>  	/* TODO: make the number of buffers hot pluggable with CPUS */
>  	if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
>  		MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n");
> -		goto out_free_savedcmd;
> +		goto out_free_pipe_cpumask;
>  	}
> -
>  	if (global_trace.buffer_disabled)
>  		tracing_off();
>  
> @@ -10439,6 +10478,8 @@ __init static int tracer_alloc_buffers(void)
>  
>  	return 0;
>  
> +out_free_pipe_cpumask:
> +	free_cpumask_var(global_trace.pipe_cpumask);
>  out_free_savedcmd:
>  	free_saved_cmdlines_buffer(savedcmd);
>  out_free_temp_buffer:
> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> index e1edc2197fc8..53ac0f7780c2 100644
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -377,6 +377,8 @@ struct trace_array {
>  	struct list_head	events;
>  	struct trace_event_file *trace_marker_file;
>  	cpumask_var_t		tracing_cpumask; /* only trace on set CPUs */
> +	/* one per_cpu trace_pipe can be opened by only one user */
> +	cpumask_var_t		pipe_cpumask;
>  	int			ref;
>  	int			trace_ref;
>  #ifdef CONFIG_FUNCTION_TRACER
> -- 
> 2.25.1
>
Steven Rostedt Aug. 18, 2023, 1:41 p.m. UTC | #2
On Fri, 18 Aug 2023 14:03:09 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> Looks good to me. 
> 
> Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> 
> BTW, this reminds me that splice read support had been introduced for
> virtio-trace support. I've tested it with/without this patch, but I hit
> a different issue, which seems like a virtio-serial issue (per-cpu buffers
> on the guest are spliced correctly, but the data can not be read from host
> side). Let me investigate it.

Is virtio-trace using trace_pipe or trace_pipe_raw? Those two are handled
differently.

-- Steve
Masami Hiramatsu (Google) Aug. 18, 2023, 2:23 p.m. UTC | #3
On Fri, 18 Aug 2023 09:41:28 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Fri, 18 Aug 2023 14:03:09 +0900
> Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> 
> > Looks good to me. 
> > 
> > Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> > 
> > BTW, this reminds me that splice read support had been introduced for
> > virtio-trace support. I've tested it with/without this patch, but I hit
> > a different issue, which seems like a virtio-serial issue (per-cpu buffers
> > on the guest are spliced correctly, but the data can not be read from host
> > side). Let me investigate it.
> 
> Is virtio-trace using trace_pipe or trace_pipe_raw? Those two are handled
> differently.

It uses trace_pipe_raw. I guess if splice(from trace_pipe_raw to virtio-serial)
returns -1 and errno == EAGAIN, the trace data will be lost?

Thank you,

> 
> -- Steve
Steven Rostedt Aug. 18, 2023, 3:53 p.m. UTC | #4
On Fri, 18 Aug 2023 23:23:01 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> It uses trace_pipe_raw. I guess if splice(from trace_pipe_raw to virtio-serial)
> returns -1 and errno == EAGAIN, the trace data will be lost?

It shouldn't. If it does, then there's likely a bug. The code will block
and if an interrupt comes in it will return immediately without reading
from the buffer.

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/trace/trace.c#n8262

I don't see where it would return -EINTR and consume data, but I may be
missing something.

-- Steve
Masami Hiramatsu (Google) Aug. 19, 2023, 1:42 a.m. UTC | #5
On Fri, 18 Aug 2023 11:53:22 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Fri, 18 Aug 2023 23:23:01 +0900
> Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> 
> > It uses trace_pipe_raw. I guess if splice(from trace_pipe_raw to virtio-serial)
> > returns -1 and errno == EAGAIN, the trace data will be lost?
> 
> It shouldn't. If it does, then there's likely a bug. The code will block
> and if an interrupt comes in it will return immediately without reading
> from the buffer.
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/trace/trace.c#n8262
> 
> I don't see where it would return -EINTR and consume data, but I may be
> missing something.

Hmm, I suspect the case if the spilice_to_pipe() returns -EAGAIN.

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/trace/trace.c#n8491

It seems not handling such case.

Anyway, I also think something wrong in virtio-serial (or misusing?), since
it can not read anything from the host sometimes. I just setup the virtio-trace
with below patch (ignore EAGAIN).


From 92242480285448360c9390a743ea7b3751bb3e61 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Thu, 17 Aug 2023 14:08:40 +0900
Subject: [PATCH 1/3] tools/virtio-trace: Ignore EAGAIN error on splice()

splice() can return EAGAIN error instead of returning 0 size read.
In that case, wait a while and try to call splice() again.

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 tools/virtio/virtio-trace/trace-agent-rw.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tools/virtio/virtio-trace/trace-agent-rw.c b/tools/virtio/virtio-trace/trace-agent-rw.c
index ddfe7875eb16..e8a4c4f0c499 100644
--- a/tools/virtio/virtio-trace/trace-agent-rw.c
+++ b/tools/virtio/virtio-trace/trace-agent-rw.c
@@ -8,6 +8,7 @@
  */
 
 #define _GNU_SOURCE
+#include <errno.h>
 #include <fcntl.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -127,10 +128,10 @@ static void *rw_thread_main(void *thread_info)
 		rlen = splice(ts->in_fd, NULL, ts->read_pipe, NULL,
 				ts->pipe_size, SPLICE_F_MOVE | SPLICE_F_MORE);
 
-		if (rlen < 0) {
-			pr_err("Splice_read in rw-thread(%d)\n", ts->cpu_num);
+		if (rlen < 0 && errno != EAGAIN) {
+			pr_err("Splice_read error (%d) in rw-thread(%d)\n", errno, ts->cpu_num);
 			goto error;
-		} else if (rlen == 0) {
+		} else if (rlen == 0 || errno == EAGAIN) {
 			/*
 			 * If trace data do not exist or are unreadable not
 			 * for exceeding the page size, splice_read returns
Masami Hiramatsu (Google) Aug. 20, 2023, 1:18 p.m. UTC | #6
On Sat, 19 Aug 2023 10:42:57 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> On Fri, 18 Aug 2023 11:53:22 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > On Fri, 18 Aug 2023 23:23:01 +0900
> > Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> > 
> > > It uses trace_pipe_raw. I guess if splice(from trace_pipe_raw to virtio-serial)
> > > returns -1 and errno == EAGAIN, the trace data will be lost?
> > 
> > It shouldn't. If it does, then there's likely a bug. The code will block
> > and if an interrupt comes in it will return immediately without reading
> > from the buffer.
> > 
> > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/trace/trace.c#n8262
> > 
> > I don't see where it would return -EINTR and consume data, but I may be
> > missing something.
> 
> Hmm, I suspect the case if the spilice_to_pipe() returns -EAGAIN.
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/trace/trace.c#n8491
> 
> It seems not handling such case.
> 
> Anyway, I also think something wrong in virtio-serial (or misusing?), since
> it can not read anything from the host sometimes. I just setup the virtio-trace
> with below patch (ignore EAGAIN).

Hmm, I couldn't reproduce it. (maybe a host security update change something?)
Anyway, I confirmed that the ring buffer pages will not be consumed unless
splice_to_pipe() succeeded.

Thank you,

> 
> 
> From 92242480285448360c9390a743ea7b3751bb3e61 Mon Sep 17 00:00:00 2001
> From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
> Date: Thu, 17 Aug 2023 14:08:40 +0900
> Subject: [PATCH 1/3] tools/virtio-trace: Ignore EAGAIN error on splice()
> 
> splice() can return EAGAIN error instead of returning 0 size read.
> In that case, wait a while and try to call splice() again.
> 
> Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> ---
>  tools/virtio/virtio-trace/trace-agent-rw.c | 7 ++++---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/tools/virtio/virtio-trace/trace-agent-rw.c b/tools/virtio/virtio-trace/trace-agent-rw.c
> index ddfe7875eb16..e8a4c4f0c499 100644
> --- a/tools/virtio/virtio-trace/trace-agent-rw.c
> +++ b/tools/virtio/virtio-trace/trace-agent-rw.c
> @@ -8,6 +8,7 @@
>   */
>  
>  #define _GNU_SOURCE
> +#include <errno.h>
>  #include <fcntl.h>
>  #include <stdio.h>
>  #include <stdlib.h>
> @@ -127,10 +128,10 @@ static void *rw_thread_main(void *thread_info)
>  		rlen = splice(ts->in_fd, NULL, ts->read_pipe, NULL,
>  				ts->pipe_size, SPLICE_F_MOVE | SPLICE_F_MORE);
>  
> -		if (rlen < 0) {
> -			pr_err("Splice_read in rw-thread(%d)\n", ts->cpu_num);
> +		if (rlen < 0 && errno != EAGAIN) {
> +			pr_err("Splice_read error (%d) in rw-thread(%d)\n", errno, ts->cpu_num);
>  			goto error;
> -		} else if (rlen == 0) {
> +		} else if (rlen == 0 || errno == EAGAIN) {
>  			/*
>  			 * If trace data do not exist or are unreadable not
>  			 * for exceeding the page size, splice_read returns
> -- 
> 2.34.1
> 
> 
> -- 
> Masami Hiramatsu (Google) <mhiramat@kernel.org>
Masami Hiramatsu (Google) Aug. 21, 2023, 2:19 a.m. UTC | #7
On Sat, 19 Aug 2023 10:42:57 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> On Fri, 18 Aug 2023 11:53:22 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > On Fri, 18 Aug 2023 23:23:01 +0900
> > Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> > 
> > > It uses trace_pipe_raw. I guess if splice(from trace_pipe_raw to virtio-serial)
> > > returns -1 and errno == EAGAIN, the trace data will be lost?
> > 
> > It shouldn't. If it does, then there's likely a bug. The code will block
> > and if an interrupt comes in it will return immediately without reading
> > from the buffer.
> > 
> > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/trace/trace.c#n8262
> > 
> > I don't see where it would return -EINTR and consume data, but I may be
> > missing something.
> 
> Hmm, I suspect the case if the spilice_to_pipe() returns -EAGAIN.
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/trace/trace.c#n8491
> 
> It seems not handling such case.
> 
> Anyway, I also think something wrong in virtio-serial (or misusing?), since
> it can not read anything from the host sometimes. I just setup the virtio-trace
> with below patch (ignore EAGAIN).
> 
> 
> From 92242480285448360c9390a743ea7b3751bb3e61 Mon Sep 17 00:00:00 2001
> From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
> Date: Thu, 17 Aug 2023 14:08:40 +0900
> Subject: [PATCH 1/3] tools/virtio-trace: Ignore EAGAIN error on splice()
> 
> splice() can return EAGAIN error instead of returning 0 size read.
> In that case, wait a while and try to call splice() again.
> 
> Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> ---
>  tools/virtio/virtio-trace/trace-agent-rw.c | 7 ++++---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/tools/virtio/virtio-trace/trace-agent-rw.c b/tools/virtio/virtio-trace/trace-agent-rw.c
> index ddfe7875eb16..e8a4c4f0c499 100644
> --- a/tools/virtio/virtio-trace/trace-agent-rw.c
> +++ b/tools/virtio/virtio-trace/trace-agent-rw.c
> @@ -8,6 +8,7 @@
>   */
>  
>  #define _GNU_SOURCE
> +#include <errno.h>
>  #include <fcntl.h>
>  #include <stdio.h>
>  #include <stdlib.h>
> @@ -127,10 +128,10 @@ static void *rw_thread_main(void *thread_info)
>  		rlen = splice(ts->in_fd, NULL, ts->read_pipe, NULL,
>  				ts->pipe_size, SPLICE_F_MOVE | SPLICE_F_MORE);
>  
> -		if (rlen < 0) {
> -			pr_err("Splice_read in rw-thread(%d)\n", ts->cpu_num);
> +		if (rlen < 0 && errno != EAGAIN) {
> +			pr_err("Splice_read error (%d) in rw-thread(%d)\n", errno, ts->cpu_num);
>  			goto error;
> -		} else if (rlen == 0) {
> +		} else if (rlen == 0 || errno == EAGAIN) {

Ah, this caused a drop. errno can be EAGAIN even if rlen > 0.
I've fixed this and that works.
BTW, I think this virtio-trace would be better to move under
tools/tracing because it is a tracing tool.

Thanks,

>  			/*
>  			 * If trace data do not exist or are unreadable not
>  			 * for exceeding the page size, splice_read returns
> -- 
> 2.34.1
> 
> 
> -- 
> Masami Hiramatsu (Google) <mhiramat@kernel.org>
Steven Rostedt Aug. 21, 2023, 2:33 a.m. UTC | #8
On Mon, 21 Aug 2023 11:19:54 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> Ah, this caused a drop. errno can be EAGAIN even if rlen > 0.
> I've fixed this and that works.
> BTW, I think this virtio-trace would be better to move under
> tools/tracing because it is a tracing tool.

I'm fine with that, as where it is, I'm very unfamiliar with this tool.
It is likely not taking advantage of all the tracing tooling we
have. I actually never even used it.

-- Steve
Masami Hiramatsu (Google) Aug. 21, 2023, 9:21 a.m. UTC | #9
On Sun, 20 Aug 2023 22:33:01 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Mon, 21 Aug 2023 11:19:54 +0900
> Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> 
> > Ah, this caused a drop. errno can be EAGAIN even if rlen > 0.
> > I've fixed this and that works.
> > BTW, I think this virtio-trace would be better to move under
> > tools/tracing because it is a tracing tool.
> 
> I'm fine with that, as where it is, I'm very unfamiliar with this tool.
> It is likely not taking advantage of all the tracing tooling we
> have. I actually never even used it.

Actually I also used this after a long time. :P
I think this is a kind of simplest splice support test tool.

What the tool does;

(guest side)
per_cpu/cpu*/trace_pipe_raw
     |
   (splice)
     |
  anon-pipe
     |
   (splice)
     |
  virtio-serial chardev
     |
= virtqueue ===
     |
  named-fifo
(host-side)

So that we can move the traced data (page) from the ring buffer to
virtqueue. Then host tool can read the trace data without copying.
(The host part needs a copy to read or write to file.)
Obviously, this requires some integration work with other tracing
tools, because this is just a "fastest trace-data dumper".

(I think Yoshihiro worked that integration, but it was not updated)

https://lkml.org/lkml/2013/9/12/788

Thank you,
Steven Rostedt Aug. 21, 2023, 3:17 p.m. UTC | #10
On Sat, 19 Aug 2023 10:42:57 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> >From 92242480285448360c9390a743ea7b3751bb3e61 Mon Sep 17 00:00:00 2001  
> From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
> Date: Thu, 17 Aug 2023 14:08:40 +0900
> Subject: [PATCH 1/3] tools/virtio-trace: Ignore EAGAIN error on splice()
> 
> splice() can return EAGAIN error instead of returning 0 size read.
> In that case, wait a while and try to call splice() again.
> 
> Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Bah,

When I pulled in Zheng's patch from patchwork, it included the above
signed-off-by. I was confused to why it added your SoB.

Probably best if we had started this conversation on another thread and
not hijack the discussion about this patch.

-- Steve


> ---
>  tools/virtio/virtio-trace/trace-agent-rw.c | 7 ++++---
>  1 file changed, 4 insertions(+), 3 deletions(-)
Masami Hiramatsu (Google) Aug. 21, 2023, 10:32 p.m. UTC | #11
On Mon, 21 Aug 2023 11:17:25 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Sat, 19 Aug 2023 10:42:57 +0900
> Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> 
> > >From 92242480285448360c9390a743ea7b3751bb3e61 Mon Sep 17 00:00:00 2001  
> > From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
> > Date: Thu, 17 Aug 2023 14:08:40 +0900
> > Subject: [PATCH 1/3] tools/virtio-trace: Ignore EAGAIN error on splice()
> > 
> > splice() can return EAGAIN error instead of returning 0 size read.
> > In that case, wait a while and try to call splice() again.
> > 
> > Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> 
> Bah,
> 
> When I pulled in Zheng's patch from patchwork, it included the above
> signed-off-by. I was confused to why it added your SoB.

Oops, I didn't noticed such patchwork behavior. 

> 
> Probably best if we had started this conversation on another thread and
> not hijack the discussion about this patch.

Agreed. Sorry for confusion.


> 
> -- Steve
> 
> 
> > ---
> >  tools/virtio/virtio-trace/trace-agent-rw.c | 7 ++++---
> >  1 file changed, 4 insertions(+), 3 deletions(-)
>
diff mbox series

Patch

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b8870078ef58..c888a0a2c0e2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6705,10 +6705,36 @@  tracing_max_lat_write(struct file *filp, const char __user *ubuf,
 
 #endif
 
+static int open_pipe_on_cpu(struct trace_array *tr, int cpu)
+{
+	if (cpu == RING_BUFFER_ALL_CPUS) {
+		if (cpumask_empty(tr->pipe_cpumask)) {
+			cpumask_setall(tr->pipe_cpumask);
+			return 0;
+		}
+	} else if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) {
+		cpumask_set_cpu(cpu, tr->pipe_cpumask);
+		return 0;
+	}
+	return -EBUSY;
+}
+
+static void close_pipe_on_cpu(struct trace_array *tr, int cpu)
+{
+	if (cpu == RING_BUFFER_ALL_CPUS) {
+		WARN_ON(!cpumask_full(tr->pipe_cpumask));
+		cpumask_clear(tr->pipe_cpumask);
+	} else {
+		WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask));
+		cpumask_clear_cpu(cpu, tr->pipe_cpumask);
+	}
+}
+
 static int tracing_open_pipe(struct inode *inode, struct file *filp)
 {
 	struct trace_array *tr = inode->i_private;
 	struct trace_iterator *iter;
+	int cpu;
 	int ret;
 
 	ret = tracing_check_open_get_tr(tr);
@@ -6716,13 +6742,16 @@  static int tracing_open_pipe(struct inode *inode, struct file *filp)
 		return ret;
 
 	mutex_lock(&trace_types_lock);
+	cpu = tracing_get_cpu(inode);
+	ret = open_pipe_on_cpu(tr, cpu);
+	if (ret)
+		goto fail_pipe_on_cpu;
 
 	/* create a buffer to store the information to pass to userspace */
 	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
 	if (!iter) {
 		ret = -ENOMEM;
-		__trace_array_put(tr);
-		goto out;
+		goto fail_alloc_iter;
 	}
 
 	trace_seq_init(&iter->seq);
@@ -6745,7 +6774,7 @@  static int tracing_open_pipe(struct inode *inode, struct file *filp)
 
 	iter->tr = tr;
 	iter->array_buffer = &tr->array_buffer;
-	iter->cpu_file = tracing_get_cpu(inode);
+	iter->cpu_file = cpu;
 	mutex_init(&iter->mutex);
 	filp->private_data = iter;
 
@@ -6755,12 +6784,15 @@  static int tracing_open_pipe(struct inode *inode, struct file *filp)
 	nonseekable_open(inode, filp);
 
 	tr->trace_ref++;
-out:
+
 	mutex_unlock(&trace_types_lock);
 	return ret;
 
 fail:
 	kfree(iter);
+fail_alloc_iter:
+	close_pipe_on_cpu(tr, cpu);
+fail_pipe_on_cpu:
 	__trace_array_put(tr);
 	mutex_unlock(&trace_types_lock);
 	return ret;
@@ -6777,7 +6809,7 @@  static int tracing_release_pipe(struct inode *inode, struct file *file)
 
 	if (iter->trace->pipe_close)
 		iter->trace->pipe_close(iter);
-
+	close_pipe_on_cpu(tr, iter->cpu_file);
 	mutex_unlock(&trace_types_lock);
 
 	free_cpumask_var(iter->started);
@@ -9441,6 +9473,9 @@  static struct trace_array *trace_array_create(const char *name)
 	if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
 		goto out_free_tr;
 
+	if (!alloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
+		goto out_free_tr;
+
 	tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
 
 	cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -9482,6 +9517,7 @@  static struct trace_array *trace_array_create(const char *name)
  out_free_tr:
 	ftrace_free_ftrace_ops(tr);
 	free_trace_buffers(tr);
+	free_cpumask_var(tr->pipe_cpumask);
 	free_cpumask_var(tr->tracing_cpumask);
 	kfree(tr->name);
 	kfree(tr);
@@ -9584,6 +9620,7 @@  static int __remove_instance(struct trace_array *tr)
 	}
 	kfree(tr->topts);
 
+	free_cpumask_var(tr->pipe_cpumask);
 	free_cpumask_var(tr->tracing_cpumask);
 	kfree(tr->name);
 	kfree(tr);
@@ -10381,12 +10418,14 @@  __init static int tracer_alloc_buffers(void)
 	if (trace_create_savedcmd() < 0)
 		goto out_free_temp_buffer;
 
+	if (!alloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL))
+		goto out_free_savedcmd;
+
 	/* TODO: make the number of buffers hot pluggable with CPUS */
 	if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
 		MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n");
-		goto out_free_savedcmd;
+		goto out_free_pipe_cpumask;
 	}
-
 	if (global_trace.buffer_disabled)
 		tracing_off();
 
@@ -10439,6 +10478,8 @@  __init static int tracer_alloc_buffers(void)
 
 	return 0;
 
+out_free_pipe_cpumask:
+	free_cpumask_var(global_trace.pipe_cpumask);
 out_free_savedcmd:
 	free_saved_cmdlines_buffer(savedcmd);
 out_free_temp_buffer:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e1edc2197fc8..53ac0f7780c2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -377,6 +377,8 @@  struct trace_array {
 	struct list_head	events;
 	struct trace_event_file *trace_marker_file;
 	cpumask_var_t		tracing_cpumask; /* only trace on set CPUs */
+	/* one per_cpu trace_pipe can be opened by only one user */
+	cpumask_var_t		pipe_cpumask;
 	int			ref;
 	int			trace_ref;
 #ifdef CONFIG_FUNCTION_TRACER