diff mbox series

trace_seq: Increase the buffer size to almost two pages

Message ID 20231209175220.19867af4@gandalf.local.home (mailing list archive)
State Accepted
Commit 40fc60e36c60ba85b2974e507b67df40c94e9578
Headers show
Series trace_seq: Increase the buffer size to almost two pages | expand

Commit Message

Steven Rostedt Dec. 9, 2023, 10:52 p.m. UTC
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>

Now that trace_marker can hold more than 1KB string, and can write as much
as the ring buffer can hold, the trace_seq is not big enough to hold
writes:

 ~# a="1234567890"
 ~# cnt=4080
 ~# s=""
 ~# while [ $cnt -gt 10 ]; do
 ~#	s="${s}${a}"
 ~#	cnt=$((cnt-10))
 ~# done
 ~# echo $s > trace_marker
 ~# cat trace
 # tracer: nop
 #
 # entries-in-buffer/entries-written: 2/2   #P:8
 #
 #                                _-----=> irqs-off/BH-disabled
 #                               / _----=> need-resched
 #                              | / _---=> hardirq/softirq
 #                              || / _--=> preempt-depth
 #                              ||| / _-=> migrate-disable
 #                              |||| /     delay
 #           TASK-PID     CPU#  |||||  TIMESTAMP  FUNCTION
 #              | |         |   |||||     |         |
            <...>-860     [002] .....   105.543465: tracing_mark_write[LINE TOO BIG]
            <...>-860     [002] .....   105.543496: tracing_mark_write: 789012345678901234567890

By increasing the trace_seq buffer to almost two pages, it can now print
out the first line.

This also subtracts the rest of the trace_seq fields from the buffer, so
that the entire trace_seq is now PAGE_SIZE aligned.

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/trace_seq.h | 9 ++++++---
 kernel/trace/trace.c      | 6 +++---
 kernel/trace/trace_seq.c  | 3 ---
 3 files changed, 9 insertions(+), 9 deletions(-)

Comments

Masami Hiramatsu (Google) Dec. 11, 2023, 12:46 p.m. UTC | #1
On Sat, 9 Dec 2023 17:52:20 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
> 
> Now that trace_marker can hold more than 1KB string, and can write as much
> as the ring buffer can hold, the trace_seq is not big enough to hold
> writes:
> 
>  ~# a="1234567890"
>  ~# cnt=4080
>  ~# s=""
>  ~# while [ $cnt -gt 10 ]; do
>  ~#	s="${s}${a}"
>  ~#	cnt=$((cnt-10))
>  ~# done
>  ~# echo $s > trace_marker
>  ~# cat trace
>  # tracer: nop
>  #
>  # entries-in-buffer/entries-written: 2/2   #P:8
>  #
>  #                                _-----=> irqs-off/BH-disabled
>  #                               / _----=> need-resched
>  #                              | / _---=> hardirq/softirq
>  #                              || / _--=> preempt-depth
>  #                              ||| / _-=> migrate-disable
>  #                              |||| /     delay
>  #           TASK-PID     CPU#  |||||  TIMESTAMP  FUNCTION
>  #              | |         |   |||||     |         |
>             <...>-860     [002] .....   105.543465: tracing_mark_write[LINE TOO BIG]
>             <...>-860     [002] .....   105.543496: tracing_mark_write: 789012345678901234567890
> 
> By increasing the trace_seq buffer to almost two pages, it can now print
> out the first line.
> 
> This also subtracts the rest of the trace_seq fields from the buffer, so
> that the entire trace_seq is now PAGE_SIZE aligned.

Ok, but I just a bit concern about the memory consumption.
Since this is very specific case, can we make it configurable later?

Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Thanks,

> 
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> ---
>  include/linux/trace_seq.h | 9 ++++++---
>  kernel/trace/trace.c      | 6 +++---
>  kernel/trace/trace_seq.c  | 3 ---
>  3 files changed, 9 insertions(+), 9 deletions(-)
> 
> diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h
> index 3691e0e76a1a..9ec229dfddaa 100644
> --- a/include/linux/trace_seq.h
> +++ b/include/linux/trace_seq.h
> @@ -8,11 +8,14 @@
>  
>  /*
>   * Trace sequences are used to allow a function to call several other functions
> - * to create a string of data to use (up to a max of PAGE_SIZE).
> + * to create a string of data to use.
>   */
>  
> +#define TRACE_SEQ_BUFFER_SIZE	(PAGE_SIZE * 2 - \
> +	(sizeof(struct seq_buf) + sizeof(size_t) + sizeof(int)))
> +
>  struct trace_seq {
> -	char			buffer[PAGE_SIZE];
> +	char			buffer[TRACE_SEQ_BUFFER_SIZE];
>  	struct seq_buf		seq;
>  	size_t			readpos;
>  	int			full;
> @@ -21,7 +24,7 @@ struct trace_seq {
>  static inline void
>  trace_seq_init(struct trace_seq *s)
>  {
> -	seq_buf_init(&s->seq, s->buffer, PAGE_SIZE);
> +	seq_buf_init(&s->seq, s->buffer, TRACE_SEQ_BUFFER_SIZE);
>  	s->full = 0;
>  	s->readpos = 0;
>  }
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index 83393c65ec71..da837119a446 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -3753,7 +3753,7 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str,
>  
>  	/* OK if part of the temp seq buffer */
>  	if ((addr >= (unsigned long)iter->tmp_seq.buffer) &&
> -	    (addr < (unsigned long)iter->tmp_seq.buffer + PAGE_SIZE))
> +	    (addr < (unsigned long)iter->tmp_seq.buffer + TRACE_SEQ_BUFFER_SIZE))
>  		return true;
>  
>  	/* Core rodata can not be freed */
> @@ -6926,8 +6926,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
>  		goto out;
>  	}
>  
> -	if (cnt >= PAGE_SIZE)
> -		cnt = PAGE_SIZE - 1;
> +	if (cnt >= TRACE_SEQ_BUFFER_SIZE)
> +		cnt = TRACE_SEQ_BUFFER_SIZE - 1;
>  
>  	/* reset all but tr, trace, and overruns */
>  	trace_iterator_reset(iter);
> diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
> index 7be97229ddf8..c158d65a8a88 100644
> --- a/kernel/trace/trace_seq.c
> +++ b/kernel/trace/trace_seq.c
> @@ -13,9 +13,6 @@
>   * trace_seq_init() more than once to reset the trace_seq to start
>   * from scratch.
>   * 
> - * The buffer size is currently PAGE_SIZE, although it may become dynamic
> - * in the future.
> - *
>   * A write to the buffer will either succeed or fail. That is, unlike
>   * sprintf() there will not be a partial write (well it may write into
>   * the buffer but it wont update the pointers). This allows users to
> -- 
> 2.42.0
>
Steven Rostedt Dec. 11, 2023, 6:28 p.m. UTC | #2
On Mon, 11 Dec 2023 21:46:27 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> > 
> > By increasing the trace_seq buffer to almost two pages, it can now print
> > out the first line.
> > 
> > This also subtracts the rest of the trace_seq fields from the buffer, so
> > that the entire trace_seq is now PAGE_SIZE aligned.  
> 
> Ok, but I just a bit concern about the memory consumption.
> Since this is very specific case, can we make it configurable later?

I was concerned about this too, but it looks like it's allocated and later
freed in every location except for a couple of instances.

One is "tracepoint_print_iter" which is used to pipe tracepoints to printk.
I think we can possibly make that allocated too.

The other is in ftrace_dump, which I don't think we can easily allocate
that. Although, we could have it allocated at boot up if
ftrace_dump_on_oops() is enabled.

Another KTODO?

> 
> Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> 

Thanks!

-- Steve
Masami Hiramatsu (Google) Dec. 11, 2023, 11:07 p.m. UTC | #3
On Mon, 11 Dec 2023 13:28:37 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Mon, 11 Dec 2023 21:46:27 +0900
> Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> 
> > > 
> > > By increasing the trace_seq buffer to almost two pages, it can now print
> > > out the first line.
> > > 
> > > This also subtracts the rest of the trace_seq fields from the buffer, so
> > > that the entire trace_seq is now PAGE_SIZE aligned.  
> > 
> > Ok, but I just a bit concern about the memory consumption.
> > Since this is very specific case, can we make it configurable later?
> 
> I was concerned about this too, but it looks like it's allocated and later
> freed in every location except for a couple of instances.
> 
> One is "tracepoint_print_iter" which is used to pipe tracepoints to printk.
> I think we can possibly make that allocated too.
> 
> The other is in ftrace_dump, which I don't think we can easily allocate
> that. Although, we could have it allocated at boot up if
> ftrace_dump_on_oops() is enabled.

Can we reallocate it when we detect such bigger event entry in the path
of trace_marker write? If any issue happens in the reallocation, we will
not finish (commit) such big event in dumping buffer anyway.

> 
> Another KTODO?

Yes, I think so.

Thanks,

> 
> > 
> > Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> > 
> 
> Thanks!
> 
> -- Steve
diff mbox series

Patch

diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h
index 3691e0e76a1a..9ec229dfddaa 100644
--- a/include/linux/trace_seq.h
+++ b/include/linux/trace_seq.h
@@ -8,11 +8,14 @@ 
 
 /*
  * Trace sequences are used to allow a function to call several other functions
- * to create a string of data to use (up to a max of PAGE_SIZE).
+ * to create a string of data to use.
  */
 
+#define TRACE_SEQ_BUFFER_SIZE	(PAGE_SIZE * 2 - \
+	(sizeof(struct seq_buf) + sizeof(size_t) + sizeof(int)))
+
 struct trace_seq {
-	char			buffer[PAGE_SIZE];
+	char			buffer[TRACE_SEQ_BUFFER_SIZE];
 	struct seq_buf		seq;
 	size_t			readpos;
 	int			full;
@@ -21,7 +24,7 @@  struct trace_seq {
 static inline void
 trace_seq_init(struct trace_seq *s)
 {
-	seq_buf_init(&s->seq, s->buffer, PAGE_SIZE);
+	seq_buf_init(&s->seq, s->buffer, TRACE_SEQ_BUFFER_SIZE);
 	s->full = 0;
 	s->readpos = 0;
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 83393c65ec71..da837119a446 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3753,7 +3753,7 @@  static bool trace_safe_str(struct trace_iterator *iter, const char *str,
 
 	/* OK if part of the temp seq buffer */
 	if ((addr >= (unsigned long)iter->tmp_seq.buffer) &&
-	    (addr < (unsigned long)iter->tmp_seq.buffer + PAGE_SIZE))
+	    (addr < (unsigned long)iter->tmp_seq.buffer + TRACE_SEQ_BUFFER_SIZE))
 		return true;
 
 	/* Core rodata can not be freed */
@@ -6926,8 +6926,8 @@  tracing_read_pipe(struct file *filp, char __user *ubuf,
 		goto out;
 	}
 
-	if (cnt >= PAGE_SIZE)
-		cnt = PAGE_SIZE - 1;
+	if (cnt >= TRACE_SEQ_BUFFER_SIZE)
+		cnt = TRACE_SEQ_BUFFER_SIZE - 1;
 
 	/* reset all but tr, trace, and overruns */
 	trace_iterator_reset(iter);
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index 7be97229ddf8..c158d65a8a88 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -13,9 +13,6 @@ 
  * trace_seq_init() more than once to reset the trace_seq to start
  * from scratch.
  * 
- * The buffer size is currently PAGE_SIZE, although it may become dynamic
- * in the future.
- *
  * A write to the buffer will either succeed or fail. That is, unlike
  * sprintf() there will not be a partial write (well it may write into
  * the buffer but it wont update the pointers). This allows users to