diff mbox series

[014/173] mm, tracing: record slab name for kmem_cache_free()

Message ID 20210224200055.U7Xz47kX5%akpm@linux-foundation.org (mailing list archive)
State New, archived
Headers show
Series [001/173] hexagon: remove CONFIG_EXPERIMENTAL from defconfigs | expand

Commit Message

Andrew Morton Feb. 24, 2021, 8 p.m. UTC
From: Jacob Wen <jian.w.wen@oracle.com>
Subject: mm, tracing: record slab name for kmem_cache_free()

Currently, a trace record generated by the RCU core is as below.

... kmem_cache_free: call_site=rcu_core+0x1fd/0x610 ptr=00000000f3b49a66

It doesn't tell us what the RCU core has freed.

This patch adds the slab name to trace_kmem_cache_free().
The new format is as follows.

... kmem_cache_free: call_site=rcu_core+0x1fd/0x610 ptr=0000000037f79c8d name=dentry
... kmem_cache_free: call_site=rcu_core+0x1fd/0x610 ptr=00000000f78cb7b5 name=sock_inode_cache
... kmem_cache_free: call_site=rcu_core+0x1fd/0x610 ptr=0000000018768985 name=pool_workqueue
... kmem_cache_free: call_site=rcu_core+0x1fd/0x610 ptr=000000006a6cb484 name=radix_tree_node

We can use it to understand what the RCU core is going to free. For
example, some users maybe interested in when the RCU core starts
freeing reclaimable slabs like dentry to reduce memory pressure.

Link: https://lkml.kernel.org/r/20201216072804.8838-1-jian.w.wen@oracle.com
Signed-off-by: Jacob Wen <jian.w.wen@oracle.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 include/trace/events/kmem.h |   24 ++++++++++++++++--------
 mm/slab.c                   |    2 +-
 mm/slob.c                   |    2 +-
 mm/slub.c                   |    2 +-
 4 files changed, 19 insertions(+), 11 deletions(-)

Comments

Steven Rostedt Feb. 25, 2021, 1:37 a.m. UTC | #1
On Wed, 24 Feb 2021 12:00:55 -0800
Andrew Morton <akpm@linux-foundation.org> wrote:

> From: Jacob Wen <jian.w.wen@oracle.com>
> Subject: mm, tracing: record slab name for kmem_cache_free()
> 
> Currently, a trace record generated by the RCU core is as below.
> 
> ... kmem_cache_free: call_site=rcu_core+0x1fd/0x610 ptr=00000000f3b49a66
> 
> It doesn't tell us what the RCU core has freed.
> 
> This patch adds the slab name to trace_kmem_cache_free().
> The new format is as follows.
> 
> ... kmem_cache_free: call_site=rcu_core+0x1fd/0x610 ptr=0000000037f79c8d name=dentry
> ... kmem_cache_free: call_site=rcu_core+0x1fd/0x610 ptr=00000000f78cb7b5 name=sock_inode_cache
> ... kmem_cache_free: call_site=rcu_core+0x1fd/0x610 ptr=0000000018768985 name=pool_workqueue
> ... kmem_cache_free: call_site=rcu_core+0x1fd/0x610 ptr=000000006a6cb484 name=radix_tree_node
> 
> We can use it to understand what the RCU core is going to free. For
> example, some users maybe interested in when the RCU core starts
> freeing reclaimable slabs like dentry to reduce memory pressure.
> 
> Link: https://lkml.kernel.org/r/20201216072804.8838-1-jian.w.wen@oracle.com
> Signed-off-by: Jacob Wen <jian.w.wen@oracle.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Pekka Enberg <penberg@kernel.org>
> Cc: David Rientjes <rientjes@google.com>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> ---
> 
>  include/trace/events/kmem.h |   24 ++++++++++++++++--------
>  mm/slab.c                   |    2 +-
>  mm/slob.c                   |    2 +-
>  mm/slub.c                   |    2 +-
>  4 files changed, 19 insertions(+), 11 deletions(-)
> 
> --- a/include/trace/events/kmem.h~mm-tracing-record-slab-name-for-kmem_cache_free
> +++ a/include/trace/events/kmem.h
> @@ -115,7 +115,7 @@ DEFINE_EVENT(kmem_alloc_node, kmem_cache
>  	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)
>  );
>  
> -DECLARE_EVENT_CLASS(kmem_free,
> +TRACE_EVENT(kfree,
>  
>  	TP_PROTO(unsigned long call_site, const void *ptr),
>  
> @@ -135,18 +135,26 @@ DECLARE_EVENT_CLASS(kmem_free,
>  		  (void *)__entry->call_site, __entry->ptr)
>  );
>  
> -DEFINE_EVENT(kmem_free, kfree,
> +TRACE_EVENT(kmem_cache_free,
>  
> -	TP_PROTO(unsigned long call_site, const void *ptr),
> +	TP_PROTO(unsigned long call_site, const void *ptr, const char *name),
>  
> -	TP_ARGS(call_site, ptr)
> -);
> +	TP_ARGS(call_site, ptr, name),
>  
> -DEFINE_EVENT(kmem_free, kmem_cache_free,
> +	TP_STRUCT__entry(
> +		__field(	unsigned long,	call_site	)
> +		__field(	const void *,	ptr		)
> +		__field(	const char *,	name		)

You must use __string() here!

		__string(name, name);

> +	),
>  
> -	TP_PROTO(unsigned long call_site, const void *ptr),
> +	TP_fast_assign(
> +		__entry->call_site	= call_site;
> +		__entry->ptr		= ptr;
> +		__entry->name		= name;

This must be:

		__assign_str(name, name);


> +	),
>  
> -	TP_ARGS(call_site, ptr)
> +	TP_printk("call_site=%pS ptr=%p name=%s",
> +		  (void *)__entry->call_site, __entry->ptr, __entry->name)

	You must use __get_str(name) here!

(see other users of this logic in include/trace/events/*.h)

What is happening is that TP_fast_assign() is called by the tracepoint
logic (at the time of the event), then much later (seconds, minutes,
hours, days!), when the user does a "cat trace" of the file, the
__entry->name is read and the printf logic is called against it. Well,
the s->name that __entry->name points to, can be long gone by then!

Instead, using __string() tells the TRACE_EVENT() macro that this is a
dynamic string. The __assign_str() records the string into the ring
buffer. The __get_str() retrieves the string from the ring buffer as
part of the event, so it stays around as long as the event being read
by the trace file is around.

Please do not apply this patch as is, it is very buggy!

-- Steve


>  );
>  
>  TRACE_EVENT(mm_page_free,
> --- a/mm/slab.c~mm-tracing-record-slab-name-for-kmem_cache_free
> +++ a/mm/slab.c
> @@ -3717,7 +3717,7 @@ void kmem_cache_free(struct kmem_cache *
>  	__cache_free(cachep, objp, _RET_IP_);
>  	local_irq_restore(flags);
>  
> -	trace_kmem_cache_free(_RET_IP_, objp);
> +	trace_kmem_cache_free(_RET_IP_, objp, cachep->name);
>  }
>  EXPORT_SYMBOL(kmem_cache_free);
>  
> --- a/mm/slob.c~mm-tracing-record-slab-name-for-kmem_cache_free
> +++ a/mm/slob.c
> @@ -673,7 +673,7 @@ void kmem_cache_free(struct kmem_cache *
>  		__kmem_cache_free(b, c->size);
>  	}
>  
> -	trace_kmem_cache_free(_RET_IP_, b);
> +	trace_kmem_cache_free(_RET_IP_, b, c->name);
>  }
>  EXPORT_SYMBOL(kmem_cache_free);
>  
> --- a/mm/slub.c~mm-tracing-record-slab-name-for-kmem_cache_free
> +++ a/mm/slub.c
> @@ -3157,7 +3157,7 @@ void kmem_cache_free(struct kmem_cache *
>  	if (!s)
>  		return;
>  	slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_);
> -	trace_kmem_cache_free(_RET_IP_, x);
> +	trace_kmem_cache_free(_RET_IP_, x, s->name);
>  }
>  EXPORT_SYMBOL(kmem_cache_free);
>  
> _
Linus Torvalds Feb. 25, 2021, 1:51 a.m. UTC | #2
On Wed, Feb 24, 2021 at 5:37 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> Please do not apply this patch as is, it is very buggy!

Sadly, it's already there and pushed out.  I try to wait a few hours
(sometimes overnight) with Andrew's series before pushing things out
in order to catch comments like this, but this one came too late.

I can obviously revert, but if the fix is the two-liner you outlined,
perhaps that is better? Mind sending it as a patch (or just putting it
in your tracing tree)? Or Jacob? A tested patch along the lines that
Steven outlined?

               Linus
Linus Torvalds Feb. 25, 2021, 1:52 a.m. UTC | #3
On Wed, Feb 24, 2021 at 5:51 PM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> I can obviously revert, but if the fix is the two-liner you outlined,

Duh. Three lines, not two...

          Linus
Steven Rostedt Feb. 25, 2021, 2:03 a.m. UTC | #4
On Wed, 24 Feb 2021 17:51:46 -0800
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Wed, Feb 24, 2021 at 5:37 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > Please do not apply this patch as is, it is very buggy!  
> 
> Sadly, it's already there and pushed out.  I try to wait a few hours
> (sometimes overnight) with Andrew's series before pushing things out
> in order to catch comments like this, but this one came too late.

I try to look at patches like this for these very mistakes. I don't
seem to be Cc'd on the original patch, and this looks to be the first
time I've seen this patch. I should update my lkml filter to send me
any patches that touch include/trace/ code or has TRACE_EVENT in it,
and forward it to my inbox if I'm not already on the Cc.

> 
> I can obviously revert, but if the fix is the two-liner you outlined,
> perhaps that is better? Mind sending it as a patch (or just putting it
> in your tracing tree)? Or Jacob? A tested patch along the lines that
> Steven outlined?

I'm done for the night, but if nobody does anything by tomorrow, I can
write up a quick fix in a few minutes. I prefer the original author to
do it, just so they get into the habit of understanding strings in
events.

But even though it's very buggy, it wont break anything unless people
really try to. The breakage would require enabling the event, then
doing something that would free a slab and its name, then reading the
trace file that has the events. In other words, the risk is low, but
something you don't want on a production system. I don't see this
breaking current mainline testing, so its not urgent to get fixed
immediately.

-- Steve
Steven Rostedt Feb. 25, 2021, 2:07 a.m. UTC | #5
On Wed, 24 Feb 2021 20:37:08 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> > +	TP_printk("call_site=%pS ptr=%p name=%s",
> > +		  (void *)__entry->call_site, __entry->ptr, __entry->name)  
> 
> 	You must use __get_str(name) here!
> 
> (see other users of this logic in include/trace/events/*.h)
> 
> What is happening is that TP_fast_assign() is called by the tracepoint
> logic (at the time of the event), then much later (seconds, minutes,
> hours, days!), when the user does a "cat trace" of the file, the
> __entry->name is read and the printf logic is called against it. Well,
> the s->name that __entry->name points to, can be long gone by then!
> 
> Instead, using __string() tells the TRACE_EVENT() macro that this is a
> dynamic string. The __assign_str() records the string into the ring
> buffer. The __get_str() retrieves the string from the ring buffer as
> part of the event, so it stays around as long as the event being read
> by the trace file is around.
> 
> Please do not apply this patch as is, it is very buggy!

I wonder if we can add something to checkpatch that can check if
TP_printk() has a call to "%s" where it references a __entry->xxx and
not a __get_str(), and will warn about it.

There a a few cases where its OK. Like RCU uses a TPS() macro around
strings it passes into the tracepoint, which is used for strings that
never are freed, and maps the string pointer to the string for user
space. But RCU is the only user of that I believe.

-- Steve
Jacob Wen Feb. 25, 2021, 7:06 a.m. UTC | #6
On 2/25/21 10:03 AM, Steven Rostedt wrote:
> On Wed, 24 Feb 2021 17:51:46 -0800
> Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
>> On Wed, Feb 24, 2021 at 5:37 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>>> Please do not apply this patch as is, it is very buggy!
>> Sadly, it's already there and pushed out.  I try to wait a few hours
>> (sometimes overnight) with Andrew's series before pushing things out
>> in order to catch comments like this, but this one came too late.
> I try to look at patches like this for these very mistakes. I don't
> seem to be Cc'd on the original patch, and this looks to be the first
> time I've seen this patch. I should update my lkml filter to send me
> any patches that touch include/trace/ code or has TRACE_EVENT in it,
> and forward it to my inbox if I'm not already on the Cc.
>
>> I can obviously revert, but if the fix is the two-liner you outlined,
>> perhaps that is better? Mind sending it as a patch (or just putting it
>> in your tracing tree)? Or Jacob? A tested patch along the lines that
>> Steven outlined?
> I'm done for the night, but if nobody does anything by tomorrow, I can
> write up a quick fix in a few minutes. I prefer the original author to
> do it, just so they get into the habit of understanding strings in
> events.

Learning how strings in events works is great. Thanks for the detailed 
explanation.

I don't have time to fix and test it today.

>
> But even though it's very buggy, it wont break anything unless people
> really try to. The breakage would require enabling the event, then
> doing something that would free a slab and its name, then reading the
> trace file that has the events. In other words, the risk is low, but
> something you don't want on a production system. I don't see this
> breaking current mainline testing, so its not urgent to get fixed
> immediately.
>
> -- Steve
Jacob Wen Feb. 25, 2021, 7:07 a.m. UTC | #7
On 2/25/21 10:07 AM, Steven Rostedt wrote:
> On Wed, 24 Feb 2021 20:37:08 -0500
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
>>> +	TP_printk("call_site=%pS ptr=%p name=%s",
>>> +		  (void *)__entry->call_site, __entry->ptr, __entry->name)
>> 	You must use __get_str(name) here!
>>
>> (see other users of this logic in include/trace/events/*.h)
>>
>> What is happening is that TP_fast_assign() is called by the tracepoint
>> logic (at the time of the event), then much later (seconds, minutes,
>> hours, days!), when the user does a "cat trace" of the file, the
>> __entry->name is read and the printf logic is called against it. Well,
>> the s->name that __entry->name points to, can be long gone by then!
>>
>> Instead, using __string() tells the TRACE_EVENT() macro that this is a
>> dynamic string. The __assign_str() records the string into the ring
>> buffer. The __get_str() retrieves the string from the ring buffer as
>> part of the event, so it stays around as long as the event being read
>> by the trace file is around.
>>
>> Please do not apply this patch as is, it is very buggy!
> I wonder if we can add something to checkpatch that can check if
> TP_printk() has a call to "%s" where it references a __entry->xxx and
> not a __get_str(), and will warn about it.
That's helpful for me who don't know "%s" of TP_printk is special.
>
> There a a few cases where its OK. Like RCU uses a TPS() macro around
> strings it passes into the tracepoint, which is used for strings that
> never are freed, and maps the string pointer to the string for user
> space. But RCU is the only user of that I believe.
>
> -- Steve
Steven Rostedt Feb. 25, 2021, 2:25 p.m. UTC | #8
On Thu, 25 Feb 2021 15:06:08 +0800
Jacob Wen <jian.w.wen@oracle.com> wrote:

> I don't have time to fix and test it today.

OK, I'll write something up today.

Thanks,

-- Steve
Steven Rostedt Feb. 25, 2021, 2:31 p.m. UTC | #9
On Thu, 25 Feb 2021 15:07:50 +0800
Jacob Wen <jian.w.wen@oracle.com> wrote:

> > I wonder if we can add something to checkpatch that can check if
> > TP_printk() has a call to "%s" where it references a __entry->xxx and
> > not a __get_str(), and will warn about it.  
> That's helpful for me who don't know "%s" of TP_printk is special.

Here's nothing special about %s in TP_printk. It uses the same code as
printk() and what other string formatters use.

What is special is that the print is on data that is stored from a previous
time.

TP_fast_assign() / TP_printk() is basically this:

struct entry {
	char *name;
} entry;


TP_fast_assign() {
	entry.name = slab->name;
}

TP_printk() {
	printk("%s", entry.name);
}


Where TP_printk() can be called some time in the future when a user asks
for it. If the slab->name is freed, then the entry.name will be pointing to
stale data, and you don't want to call printk() on that!

Thus, the "%s" in TP_printk() is nothing special, it's the fact that the
data it reads is called much later in time from when that data was
recorded. Which means, you can not rely on any dereferencing of pointers.

The __string() __assign_str() and __get_str() macros are helpers to easily
store strings in the ring buffer, as that is a common practice in the
trace events.

-- Steve


> >
> > There a a few cases where its OK. Like RCU uses a TPS() macro around
> > strings it passes into the tracepoint, which is used for strings that
> > never are freed, and maps the string pointer to the string for user
> > space. But RCU is the only user of that I believe.
> >
Linus Torvalds Feb. 25, 2021, 5:49 p.m. UTC | #10
On Thu, Feb 25, 2021 at 6:31 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> Here's nothing special about %s in TP_printk. It uses the same code as
> printk() and what other string formatters use.
>
> What is special is that the print is on data that is stored from a previous
> time.

Well, technically that's not actually limited to just %s, although
that's the common case.

All our magic "%p*" formats do the same thing: they print out not the
argument _value_, but something that the pointer value points to.

So "%pD" takes a "struct file *" pointer, and follows it to the
dentry, and then from the dentry to the name. So it will in fact
follow pointers even more than "%s" does.

It might indeed be worth having a warning for TP_printk() about any of
the formats that follow a pointer, exactly because of the whole "by
the time it actually prints, the pointer may be long gone".

             Linus
Steven Rostedt Feb. 25, 2021, 5:57 p.m. UTC | #11
On Thu, 25 Feb 2021 09:49:15 -0800
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Thu, Feb 25, 2021 at 6:31 AM Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > Here's nothing special about %s in TP_printk. It uses the same code as
> > printk() and what other string formatters use.
> >
> > What is special is that the print is on data that is stored from a previous
> > time.  
> 
> Well, technically that's not actually limited to just %s, although
> that's the common case.
> 
> All our magic "%p*" formats do the same thing: they print out not the
> argument _value_, but something that the pointer value points to.
> 
> So "%pD" takes a "struct file *" pointer, and follows it to the
> dentry, and then from the dentry to the name. So it will in fact
> follow pointers even more than "%s" does.

Correct, as I've told people about that as well.

> 
> It might indeed be worth having a warning for TP_printk() about any of
> the formats that follow a pointer, exactly because of the whole "by
> the time it actually prints, the pointer may be long gone".
>

Just a comment? Or should we add some check that gives a warning for when
one of these are used? That can be done at boot up or module load. (note, %s
can be OK for some cases, as mentioned in a previous email).

-- Steve
Steven Rostedt Feb. 25, 2021, 9:48 p.m. UTC | #12
On Thu, 25 Feb 2021 12:57:41 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> > So "%pD" takes a "struct file *" pointer, and follows it to the
> > dentry, and then from the dentry to the name. So it will in fact
> > follow pointers even more than "%s" does.  
> 
> Correct, as I've told people about that as well.
> 
> > 
> > It might indeed be worth having a warning for TP_printk() about any of
> > the formats that follow a pointer, exactly because of the whole "by
> > the time it actually prints, the pointer may be long gone".
> >  
> 
> Just a comment? Or should we add some check that gives a warning for when
> one of these are used? That can be done at boot up or module load. (note, %s
> can be OK for some cases, as mentioned in a previous email).

My fix for the patch in this thread is currently going through my test
suite.

But I just made this patch (not applied to any tree yet) that checks the
print format of every event when they are registered, and if it contains a
dereference pointer that does not point to the code in the ring buffer
(either via an address '&' or the field being an array), then it will give
a big warning.

So far it hasn't triggered on any of the events that I have compiled in,
although it did trigger when I didn't parse correctly.

Is this something that I should add? (with better comments and such)

Because strings may be allowed if the trace point always passes in
something that is not freed, I would need to add a post processing check
(before the string is printed out) to make sure that no string is
dereferenced that doesn't point to kernel read only memory, and refuse to
print it if it does (and trigger a warning as well). That would have caught
the bug in this patch.

-- Steve

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index a3563afd412d..fc691f054fb6 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -217,6 +217,172 @@ int trace_event_get_offsets(struct trace_event_call *call)
 	return tail->offset + tail->size;
 }
 
+static bool test_field(const char *fmt, struct trace_event_call *call)
+{
+	struct trace_event_fields *field = call->class->fields_array;
+	const char *array_descriptor;
+	const char *p = fmt;
+	int len;
+
+	if (!(len = str_has_prefix(fmt, "REC->")))
+		return false;
+	fmt += len;
+	for (p = fmt; *p; p++) {
+		if (!isalnum(*p) && *p != '_')
+			break;
+	}
+	len = p - fmt;
+
+	for (; field->type; field++) {
+		if (strncmp(field->name, fmt, len) ||
+		    field->name[len])
+			continue;
+		array_descriptor = strchr(field->type, '[');
+		if (str_has_prefix(field->type, "__data_loc"))
+			array_descriptor = NULL;
+		/* This is an array and is OK to dereference. */
+		return array_descriptor != NULL;
+	}
+	return false;
+}
+
+/* For type cast only, does not handle quotes */
+static int skip_parens(const char *fmt)
+{
+	int parens = 0;
+	int i;
+
+	for (i = 0; fmt[i]; i++) {
+		switch (fmt[i]) {
+		case '(':
+			parens++;
+			break;
+		case ')':
+			if (!--parens)
+				return i + 1;
+		}
+	}
+	return i;
+}
+
+static void test_event_printk(struct trace_event_call *call)
+{
+	u64 dereference_flags = 0;
+	bool first = true;
+	const char *fmt;
+	int parens = 0;
+	char in_quote = 0;
+	int start_arg = 0;
+	int arg = 0;
+	int i;
+
+	fmt = call->print_fmt;
+
+	if (!fmt)
+		return;
+
+	for (i = 0; fmt[i]; i++) {
+		switch (fmt[i]) {
+		case '\\':
+			i++;
+			if (!fmt[i])
+				return;
+			continue;
+		case '"':
+		case '\'':
+			if (first) {
+				if (fmt[i] == '\'')
+					continue;
+				if (in_quote) {
+					arg = 0;
+					first = false;
+				}
+			}
+			if (in_quote) {
+				if (in_quote == fmt[i])
+					in_quote = 0;
+			} else {
+				in_quote = fmt[i];
+			}
+			continue;
+		case '%':
+			if (!first || !in_quote)
+				continue;
+			i++;
+			if (!fmt[i])
+				return;
+			switch (fmt[i]) {
+			case '%':
+				continue;
+			case 'p':
+				/* Find dereferencing fields */
+				switch (fmt[i + 1]) {
+				case 'B': case 'R': case 'r':
+				case 'b': case 'M': case 'm':
+				case 'I': case 'i': case 'E':
+				case 'U': case 'V': case 'N':
+				case 'a': case 'd': case 'D':
+				case 'g': case 't': case 'C':
+				case 'O': case 'f':
+					if (WARN_ONCE(arg == 63,
+						      "Event: %s",
+						      trace_event_name(call)))
+						return;
+					dereference_flags |= 1ULL << arg;
+				}
+				break;
+			}
+			arg++;
+			continue;
+		case '(':
+			if (in_quote)
+				continue;
+			parens++;
+			continue;
+		case ')':
+			if (in_quote)
+				continue;
+			parens--;
+			if (WARN_ONCE(parens < 0, "Event: %s\narg='%s'\n%*s",
+				      trace_event_name(call),
+				      fmt + start_arg,
+				      (i - start_arg) + 5, "^"))
+				return;
+			continue;
+		case ',':
+			if (in_quote || parens)
+				continue;
+			i++;
+			while (isspace(fmt[i]))
+				i++;
+			if (fmt[i] == '(')
+				i += skip_parens(fmt + i);
+			start_arg = i;
+			/* dereferenced pointers are fine here */
+			if (fmt[i] == '&')
+				dereference_flags &= ~(1ULL << arg);
+
+			if (dereference_flags & (1ULL << arg)) {
+				if (test_field(fmt + i, call))
+					dereference_flags &= ~(1ULL << arg);
+			}
+			i--;
+			arg++;
+		}
+	}
+
+	if (WARN_ON_ONCE(dereference_flags)) {
+		arg = 0;
+		while (!(dereference_flags & 1)) {
+			dereference_flags >>= 1;
+			arg++;
+		}
+		pr_warn("event %s has unsafe dereference of argument %d\n",
+			trace_event_name(call), arg);
+		pr_warn("print_fmt: %s\n", fmt);
+	}
+}
+
 int trace_event_raw_init(struct trace_event_call *call)
 {
 	int id;
@@ -225,6 +391,8 @@ int trace_event_raw_init(struct trace_event_call *call)
 	if (!id)
 		return -ENODEV;
 
+	test_event_printk(call);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(trace_event_raw_init);
Jacob Wen Feb. 26, 2021, 2:02 a.m. UTC | #13
On 2/26/21 1:57 AM, Steven Rostedt wrote:
> On Thu, 25 Feb 2021 09:49:15 -0800
> Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
>> On Thu, Feb 25, 2021 at 6:31 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>>> Here's nothing special about %s in TP_printk. It uses the same code as
>>> printk() and what other string formatters use.
>>>
>>> What is special is that the print is on data that is stored from a previous
>>> time.
>> Well, technically that's not actually limited to just %s, although
>> that's the common case.
>>
>> All our magic "%p*" formats do the same thing: they print out not the
>> argument _value_, but something that the pointer value points to.
>>
>> So "%pD" takes a "struct file *" pointer, and follows it to the
>> dentry, and then from the dentry to the name. So it will in fact
>> follow pointers even more than "%s" does.
> Correct, as I've told people about that as well.
I was thinking about this yesterday.

Freeing a slab name generally means removing a kernel module.
See the callers of kmem_cache_destroy.

We need to fix some of the trace events that call TP_printk with %pS
to get a symbol of a kernel module that maybe removed later.

>
>> It might indeed be worth having a warning for TP_printk() about any of
>> the formats that follow a pointer, exactly because of the whole "by
>> the time it actually prints, the pointer may be long gone".
>>
> Just a comment? Or should we add some check that gives a warning for when
> one of these are used? That can be done at boot up or module load. (note, %s
> can be OK for some cases, as mentioned in a previous email).
>
> -- Steve
Steven Rostedt Feb. 26, 2021, 2:56 a.m. UTC | #14
On Fri, 26 Feb 2021 10:02:53 +0800
Jacob Wen <jian.w.wen@oracle.com> wrote:

> Freeing a slab name generally means removing a kernel module.
> See the callers of kmem_cache_destroy.
> 
> We need to fix some of the trace events that call TP_printk with %pS
> to get a symbol of a kernel module that maybe removed later.

No. If the symbol is not found in the kallsyms lookup at the time of
reading the trace, it will act just like a %p and print the address.
Yes, it can change the reading of the trace file form one read to the
next, but it's harmless, and it's been like this since ftrace was
introduced back in 2009.

-- Steve
Jacob Wen Feb. 26, 2021, 3:48 a.m. UTC | #15
On 2/26/21 10:56 AM, Steven Rostedt wrote:
> On Fri, 26 Feb 2021 10:02:53 +0800
> Jacob Wen <jian.w.wen@oracle.com> wrote:
>
>> Freeing a slab name generally means removing a kernel module.
>> See the callers of kmem_cache_destroy.
>>
>> We need to fix some of the trace events that call TP_printk with %pS
>> to get a symbol of a kernel module that maybe removed later.
> No. If the symbol is not found in the kallsyms lookup at the time of
> reading the trace, it will act just like a %p and print the address.
> Yes, it can change the reading of the trace file form one read to the
> next, but it's harmless, and it's been like this since ftrace was
> introduced back in 2009.
>
> -- Steve

Removing a kernel module to see if it releases a lot of memory is
helpful to quickly find out the root causes of some issues.

It seems nice to reserve the symbols. I don't have a relevant use case
so it's just my 2 cents.
Steven Rostedt Feb. 26, 2021, 2:15 p.m. UTC | #16
On Fri, 26 Feb 2021 11:48:18 +0800
Jacob Wen <jian.w.wen@oracle.com> wrote:

> Removing a kernel module to see if it releases a lot of memory is
> helpful to quickly find out the root causes of some issues.
> 
> It seems nice to reserve the symbols. I don't have a relevant use case
> so it's just my 2 cents.

It's much more complex than that. We do something similar, where we keep
the kallsyms for the module init code (otherwise when tracing a module that
is loaded, you wont be able to see its functions).

But the complexity comes from the fact that kallsyms has addresses
associated to functions that no longer exist. Now what happens when new
code maps at those addresses? It confuses kallsyms.

Yeah, unloading of a module will turn function names into their original
address, and worse, can even show function names for new code added. But
there's not much we can do without having to save all kallsyms that were
ever loaded, along with time stamps to know when they were valid. And
remember, this is all in kernel memory that is never swapped out.

If you want to add a feature to do something like that go ahead. I would
recommend a "time out" where after a given time (user defined) that older
symbols are flushed from the system, and perhaps even allow the user to
define the number of symbols to keep around after they are gone. At least
this will let the users decide how much memory they want to waste for this.

-- Steve
Steven Rostedt Feb. 26, 2021, 3:54 p.m. UTC | #17
On Thu, 25 Feb 2021 16:48:29 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> Because strings may be allowed if the trace point always passes in
> something that is not freed, I would need to add a post processing check
> (before the string is printed out) to make sure that no string is
> dereferenced that doesn't point to kernel read only memory, and refuse to
> print it if it does (and trigger a warning as well). That would have caught
> the bug in this patch.

The below patch catches cases that unsafe strings are dereferenced. For
example:

  kmem_cache_free: call_site=__i915_gem_free_object_rcu+0x30/0x40 [i915] ptr=00000000f445da7e name=(0xffff8b01456930a0:drm_i915_gem_object)[UNSAFE-MEMORY]

Note, I plan on changing this to make it an opt in option to display the
string that is unsafe (as it is unsafe to display it) but may be necessary
to see what those strings are to see why its unsafe and debug it.

Note, because it allows strings that are constant in the core kernel, it
doesn't always complain:

  kmem_cache_free: call_site=unlink_anon_vmas+0x79/0x1e0 ptr=0000000056c4302b name=anon_vma_chain
  kmem_cache_free: call_site=__put_anon_vma+0x4e/0xe0 ptr=00000000e658eb73 name=anon_vma

It gives a big warning when it does trigger, so it shouldn't be missed.

I'll add more comments and make this ready for the next merge window. At
least now it should catch cases when people add unsafe strings, and be less
reliant on me needing to police all trace event submissions.

-- Steve

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e295c413580e..0bd76873a7f5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3551,6 +3551,89 @@ static char *trace_iter_expand_format(struct trace_iterator *iter)
 	return tmp;
 }
 
+static bool trace_safe_str(struct trace_iterator *iter, const char *str)
+{
+	unsigned long addr = (unsigned long)str;
+
+	/* OK if part of the event data */
+	if ((addr >= (unsigned long)iter->ent) &&
+	    (addr < (unsigned long)iter->ent + iter->ent_size))
+		return true;
+	/* OK if part of the temp seq buffer */
+	if ((addr >= (unsigned long)iter->tmp_seq.buffer) &&
+	    (addr < (unsigned long)iter->tmp_seq.buffer + PAGE_SIZE))
+		return true;
+	/* Core rodata can not be freed */
+	if (is_kernel_rodata(addr))
+		return true;
+	return false;
+}
+
+void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
+			 va_list ap)
+{
+	const char *p = fmt;
+	const char *str;
+	int i, j;
+
+	if (WARN_ON_ONCE(!fmt))
+		return;
+
+	/* Don't bother checking when doing a ftrace_dump() */
+	if (iter->fmt == static_fmt_buf)
+		goto print;
+
+	while (*p) {
+		j = 0;
+
+		for (i = 0; p[i]; i++) {
+			if (i + 1 >= iter->fmt_size) {
+				if (!trace_iter_expand_format(iter))
+					goto print;
+			}
+
+			if (p[i] == '\\' && p[i+1]) {
+				i++;
+				continue;
+			}
+			if (p[i] == '%') {
+				for (j = 1; p[i+j]; j++) {
+					if (isdigit(p[i+j]) ||
+					    p[i+j] == '*' ||
+					    p[i+j] == '.')
+						continue;
+					break;
+				}
+				if (p[i+j] == 's')
+					break;
+			}
+			j = 0;
+		}
+		if (!p[i])
+			break;
+
+		strncpy(iter->fmt, p, i);
+		iter->fmt[i] = '\0';
+		trace_seq_vprintf(&iter->seq, iter->fmt, ap);
+
+		str = va_arg(ap, const char *);
+		if (WARN_ON_ONCE(!trace_safe_str(iter, str))) {
+			trace_seq_printf(&iter->seq, "(0x%px:%s)", str, str);
+			str = "[UNSAFE-MEMORY]";
+			strcpy(iter->fmt, "%s");
+		} else {
+			strncpy(iter->fmt, p + i, j + 1);
+			iter->fmt[j+1] = '\0';
+		}
+		trace_seq_printf(&iter->seq, iter->fmt, str);
+
+		p += i + j + 1;
+	}
+ print:
+	if (*p)
+		trace_seq_vprintf(&iter->seq, p, ap);
+}
+
 const char *trace_event_format(struct trace_iterator *iter, const char *fmt)
 {
 	const char *p, *new_fmt;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index dec13ff66077..5e41b5fd5318 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -582,6 +582,8 @@ void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
 					struct ring_buffer_event *event);
 
 const char *trace_event_format(struct trace_iterator *iter, const char *fmt);
+void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
+			 va_list ap);
 
 int trace_empty(struct trace_iterator *iter);
 
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 61255bad7e01..a0146e1fffdf 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -317,7 +317,7 @@ void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...)
 	va_list ap;
 
 	va_start(ap, fmt);
-	trace_seq_vprintf(&iter->seq, trace_event_format(iter, fmt), ap);
+	trace_check_vprintf(iter, trace_event_format(iter, fmt), ap);
 	va_end(ap);
 }
 EXPORT_SYMBOL(trace_event_printf);
diff mbox series

Patch

--- a/include/trace/events/kmem.h~mm-tracing-record-slab-name-for-kmem_cache_free
+++ a/include/trace/events/kmem.h
@@ -115,7 +115,7 @@  DEFINE_EVENT(kmem_alloc_node, kmem_cache
 	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)
 );
 
-DECLARE_EVENT_CLASS(kmem_free,
+TRACE_EVENT(kfree,
 
 	TP_PROTO(unsigned long call_site, const void *ptr),
 
@@ -135,18 +135,26 @@  DECLARE_EVENT_CLASS(kmem_free,
 		  (void *)__entry->call_site, __entry->ptr)
 );
 
-DEFINE_EVENT(kmem_free, kfree,
+TRACE_EVENT(kmem_cache_free,
 
-	TP_PROTO(unsigned long call_site, const void *ptr),
+	TP_PROTO(unsigned long call_site, const void *ptr, const char *name),
 
-	TP_ARGS(call_site, ptr)
-);
+	TP_ARGS(call_site, ptr, name),
 
-DEFINE_EVENT(kmem_free, kmem_cache_free,
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+		__field(	const char *,	name		)
+	),
 
-	TP_PROTO(unsigned long call_site, const void *ptr),
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+		__entry->name		= name;
+	),
 
-	TP_ARGS(call_site, ptr)
+	TP_printk("call_site=%pS ptr=%p name=%s",
+		  (void *)__entry->call_site, __entry->ptr, __entry->name)
 );
 
 TRACE_EVENT(mm_page_free,
--- a/mm/slab.c~mm-tracing-record-slab-name-for-kmem_cache_free
+++ a/mm/slab.c
@@ -3717,7 +3717,7 @@  void kmem_cache_free(struct kmem_cache *
 	__cache_free(cachep, objp, _RET_IP_);
 	local_irq_restore(flags);
 
-	trace_kmem_cache_free(_RET_IP_, objp);
+	trace_kmem_cache_free(_RET_IP_, objp, cachep->name);
 }
 EXPORT_SYMBOL(kmem_cache_free);
 
--- a/mm/slob.c~mm-tracing-record-slab-name-for-kmem_cache_free
+++ a/mm/slob.c
@@ -673,7 +673,7 @@  void kmem_cache_free(struct kmem_cache *
 		__kmem_cache_free(b, c->size);
 	}
 
-	trace_kmem_cache_free(_RET_IP_, b);
+	trace_kmem_cache_free(_RET_IP_, b, c->name);
 }
 EXPORT_SYMBOL(kmem_cache_free);
 
--- a/mm/slub.c~mm-tracing-record-slab-name-for-kmem_cache_free
+++ a/mm/slub.c
@@ -3157,7 +3157,7 @@  void kmem_cache_free(struct kmem_cache *
 	if (!s)
 		return;
 	slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_);
-	trace_kmem_cache_free(_RET_IP_, x);
+	trace_kmem_cache_free(_RET_IP_, x, s->name);
 }
 EXPORT_SYMBOL(kmem_cache_free);