diff mbox series

[v4,09/10] eventfs: Move tracing/events to eventfs

Message ID 1689248004-8158-10-git-send-email-akaher@vmware.com (mailing list archive)
State New
Headers show
Series tracing: introducing eventfs | expand

Commit Message

Ajay Kaher July 13, 2023, 11:33 a.m. UTC
Till now /sys/kernel/debug/tracing/events is a part of tracefs,
with-in this patch creating 'events' and it's sub-dir as eventfs.
Basically replacing tracefs calls with eventfs calls for 'events'.

Signed-off-by: Ajay Kaher <akaher@vmware.com>
Co-developed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Tested-by: Ching-lin Yu <chinglinyu@google.com>
---
 fs/tracefs/inode.c           | 18 ++++++++++
 include/linux/trace_events.h |  1 +
 kernel/trace/trace.h         |  2 +-
 kernel/trace/trace_events.c  | 67 +++++++++++++++++++-----------------
 4 files changed, 55 insertions(+), 33 deletions(-)

Comments

Steven Rostedt July 14, 2023, 9:06 p.m. UTC | #1
On Thu, 13 Jul 2023 17:03:23 +0530
Ajay Kaher <akaher@vmware.com> wrote:

> Till now /sys/kernel/debug/tracing/events is a part of tracefs,
> with-in this patch creating 'events' and it's sub-dir as eventfs.
> Basically replacing tracefs calls with eventfs calls for 'events'.

[ note: /sys/kernel/debug/tracing is deprecated. Please avoid referencing it. ]

Up until now, /sys/kernel/tracing/events was no different than any other
part of tracefs. The files and directories within the events directory was
created when the tracefs was mounted, and also created for the instances in
/sys/kernel/tracing/instances/<instance>/events. Most of these files and
directories will never be referenced. Since there are thousands of these
files and directories they spend their time wasting precious memory
resources.

Move the "events" directory to the new eventfs. The eventfs will take the
meta data of the events that they represent and store that. When the files
in the events directory are referenced, the dentry and inodes to represent
them are then created. When the files are no longer referenced, they are
freed. This saves the precious memory resources that were wasted on these
seldom referenced dentries and inodes.

We should also include memory metrics of both improvements from
/proc/slabinfo and /proc/meminfo.

> 
> Signed-off-by: Ajay Kaher <akaher@vmware.com>
> Co-developed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
> Tested-by: Ching-lin Yu <chinglinyu@google.com>
> ---
>  fs/tracefs/inode.c           | 18 ++++++++++
>  include/linux/trace_events.h |  1 +
>  kernel/trace/trace.h         |  2 +-
>  kernel/trace/trace_events.c  | 67 +++++++++++++++++++-----------------
>  4 files changed, 55 insertions(+), 33 deletions(-)
> 
> diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
> index 76820d3e97eb..a098d7153498 100644
> --- a/fs/tracefs/inode.c
> +++ b/fs/tracefs/inode.c
> @@ -374,6 +374,23 @@ static const struct super_operations tracefs_super_operations = {
>  	.show_options	= tracefs_show_options,
>  };
>  
> +static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode)
> +{
> +	struct tracefs_inode *ti;
> +
> +	if (!dentry || !inode)
> +		return;
> +
> +	ti = get_tracefs(inode);
> +	if (ti && ti->flags & TRACEFS_EVENT_INODE)
> +		eventfs_set_ef_status_free(dentry);
> +	iput(inode);
> +}
> +
> +static const struct dentry_operations tracefs_dentry_operations = {
> +	.d_iput = tracefs_dentry_iput,
> +};
> +
>  static int trace_fill_super(struct super_block *sb, void *data, int silent)
>  {
>  	static const struct tree_descr trace_files[] = {{""}};
> @@ -396,6 +413,7 @@ static int trace_fill_super(struct super_block *sb, void *data, int silent)
>  		goto fail;
>  
>  	sb->s_op = &tracefs_super_operations;
> +	sb->s_d_op = &tracefs_dentry_operations;
>  
>  	tracefs_apply_options(sb, false);
>  
> diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
> index 7c4a0b72334e..c6bb74cccea3 100644
> --- a/include/linux/trace_events.h
> +++ b/include/linux/trace_events.h
> @@ -635,6 +635,7 @@ struct trace_event_file {
>  	struct list_head		list;
>  	struct trace_event_call		*event_call;
>  	struct event_filter __rcu	*filter;
> +	struct eventfs_file             *ef;
>  	struct dentry			*dir;
>  	struct trace_array		*tr;
>  	struct trace_subsystem_dir	*system;
> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> index 79bdefe9261b..d9a0ec2b918e 100644
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -1309,7 +1309,7 @@ struct trace_subsystem_dir {
>  	struct list_head		list;
>  	struct event_subsystem		*subsystem;
>  	struct trace_array		*tr;
> -	struct dentry			*entry;
> +	struct eventfs_file             *ef;
>  	int				ref_count;
>  	int				nr_events;
>  };
> diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
> index 12ed71428939..4058178de682 100644
> --- a/kernel/trace/trace_events.c
> +++ b/kernel/trace/trace_events.c
> @@ -990,7 +990,8 @@ static void remove_subsystem(struct trace_subsystem_dir *dir)
>  		return;
>  
>  	if (!--dir->nr_events) {
> -		tracefs_remove(dir->entry);
> +		if (dir->ef)
> +			eventfs_remove(dir->ef);

Why the if statement? eventfs_remove() has:

void eventfs_remove(struct eventfs_file *ef)
{
	if (!ef)
		return;
	[..]
}

Let's get rid of that.

>  		list_del(&dir->list);
>  		__put_system_dir(dir);
>  	}
> @@ -1011,7 +1012,8 @@ static void remove_event_file_dir(struct trace_event_file *file)
>  
>  		tracefs_remove(dir);
>  	}
> -
> +	if (file->ef)
> +		eventfs_remove(file->ef);

Same here.

>  	list_del(&file->list);
>  	remove_subsystem(file->system);
>  	free_event_filter(file->filter);
> @@ -2297,13 +2299,13 @@ create_new_subsystem(const char *name)
>  	return NULL;
>  }
>  
> -static struct dentry *
> +static struct eventfs_file *
>  event_subsystem_dir(struct trace_array *tr, const char *name,
>  		    struct trace_event_file *file, struct dentry *parent)
>  {
>  	struct event_subsystem *system, *iter;
>  	struct trace_subsystem_dir *dir;
> -	struct dentry *entry;
> +	int res;
>  
>  	/* First see if we did not already create this dir */
>  	list_for_each_entry(dir, &tr->systems, list) {
> @@ -2311,7 +2313,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
>  		if (strcmp(system->name, name) == 0) {
>  			dir->nr_events++;
>  			file->system = dir;
> -			return dir->entry;
> +			return dir->ef;
>  		}
>  	}
>  
> @@ -2335,8 +2337,8 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
>  	} else
>  		__get_system(system);
>  
> -	dir->entry = tracefs_create_dir(name, parent);
> -	if (!dir->entry) {
> +	dir->ef = eventfs_add_subsystem_dir(name, parent);
> +	if (IS_ERR(dir->ef)) {
>  		pr_warn("Failed to create system directory %s\n", name);
>  		__put_system(system);
>  		goto out_free;
> @@ -2351,22 +2353,22 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
>  	/* the ftrace system is special, do not create enable or filter files */
>  	if (strcmp(name, "ftrace") != 0) {
>  
> -		entry = tracefs_create_file("filter", TRACE_MODE_WRITE,
> -					    dir->entry, dir,
> +		res = eventfs_add_file("filter", TRACE_MODE_WRITE,
> +					    dir->ef, dir,
>  					    &ftrace_subsystem_filter_fops);
> -		if (!entry) {
> +		if (res) {
>  			kfree(system->filter);
>  			system->filter = NULL;
>  			pr_warn("Could not create tracefs '%s/filter' entry\n", name);
>  		}
>  
> -		trace_create_file("enable", TRACE_MODE_WRITE, dir->entry, dir,
> +		eventfs_add_file("enable", TRACE_MODE_WRITE, dir->ef, dir,
>  				  &ftrace_system_enable_fops);
>  	}
>  
>  	list_add(&dir->list, &tr->systems);
>  
> -	return dir->entry;
> +	return dir->ef;
>  
>   out_free:
>  	kfree(dir);
> @@ -2420,7 +2422,7 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
>  {
>  	struct trace_event_call *call = file->event_call;
>  	struct trace_array *tr = file->tr;
> -	struct dentry *d_events;
> +	struct eventfs_file *ef_subsystem = NULL;

Move this around to keep the upside-down-xmas tree order.

-- Steve


>  	const char *name;
>  	int ret;
>  
> @@ -2432,24 +2434,24 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
>  	if (WARN_ON_ONCE(strcmp(call->class->system, TRACE_SYSTEM) == 0))
>  		return -ENODEV;
>  
> -	d_events = event_subsystem_dir(tr, call->class->system, file, parent);
> -	if (!d_events)
> +	ef_subsystem = event_subsystem_dir(tr, call->class->system, file, parent);
> +	if (!ef_subsystem)
>  		return -ENOMEM;
>  
>  	name = trace_event_name(call);
> -	file->dir = tracefs_create_dir(name, d_events);
> -	if (!file->dir) {
> +	file->ef = eventfs_add_dir(name, ef_subsystem);
> +	if (IS_ERR(file->ef)) {
>  		pr_warn("Could not create tracefs '%s' directory\n", name);
>  		return -1;
>  	}
>  
>  	if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
> -		trace_create_file("enable", TRACE_MODE_WRITE, file->dir, file,
> +		eventfs_add_file("enable", TRACE_MODE_WRITE, file->ef, file,
>  				  &ftrace_enable_fops);
>  
>  #ifdef CONFIG_PERF_EVENTS
>  	if (call->event.type && call->class->reg)
> -		trace_create_file("id", TRACE_MODE_READ, file->dir,
> +		eventfs_add_file("id", TRACE_MODE_READ, file->ef,
>  				  (void *)(long)call->event.type,
>  				  &ftrace_event_id_fops);
>  #endif
> @@ -2465,27 +2467,27 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
>  	 * triggers or filters.
>  	 */
>  	if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) {
> -		trace_create_file("filter", TRACE_MODE_WRITE, file->dir,
> +		eventfs_add_file("filter", TRACE_MODE_WRITE, file->ef,
>  				  file, &ftrace_event_filter_fops);
>  
> -		trace_create_file("trigger", TRACE_MODE_WRITE, file->dir,
> +		eventfs_add_file("trigger", TRACE_MODE_WRITE, file->ef,
>  				  file, &event_trigger_fops);
>  	}
>  
>  #ifdef CONFIG_HIST_TRIGGERS
> -	trace_create_file("hist", TRACE_MODE_READ, file->dir, file,
> +	eventfs_add_file("hist", TRACE_MODE_READ, file->ef, file,
>  			  &event_hist_fops);
>  #endif
>  #ifdef CONFIG_HIST_TRIGGERS_DEBUG
> -	trace_create_file("hist_debug", TRACE_MODE_READ, file->dir, file,
> +	eventfs_add_file("hist_debug", TRACE_MODE_READ, file->ef, file,
>  			  &event_hist_debug_fops);
>  #endif
> -	trace_create_file("format", TRACE_MODE_READ, file->dir, call,
> +	eventfs_add_file("format", TRACE_MODE_READ, file->ef, call,
>  			  &ftrace_event_format_fops);
>  
>  #ifdef CONFIG_TRACE_EVENT_INJECT
>  	if (call->event.type && call->class->reg)
> -		trace_create_file("inject", 0200, file->dir, file,
> +		eventfs_add_file("inject", 0200, file->ef, file,
>  				  &event_inject_fops);
>  #endif
>  
> @@ -3638,21 +3640,22 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
>  {
>  	struct dentry *d_events;
>  	struct dentry *entry;
> +	int error = 0;
>  
>  	entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent,
>  				  tr, &ftrace_set_event_fops);
>  	if (!entry)
>  		return -ENOMEM;
>  
> -	d_events = tracefs_create_dir("events", parent);
> -	if (!d_events) {
> +	d_events = eventfs_create_events_dir("events", parent);
> +	if (IS_ERR(d_events)) {
>  		pr_warn("Could not create tracefs 'events' directory\n");
>  		return -ENOMEM;
>  	}
>  
> -	entry = trace_create_file("enable", TRACE_MODE_WRITE, d_events,
> +	error = eventfs_add_top_file("enable", TRACE_MODE_WRITE, d_events,
>  				  tr, &ftrace_tr_enable_fops);
> -	if (!entry)
> +	if (error)
>  		return -ENOMEM;
>  
>  	/* There are not as crucial, just warn if they are not created */
> @@ -3665,11 +3668,11 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
>  			  &ftrace_set_event_notrace_pid_fops);
>  
>  	/* ring buffer internal formats */
> -	trace_create_file("header_page", TRACE_MODE_READ, d_events,
> +	eventfs_add_top_file("header_page", TRACE_MODE_READ, d_events,
>  				  ring_buffer_print_page_header,
>  				  &ftrace_show_header_fops);
>  
> -	trace_create_file("header_event", TRACE_MODE_READ, d_events,
> +	eventfs_add_top_file("header_event", TRACE_MODE_READ, d_events,
>  				  ring_buffer_print_entry_header,
>  				  &ftrace_show_header_fops);
>  
> @@ -3757,7 +3760,7 @@ int event_trace_del_tracer(struct trace_array *tr)
>  
>  	down_write(&trace_event_sem);
>  	__trace_remove_event_dirs(tr);
> -	tracefs_remove(tr->event_dir);
> +	eventfs_remove_events_dir(tr->event_dir);
>  	up_write(&trace_event_sem);
>  
>  	tr->event_dir = NULL;
Ajay Kaher July 19, 2023, 11:08 a.m. UTC | #2
> On 15-Jul-2023, at 2:36 AM, Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> !! External Email
> 
> On Thu, 13 Jul 2023 17:03:23 +0530
> Ajay Kaher <akaher@vmware.com> wrote:
> 
>> Till now /sys/kernel/debug/tracing/events is a part of tracefs,
>> with-in this patch creating 'events' and it's sub-dir as eventfs.
>> Basically replacing tracefs calls with eventfs calls for 'events'.
> 
> [ note: /sys/kernel/debug/tracing is deprecated. Please avoid referencing it. ]
> 
> Up until now, /sys/kernel/tracing/events was no different than any other
> part of tracefs. The files and directories within the events directory was
> created when the tracefs was mounted, and also created for the instances in
> /sys/kernel/tracing/instances/<instance>/events. Most of these files and
> directories will never be referenced. Since there are thousands of these
> files and directories they spend their time wasting precious memory
> resources.
> 
> Move the "events" directory to the new eventfs. The eventfs will take the
> meta data of the events that they represent and store that. When the files
> in the events directory are referenced, the dentry and inodes to represent
> them are then created. When the files are no longer referenced, they are
> freed. This saves the precious memory resources that were wasted on these
> seldom referenced dentries and inodes.
> 

Some correction here:

The dentry and inodes to represent eventfs files or directories will be freed only
during drop cache or eventfs_remove(). This is same as with other dynamic fs
e.g. sysfs or procfs.

We can achieve ‘free the dentry and inodes if no longer requires’ using
create_file()->d_instantiate_anon() instead create_file()->d_instantiate(), but I faced 
some issues. This optimisation we may consider in future along with sysfs, procfs.

-Ajay
Ajay Kaher July 19, 2023, 11:08 a.m. UTC | #3
> On 15-Jul-2023, at 2:36 AM, Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> !! External Email
> 
> On Thu, 13 Jul 2023 17:03:23 +0530
> Ajay Kaher <akaher@vmware.com> wrote:
> 
>> Till now /sys/kernel/debug/tracing/events is a part of tracefs,
>> with-in this patch creating 'events' and it's sub-dir as eventfs.
>> Basically replacing tracefs calls with eventfs calls for 'events'.
> 
> [ note: /sys/kernel/debug/tracing is deprecated. Please avoid referencing it. ]
> 
> Up until now, /sys/kernel/tracing/events was no different than any other
> part of tracefs. The files and directories within the events directory was
> created when the tracefs was mounted, and also created for the instances in
> /sys/kernel/tracing/instances/<instance>/events. Most of these files and
> directories will never be referenced. Since there are thousands of these
> files and directories they spend their time wasting precious memory
> resources.
> 
> Move the "events" directory to the new eventfs. The eventfs will take the
> meta data of the events that they represent and store that. When the files
> in the events directory are referenced, the dentry and inodes to represent
> them are then created. When the files are no longer referenced, they are
> freed. This saves the precious memory resources that were wasted on these
> seldom referenced dentries and inodes.
> 

Some correction here:

The dentry and inodes to represent eventfs files or directories will be freed only
during drop cache or eventfs_remove(). This is same as with other dynamic fs
e.g. sysfs or procfs.

We can achieve ‘free the dentry and inodes if no longer requires’ using
create_file()->d_instantiate_anon() instead create_file()->d_instantiate(), but I faced 
some issues. This optimisation we may consider in future along with sysfs, procfs.

-Ajay
Sven Schnelle Sept. 8, 2023, 12:14 p.m. UTC | #4
Ajay Kaher <akaher@vmware.com> writes:

> Till now /sys/kernel/debug/tracing/events is a part of tracefs,
> with-in this patch creating 'events' and it's sub-dir as eventfs.
> Basically replacing tracefs calls with eventfs calls for 'events'.
>
> Signed-off-by: Ajay Kaher <akaher@vmware.com>
> Co-developed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
> Tested-by: Ching-lin Yu <chinglinyu@google.com>
> ---
>  fs/tracefs/inode.c           | 18 ++++++++++
>  include/linux/trace_events.h |  1 +
>  kernel/trace/trace.h         |  2 +-
>  kernel/trace/trace_events.c  | 67 +++++++++++++++++++-----------------
>  4 files changed, 55 insertions(+), 33 deletions(-)

With this patchset in next-20230908 the following crash is observed on
s390 while running ftracetest test.d/instances/instance-event.tc:

(It also crashes without KASAN, just not on every try)

[   26.728436] ==================================================================
[   26.728450] BUG: KASAN: slab-use-after-free in __ftrace_event_enable_disable+0x56c/0x648
[   26.728460] Read of size 8 at addr 00000000107d3838 by task ftracetest/514
[   26.728465]
[   26.728468] CPU: 1 PID: 514 Comm: ftracetest Not tainted 6.5.0-rc4-00014-g27152bceea1d #958
[   26.728474] Hardware name: IBM 3931 A01 704 (z/VM 7.3.0)
[   26.728478] Call Trace:
[   26.728482]  [<000000000262349e>] dump_stack_lvl+0x106/0x1c8
[   26.728490]  [<00000000009e3b8c>] print_address_description.constprop.0+0x34/0x378
[   26.728498]  [<00000000009e3f7c>] print_report+0xac/0x240
[   26.728504]  [<00000000009e42fa>] kasan_report+0xf2/0x130
[   26.728509]  [<00000000005c61c4>] __ftrace_event_enable_disable+0x56c/0x648
[   26.728516]  [<00000000005c6d02>] event_enable_write+0x132/0x218
[   26.728522]  [<0000000000a89598>] vfs_write+0x208/0x930
[   26.728528]  [<0000000000a8a148>] ksys_write+0x118/0x200
[   26.728535]  [<000000000010c5fc>] do_syscall+0x22c/0x328
[   26.728547]  [<000000000268e8b2>] __do_syscall+0x9a/0xf8
[   26.728553]  [<00000000026c1d28>] system_call+0x70/0x98
[   26.728559] 3 locks held by ftracetest/514:
[   26.728562]  #0: 00000000226addd8 (&f->f_pos_lock){+.+.}-{3:3}, at: __fdget_pos+0x84/0xa0
[   26.728577]  #1: 0000000008f19440 (sb_writers#14){.+.+}-{0:0}, at: ksys_write+0x118/0x200
[   26.728594]  #2: 00000000033205d0 (event_mutex){+.+.}-{3:3}, at: event_enable_write+0xdc/0x218
[   26.728609]
[   26.728613] Allocated by task 515:
[   26.728617]  kasan_save_stack+0x40/0x68
[   26.728627]  kasan_set_track+0x36/0x48
[   26.728632]  __kasan_slab_alloc+0x8e/0xb8
[   26.728638]  kmem_cache_alloc+0x1ec/0x568
[   26.728643]  trace_create_new_event+0x88/0x3d8
[   26.728648]  event_trace_add_tracer+0x13a/0x2a0
[   26.728654]  trace_array_create_dir+0x80/0x1a0
[   26.728660]  trace_array_create+0x344/0x700
[   26.728665]  instance_mkdir+0xbe/0x118
[   26.728670]  tracefs_syscall_mkdir+0xa8/0xf8
[   26.728678]  vfs_mkdir+0x454/0x6c0
[   26.728685]  do_mkdirat+0x24e/0x2a8
[   26.728692]  __s390x_sys_mkdir+0xf4/0x168
[   26.728696]  do_syscall+0x22c/0x328
[   26.728700]  __do_syscall+0x9a/0xf8
[   26.728704]  system_call+0x70/0x98
[   26.728710]
[   26.728713] Freed by task 553:
[   26.728716]  kasan_save_stack+0x40/0x68
[   26.728721]  kasan_set_track+0x36/0x48
[   26.728726]  kasan_save_free_info+0x42/0x60
[   26.728733]  ____kasan_slab_free+0x17c/0x1e0
[   26.728741]  __kasan_slab_free+0x24/0x30
[   26.728747]  slab_free_freelist_hook+0x272/0x3a0
[   26.728751]  kmem_cache_free+0xec/0x540
[   26.728757]  event_trace_del_tracer+0x108/0x1f8
[   26.728763]  __remove_instance+0x242/0x668
[   26.728781]  instance_rmdir+0xc4/0x108
[   26.728788]  tracefs_syscall_rmdir+0xd4/0x160
[   26.728796]  vfs_rmdir+0x18a/0x538
[   26.728802]  do_rmdir+0x2c0/0x358
[   26.728807]  __s390x_sys_rmdir+0xd2/0x138
[   26.728811]  do_syscall+0x22c/0x328
[   26.728817]  __do_syscall+0x9a/0xf8
[   26.728823]  system_call+0x70/0x98
[   26.728829]
[   26.728833] The buggy address belongs to the object at 00000000107d3828
[   26.728833]  which belongs to the cache trace_event_file of size 96
[   26.728841] The buggy address is located 16 bytes inside of
[   26.728841]  freed 96-byte region [00000000107d3828, 00000000107d3888)
[   26.728846]
[   26.728848] The buggy address belongs to the physical page:
[   26.728851] page:000040000041f4c0 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x107d3
[   26.728871] flags: 0x1ffff00000000200(slab|node=0|zone=0|lastcpupid=0x1ffff)
[   26.728880] page_type: 0xffffffff()
[   26.728889] raw: 1ffff00000000200 0000000005a2c100 0000400000891490 00004000006a89d0
[   26.728894] raw: 0000000000000000 0013002600000000 ffffffff00000001 0000000000000000
[   26.728898] page dumped because: kasan: bad access detected
[   26.728902]
[   26.728906] Memory state around the buggy address:
[   26.728912]  00000000107d3700: fc fc fc fc fc fc fc fc fc fc fc fa fb fb fb fb
[   26.728919]  00000000107d3780: fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc fc
[   26.728928] >00000000107d3800: fc fc fc fc fc fa fb fb fb fb fb fb fb fb fb fb
[   26.728933]                                         ^
[   26.728940]  00000000107d3880: fb fc fc fc fc fc fc fc fc fc fc fc fc fc fc fa
[   26.728943]  00000000107d3900: fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc fc
[   26.728946] ==================================================================
[   26.728955] Disabling lock debugging due to kernel taint
[   27.089717] Unable to handle kernel pointer dereference in virtual kernel address space
[   27.089725] Failing address: 0d896d6d6d6d6000 TEID: 0d896d6d6d6d6803
[   27.089729] Fault in home space mode while using kernel ASCE.
[   27.089737] AS:00000000046b800b R2:0000000000000028
[   27.089770] Oops: 0038 ilc:2 [#1] PREEMPT SMP
[   27.089779] Modules linked in:
[   27.089784] CPU: 1 PID: 514 Comm: ftracetest Tainted: G    B              6.5.0-rc4-00014-g27152bceea1d #958
[   27.089790] Hardware name: IBM 3931 A01 704 (z/VM 7.3.0)
[   27.089794] Krnl PSW : 0404e00180000000 00000000005c4488 (trace_event_buffer_reserve+0xf0/0x3e8)
[   27.089804]            R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 AS:3 CC:2 PM:0 RI:0 EA:3
[   27.089810] Krnl GPRS: 0000000000000001 0000000000000001 6b6b6b6b6b6b6c5b 001c000000000000
[   27.089816]            0d896d6d6d6d6d8b 00000000005c4702 001bff80013cf170 6b6b6b6b6b6b6b6b
[   27.089822]            6b6b6b6b6b6b6b6b 00037ff000279e26 00000000107d3828 001bff80013cf208
[   27.089827]            00000000031669f8 00000000026c1c1c 00000000005c4702 001bff80013cf078
[   27.089843] Krnl Code: 00000000005c447a: 412080f0            la      %r2,240(%r8)
[   27.089843]            00000000005c447e: eb420003000c        srlg    %r4,%r2,3
[   27.089843]           #00000000005c4484: b9080043            agr     %r4,%r3
[   27.089843]           >00000000005c4488: 95004000            cli     0(%r4),0
[   27.089843]            00000000005c448c: a7740142            brc     7,00000000005c4710
[   27.089843]            00000000005c4490: e3e080f00004        lg      %r14,240(%r8)
[   27.089843]            00000000005c4496: e3e0f0a80024        stg     %r14,168(%r15)
[   27.089843]            00000000005c449c: 412080f8            la      %r2,248(%r8)
[   27.089885] Call Trace:
[   27.089890]  [<00000000005c4488>] trace_event_buffer_reserve+0xf0/0x3e8
[   27.089897] ([<00000000005c4702>] trace_event_buffer_reserve+0x36a/0x3e8)
[   27.089903]  [<000000000028979e>] trace_event_raw_event_sched_switch+0xce/0x430
[   27.089910]  [<00000000026a1cd4>] __schedule+0xba4/0x1f58
[   27.089916]  [<00000000026a4544>] preempt_schedule_irq+0xdc/0x190
[   27.089925]  [<000000000268f99c>] irqentry_exit+0xac/0x128
[   27.089931]  [<00000000026c1fac>] ext_int_handler+0xc4/0xf0
[   27.089936]  [<0000000000135a98>] __unwind_start+0x190/0x568
[   27.089944] ([<0000000000135a7c>] __unwind_start+0x174/0x568)
[   27.089950]  [<000000000013bb90>] arch_stack_walk+0x130/0x1d0
[   27.089957]  [<0000000000424198>] stack_trace_save+0xc8/0xf8
[   27.089968]  [<00000000009d3468>] set_track_prepare+0x38/0x58
[   27.089974]  [<00000000009df35a>] free_to_partial_list+0x1ea/0x2f0
[   27.089979]  [<00000000009df656>] __slab_free+0x1f6/0x3c0
[   27.089984]  [<00000000009e19e8>] ___cache_free+0x158/0x188
[   27.089989]  [<00000000009e6df6>] qlist_free_all+0x7e/0x150
[   27.089994]  [<00000000009e752a>] kasan_quarantine_reduce+0x17a/0x1d8
[   27.089999]  [<00000000009e3488>] __kasan_slab_alloc+0x98/0xb8
[   27.090005]  [<00000000009de3fc>] kmem_cache_alloc+0x1ec/0x568
[   27.090011]  [<0000000000ab1d8e>] getname_flags.part.0+0x56/0x420
[   27.090018]  [<0000000000a81244>] do_sys_openat2+0xc4/0x178
00: HCPGSP2629I The virtual machine is placed in CP mode due to a SIGP stop from CPU 00.
00: HCPGSP2629I The virtual machine is placed in CP mode due to a SIGP stop from CPU 01.
[   27.090027]  [<0000000000a815ae>] do_sys_open+0xce/0x118
[   27.090033]  [<000000000010c5fc>] do_syscall+0x22c/0x328
[   27.090040]  [<000000000268e8b2>] __do_syscall+0x9a/0xf8
[   27.090048]  [<00000000026c1d28>] system_call+0x70/0x98
[   27.090056] INFO: lockdep is turned off.
[   27.090060] Last Breaking-Event-Address:
[   27.090066]  [<00000000005c4702>] trace_event_buffer_reserve+0x36a/0x3e8
[   27.090077] Kernel panic - not syncing: Fatal exception: panic_on_oops
Steven Rostedt Sept. 8, 2023, 12:31 p.m. UTC | #5
On Fri, 08 Sep 2023 14:14:20 +0200
Sven Schnelle <svens@linux.ibm.com> wrote:

> Ajay Kaher <akaher@vmware.com> writes:
> 
> > Till now /sys/kernel/debug/tracing/events is a part of tracefs,
> > with-in this patch creating 'events' and it's sub-dir as eventfs.
> > Basically replacing tracefs calls with eventfs calls for 'events'.
> >
> > Signed-off-by: Ajay Kaher <akaher@vmware.com>
> > Co-developed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
> > Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
> > Tested-by: Ching-lin Yu <chinglinyu@google.com>
> > ---
> >  fs/tracefs/inode.c           | 18 ++++++++++
> >  include/linux/trace_events.h |  1 +
> >  kernel/trace/trace.h         |  2 +-
> >  kernel/trace/trace_events.c  | 67 +++++++++++++++++++-----------------
> >  4 files changed, 55 insertions(+), 33 deletions(-)  
> 
> With this patchset in next-20230908 the following crash is observed on
> s390 while running ftracetest test.d/instances/instance-event.tc:
> 
> (It also crashes without KASAN, just not on every try)

Yep, and I've spent the last few days fixing this :-)

  https://lore.kernel.org/linux-trace-kernel/20230907024710.866917011@goodmis.org/

-- Steve
diff mbox series

Patch

diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 76820d3e97eb..a098d7153498 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -374,6 +374,23 @@  static const struct super_operations tracefs_super_operations = {
 	.show_options	= tracefs_show_options,
 };
 
+static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode)
+{
+	struct tracefs_inode *ti;
+
+	if (!dentry || !inode)
+		return;
+
+	ti = get_tracefs(inode);
+	if (ti && ti->flags & TRACEFS_EVENT_INODE)
+		eventfs_set_ef_status_free(dentry);
+	iput(inode);
+}
+
+static const struct dentry_operations tracefs_dentry_operations = {
+	.d_iput = tracefs_dentry_iput,
+};
+
 static int trace_fill_super(struct super_block *sb, void *data, int silent)
 {
 	static const struct tree_descr trace_files[] = {{""}};
@@ -396,6 +413,7 @@  static int trace_fill_super(struct super_block *sb, void *data, int silent)
 		goto fail;
 
 	sb->s_op = &tracefs_super_operations;
+	sb->s_d_op = &tracefs_dentry_operations;
 
 	tracefs_apply_options(sb, false);
 
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 7c4a0b72334e..c6bb74cccea3 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -635,6 +635,7 @@  struct trace_event_file {
 	struct list_head		list;
 	struct trace_event_call		*event_call;
 	struct event_filter __rcu	*filter;
+	struct eventfs_file             *ef;
 	struct dentry			*dir;
 	struct trace_array		*tr;
 	struct trace_subsystem_dir	*system;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 79bdefe9261b..d9a0ec2b918e 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1309,7 +1309,7 @@  struct trace_subsystem_dir {
 	struct list_head		list;
 	struct event_subsystem		*subsystem;
 	struct trace_array		*tr;
-	struct dentry			*entry;
+	struct eventfs_file             *ef;
 	int				ref_count;
 	int				nr_events;
 };
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 12ed71428939..4058178de682 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -990,7 +990,8 @@  static void remove_subsystem(struct trace_subsystem_dir *dir)
 		return;
 
 	if (!--dir->nr_events) {
-		tracefs_remove(dir->entry);
+		if (dir->ef)
+			eventfs_remove(dir->ef);
 		list_del(&dir->list);
 		__put_system_dir(dir);
 	}
@@ -1011,7 +1012,8 @@  static void remove_event_file_dir(struct trace_event_file *file)
 
 		tracefs_remove(dir);
 	}
-
+	if (file->ef)
+		eventfs_remove(file->ef);
 	list_del(&file->list);
 	remove_subsystem(file->system);
 	free_event_filter(file->filter);
@@ -2297,13 +2299,13 @@  create_new_subsystem(const char *name)
 	return NULL;
 }
 
-static struct dentry *
+static struct eventfs_file *
 event_subsystem_dir(struct trace_array *tr, const char *name,
 		    struct trace_event_file *file, struct dentry *parent)
 {
 	struct event_subsystem *system, *iter;
 	struct trace_subsystem_dir *dir;
-	struct dentry *entry;
+	int res;
 
 	/* First see if we did not already create this dir */
 	list_for_each_entry(dir, &tr->systems, list) {
@@ -2311,7 +2313,7 @@  event_subsystem_dir(struct trace_array *tr, const char *name,
 		if (strcmp(system->name, name) == 0) {
 			dir->nr_events++;
 			file->system = dir;
-			return dir->entry;
+			return dir->ef;
 		}
 	}
 
@@ -2335,8 +2337,8 @@  event_subsystem_dir(struct trace_array *tr, const char *name,
 	} else
 		__get_system(system);
 
-	dir->entry = tracefs_create_dir(name, parent);
-	if (!dir->entry) {
+	dir->ef = eventfs_add_subsystem_dir(name, parent);
+	if (IS_ERR(dir->ef)) {
 		pr_warn("Failed to create system directory %s\n", name);
 		__put_system(system);
 		goto out_free;
@@ -2351,22 +2353,22 @@  event_subsystem_dir(struct trace_array *tr, const char *name,
 	/* the ftrace system is special, do not create enable or filter files */
 	if (strcmp(name, "ftrace") != 0) {
 
-		entry = tracefs_create_file("filter", TRACE_MODE_WRITE,
-					    dir->entry, dir,
+		res = eventfs_add_file("filter", TRACE_MODE_WRITE,
+					    dir->ef, dir,
 					    &ftrace_subsystem_filter_fops);
-		if (!entry) {
+		if (res) {
 			kfree(system->filter);
 			system->filter = NULL;
 			pr_warn("Could not create tracefs '%s/filter' entry\n", name);
 		}
 
-		trace_create_file("enable", TRACE_MODE_WRITE, dir->entry, dir,
+		eventfs_add_file("enable", TRACE_MODE_WRITE, dir->ef, dir,
 				  &ftrace_system_enable_fops);
 	}
 
 	list_add(&dir->list, &tr->systems);
 
-	return dir->entry;
+	return dir->ef;
 
  out_free:
 	kfree(dir);
@@ -2420,7 +2422,7 @@  event_create_dir(struct dentry *parent, struct trace_event_file *file)
 {
 	struct trace_event_call *call = file->event_call;
 	struct trace_array *tr = file->tr;
-	struct dentry *d_events;
+	struct eventfs_file *ef_subsystem = NULL;
 	const char *name;
 	int ret;
 
@@ -2432,24 +2434,24 @@  event_create_dir(struct dentry *parent, struct trace_event_file *file)
 	if (WARN_ON_ONCE(strcmp(call->class->system, TRACE_SYSTEM) == 0))
 		return -ENODEV;
 
-	d_events = event_subsystem_dir(tr, call->class->system, file, parent);
-	if (!d_events)
+	ef_subsystem = event_subsystem_dir(tr, call->class->system, file, parent);
+	if (!ef_subsystem)
 		return -ENOMEM;
 
 	name = trace_event_name(call);
-	file->dir = tracefs_create_dir(name, d_events);
-	if (!file->dir) {
+	file->ef = eventfs_add_dir(name, ef_subsystem);
+	if (IS_ERR(file->ef)) {
 		pr_warn("Could not create tracefs '%s' directory\n", name);
 		return -1;
 	}
 
 	if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
-		trace_create_file("enable", TRACE_MODE_WRITE, file->dir, file,
+		eventfs_add_file("enable", TRACE_MODE_WRITE, file->ef, file,
 				  &ftrace_enable_fops);
 
 #ifdef CONFIG_PERF_EVENTS
 	if (call->event.type && call->class->reg)
-		trace_create_file("id", TRACE_MODE_READ, file->dir,
+		eventfs_add_file("id", TRACE_MODE_READ, file->ef,
 				  (void *)(long)call->event.type,
 				  &ftrace_event_id_fops);
 #endif
@@ -2465,27 +2467,27 @@  event_create_dir(struct dentry *parent, struct trace_event_file *file)
 	 * triggers or filters.
 	 */
 	if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) {
-		trace_create_file("filter", TRACE_MODE_WRITE, file->dir,
+		eventfs_add_file("filter", TRACE_MODE_WRITE, file->ef,
 				  file, &ftrace_event_filter_fops);
 
-		trace_create_file("trigger", TRACE_MODE_WRITE, file->dir,
+		eventfs_add_file("trigger", TRACE_MODE_WRITE, file->ef,
 				  file, &event_trigger_fops);
 	}
 
 #ifdef CONFIG_HIST_TRIGGERS
-	trace_create_file("hist", TRACE_MODE_READ, file->dir, file,
+	eventfs_add_file("hist", TRACE_MODE_READ, file->ef, file,
 			  &event_hist_fops);
 #endif
 #ifdef CONFIG_HIST_TRIGGERS_DEBUG
-	trace_create_file("hist_debug", TRACE_MODE_READ, file->dir, file,
+	eventfs_add_file("hist_debug", TRACE_MODE_READ, file->ef, file,
 			  &event_hist_debug_fops);
 #endif
-	trace_create_file("format", TRACE_MODE_READ, file->dir, call,
+	eventfs_add_file("format", TRACE_MODE_READ, file->ef, call,
 			  &ftrace_event_format_fops);
 
 #ifdef CONFIG_TRACE_EVENT_INJECT
 	if (call->event.type && call->class->reg)
-		trace_create_file("inject", 0200, file->dir, file,
+		eventfs_add_file("inject", 0200, file->ef, file,
 				  &event_inject_fops);
 #endif
 
@@ -3638,21 +3640,22 @@  create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
 {
 	struct dentry *d_events;
 	struct dentry *entry;
+	int error = 0;
 
 	entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent,
 				  tr, &ftrace_set_event_fops);
 	if (!entry)
 		return -ENOMEM;
 
-	d_events = tracefs_create_dir("events", parent);
-	if (!d_events) {
+	d_events = eventfs_create_events_dir("events", parent);
+	if (IS_ERR(d_events)) {
 		pr_warn("Could not create tracefs 'events' directory\n");
 		return -ENOMEM;
 	}
 
-	entry = trace_create_file("enable", TRACE_MODE_WRITE, d_events,
+	error = eventfs_add_top_file("enable", TRACE_MODE_WRITE, d_events,
 				  tr, &ftrace_tr_enable_fops);
-	if (!entry)
+	if (error)
 		return -ENOMEM;
 
 	/* There are not as crucial, just warn if they are not created */
@@ -3665,11 +3668,11 @@  create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
 			  &ftrace_set_event_notrace_pid_fops);
 
 	/* ring buffer internal formats */
-	trace_create_file("header_page", TRACE_MODE_READ, d_events,
+	eventfs_add_top_file("header_page", TRACE_MODE_READ, d_events,
 				  ring_buffer_print_page_header,
 				  &ftrace_show_header_fops);
 
-	trace_create_file("header_event", TRACE_MODE_READ, d_events,
+	eventfs_add_top_file("header_event", TRACE_MODE_READ, d_events,
 				  ring_buffer_print_entry_header,
 				  &ftrace_show_header_fops);
 
@@ -3757,7 +3760,7 @@  int event_trace_del_tracer(struct trace_array *tr)
 
 	down_write(&trace_event_sem);
 	__trace_remove_event_dirs(tr);
-	tracefs_remove(tr->event_dir);
+	eventfs_remove_events_dir(tr->event_dir);
 	up_write(&trace_event_sem);
 
 	tr->event_dir = NULL;