diff mbox series

[1/6] tracefs/eventfs: Use dput to free the toplevel events directory

Message ID 20230907024803.250873643@goodmis.org (mailing list archive)
State Accepted
Commit 9879e5e1c528d1ee9c4473b1d0668ba988bfb6ca
Headers show
Series tracing: Fix removing instances while reading/writing to their files | expand

Commit Message

Steven Rostedt Sept. 7, 2023, 2:47 a.m. UTC
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>

Currently when rmdir on an instance is done, eventfs_remove_events_dir()
is called and it does a dput on the dentry and then frees the
eventfs_inode that represents the events directory.

But there's no protection against a reader reading the top level events
directory at the same time and we can get a use after free error. Instead,
use the dput() associated to the dentry to also free the eventfs_inode
associated to the events directory, as that will get called when the last
reference to the directory is released.

Link: https://lore.kernel.org/all/1cb3aee2-19af-c472-e265-05176fe9bd84@huawei.com/

Cc: Ajay Kaher <akaher@vmware.com>
Fixes: 5bdcd5f5331a2 eventfs: ("Implement removal of meta data from eventfs")
Reported-by: Zheng Yejian <zhengyejian1@huawei.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
Changes since v1: https://lore.kernel.org/linux-trace-kernel/20230905183332.628d7cc0@gandalf.local.home
 - Removed left over "ei" variable (kernel test robot)

 fs/tracefs/event_inode.c | 17 ++++++++++++-----
 fs/tracefs/inode.c       |  2 +-
 fs/tracefs/internal.h    |  5 +++--
 3 files changed, 16 insertions(+), 8 deletions(-)

Comments

Ajay Kaher Sept. 7, 2023, 5:48 p.m. UTC | #1
> On 07-Sep-2023, at 8:17 AM, Steven Rostedt <rostedt@goodmis.org> wrote:
>
> !! External Email
>
> From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
>
> Currently when rmdir on an instance is done, eventfs_remove_events_dir()
> is called and it does a dput on the dentry and then frees the
> eventfs_inode that represents the events directory.
>
> But there's no protection against a reader reading the top level events
> directory at the same time and we can get a use after free error. Instead,
> use the dput() associated to the dentry to also free the eventfs_inode
> associated to the events directory, as that will get called when the last
> reference to the directory is released.
>

Nice catch Steve. Changes looks good to me.

Would like to know how did you map the backtrace with
use-after-free of eventfs_inode.

Thinking if same problem could happen for sub folder/files of eventfs as
free_ef() may get call earlier then dput().

-Ajay

> Link: https://lore.kernel.org/all/1cb3aee2-19af-c472-e265-05176fe9bd84@huawei.com/
>
> Cc: Ajay Kaher <akaher@vmware.com>
> Fixes: 5bdcd5f5331a2 eventfs: ("Implement removal of meta data from eventfs")
> Reported-by: Zheng Yejian <zhengyejian1@huawei.com>
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> ---
> Changes since v1: https://lore.kernel.org/linux-trace-kernel/20230905183332.628d7cc0@gandalf.local.home
> - Removed left over "ei" variable (kernel test robot)
>
> fs/tracefs/event_inode.c | 17 ++++++++++++-----
> fs/tracefs/inode.c       |  2 +-
> fs/tracefs/internal.h    |  5 +++--
> 3 files changed, 16 insertions(+), 8 deletions(-)
>
> diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
> index fa1a1679a886..609ccb5b7cfc 100644
> --- a/fs/tracefs/event_inode.c
> +++ b/fs/tracefs/event_inode.c
> @@ -185,17 +185,27 @@ static struct dentry *create_dir(const char *name, struct dentry *parent, void *
>
> /**
>  * eventfs_set_ef_status_free - set the ef->status to free
> + * @ti: the tracefs_inode of the dentry
>  * @dentry: dentry who's status to be freed
>  *
>  * eventfs_set_ef_status_free will be called if no more
>  * references remain
>  */
> -void eventfs_set_ef_status_free(struct dentry *dentry)
> +void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry)
> {
>        struct tracefs_inode *ti_parent;
> +       struct eventfs_inode *ei;
>        struct eventfs_file *ef;
>
>        mutex_lock(&eventfs_mutex);
> +
> +       /* The top level events directory may be freed by this */
> +       if (unlikely(ti->flags & TRACEFS_EVENT_TOP_INODE)) {
> +               ei = ti->private;
> +               kfree(ei);
> +               goto out;
> +       }
> +
>        ti_parent = get_tracefs(dentry->d_parent->d_inode);
>        if (!ti_parent || !(ti_parent->flags & TRACEFS_EVENT_INODE))
>                goto out;
> @@ -510,7 +520,7 @@ struct dentry *eventfs_create_events_dir(const char *name,
>        INIT_LIST_HEAD(&ei->e_top_files);
>
>        ti = get_tracefs(inode);
> -       ti->flags |= TRACEFS_EVENT_INODE;
> +       ti->flags |= TRACEFS_EVENT_INODE | TRACEFS_EVENT_TOP_INODE;
>        ti->private = ei;
>
>        inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
> @@ -806,7 +816,6 @@ void eventfs_remove(struct eventfs_file *ef)
> void eventfs_remove_events_dir(struct dentry *dentry)
> {
>        struct tracefs_inode *ti;
> -       struct eventfs_inode *ei;
>
>        if (!dentry || !dentry->d_inode)
>                return;
> @@ -815,8 +824,6 @@ void eventfs_remove_events_dir(struct dentry *dentry)
>        if (!ti || !(ti->flags & TRACEFS_EVENT_INODE))
>                return;
>
> -       ei = ti->private;
>        d_invalidate(dentry);
>        dput(dentry);
> -       kfree(ei);
> }
> diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
> index 3b8dd938b1c8..891653ba9cf3 100644
> --- a/fs/tracefs/inode.c
> +++ b/fs/tracefs/inode.c
> @@ -385,7 +385,7 @@ static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode)
>
>        ti = get_tracefs(inode);
>        if (ti && ti->flags & TRACEFS_EVENT_INODE)
> -               eventfs_set_ef_status_free(dentry);
> +               eventfs_set_ef_status_free(ti, dentry);
>        iput(inode);
> }
>
> diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h
> index 69c2b1d87c46..4f2e49e2197b 100644
> --- a/fs/tracefs/internal.h
> +++ b/fs/tracefs/internal.h
> @@ -3,7 +3,8 @@
> #define _TRACEFS_INTERNAL_H
>
> enum {
> -       TRACEFS_EVENT_INODE     = BIT(1),
> +       TRACEFS_EVENT_INODE             = BIT(1),
> +       TRACEFS_EVENT_TOP_INODE         = BIT(2),
> };
>
> struct tracefs_inode {
> @@ -24,6 +25,6 @@ struct inode *tracefs_get_inode(struct super_block *sb);
> struct dentry *eventfs_start_creating(const char *name, struct dentry *parent);
> struct dentry *eventfs_failed_creating(struct dentry *dentry);
> struct dentry *eventfs_end_creating(struct dentry *dentry);
> -void eventfs_set_ef_status_free(struct dentry *dentry);
> +void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry);
>
> #endif /* _TRACEFS_INTERNAL_H */
> --
> 2.40.1
>
> !! External Email: This email originated from outside of the organization. Do not click links or open attachments unless you recognize the sender.
Steven Rostedt Sept. 7, 2023, 7:39 p.m. UTC | #2
On Thu, 7 Sep 2023 17:48:21 +0000
Ajay Kaher <akaher@vmware.com> wrote:

> Nice catch Steve. Changes looks good to me.

I'll add your Reviewed-by then?

> 
> Would like to know how did you map the backtrace with
> use-after-free of eventfs_inode.
> 
> Thinking if same problem could happen for sub folder/files of eventfs as
> free_ef() may get call earlier then dput().

I can also post the output. Here's the full dump:

 [  175.090834] ==================================================================
 [  175.092850] BUG: KASAN: slab-use-after-free in eventfs_root_lookup+0x88/0x1b0
 [  175.094635] Read of size 8 at addr ffff888120130ca0 by task ftracetest/1201
 [  175.096375] 
 [  175.096775] CPU: 4 PID: 1201 Comm: ftracetest Not tainted 6.5.0-test-10737-g469e0a8194e7 #13
 [  175.098727] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
 [  175.100877] Call Trace:
 [  175.101452]  <TASK>
 [  175.101955]  dump_stack_lvl+0x57/0x90
 [  175.102798]  print_report+0xcf/0x670
 [  175.103617]  ? __pfx_ring_buffer_record_off+0x10/0x10
 [  175.104752]  ? _raw_spin_lock_irqsave+0x2b/0x70
 [  175.105724]  ? __virt_addr_valid+0xd9/0x160
 [  175.106626]  kasan_report+0xd4/0x110
 [  175.107401]  ? eventfs_root_lookup+0x88/0x1b0
 [  175.108357]  ? eventfs_root_lookup+0x88/0x1b0
 [  175.109266]  eventfs_root_lookup+0x88/0x1b0
 [  175.110117]  ? eventfs_root_lookup+0x33/0x1b0
 [  175.111013]  __lookup_slow+0x194/0x2a0
 [  175.111785]  ? __pfx___lookup_slow+0x10/0x10
 [  175.112660]  ? down_read+0x11c/0x330
 [  175.113381]  walk_component+0x166/0x220
 [  175.114131]  link_path_walk.part.0.constprop.0+0x3a3/0x5a0
 [  175.115185]  ? seqcount_lockdep_reader_access+0x82/0x90
 [  175.116199]  ? __pfx_link_path_walk.part.0.constprop.0+0x10/0x10
 [  175.117353]  path_openat+0x143/0x11f0
 [  175.118073]  ? __lock_acquire+0xa1a/0x3220
 [  175.118874]  ? __pfx_path_openat+0x10/0x10
 [  175.119668]  ? __pfx___lock_acquire+0x10/0x10
 [  175.120535]  do_filp_open+0x166/0x290
 [  175.121259]  ? __pfx_do_filp_open+0x10/0x10
 [  175.122070]  ? lock_is_held_type+0xce/0x120
 [  175.122899]  ? preempt_count_sub+0xb7/0x100
 [  175.123714]  ? _raw_spin_unlock+0x29/0x50
 [  175.124504]  ? alloc_fd+0x1a0/0x320
 [  175.125202]  do_sys_openat2+0x126/0x160
 [  175.125955]  ? rcu_is_watching+0x34/0x60
 [  175.126728]  ? __pfx_do_sys_openat2+0x10/0x10
 [  175.127579]  ? __might_resched+0x2cf/0x3b0
 [  175.128391]  ? __fget_light+0xdf/0x100
 [  175.129138]  __x64_sys_openat+0xcd/0x140
 [  175.129906]  ? __pfx___x64_sys_openat+0x10/0x10
 [  175.133418]  ? syscall_enter_from_user_mode+0x22/0x90
 [  175.136987]  ? lockdep_hardirqs_on+0x7d/0x100
 [  175.140421]  do_syscall_64+0x3b/0xc0
 [  175.143605]  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
 [  175.147005] RIP: 0033:0x7f1dceef5e51
 [  175.150173] Code: 75 57 89 f0 25 00 00 41 00 3d 00 00 41 00 74 49 80 3d 9a 27 0e 00 00 74 6d 89 da 48 89 ee bf 9c ff ff ff b8 01 01 00 00 0f 05 <48> 3d 00 f0 ff ff 0f 87 93 00 00 00 48 8b 54 24 28 64 48 2b 14 25
 [  175.158706] RSP: 002b:00007fff2cddf380 EFLAGS: 00000202 ORIG_RAX: 0000000000000101
 [  175.162676] RAX: ffffffffffffffda RBX: 0000000000000241 RCX: 00007f1dceef5e51
 [  175.166579] RDX: 0000000000000241 RSI: 000055d7520677d0 RDI: 00000000ffffff9c
 [  175.170442] RBP: 000055d7520677d0 R08: 000000000000001e R09: 0000000000000001
 [  175.174238] R10: 00000000000001b6 R11: 0000000000000202 R12: 0000000000000000
 [  175.177960] R13: 0000000000000003 R14: 000055d752035678 R15: 000055d752067788
 [  175.181738]  </TASK>
 [  175.184665] 
 [  175.187443] Allocated by task 1200:
 [  175.190554]  kasan_save_stack+0x2f/0x50
 [  175.193712]  kasan_set_track+0x21/0x30
 [  175.196836]  __kasan_kmalloc+0x8b/0x90
 [  175.199939]  eventfs_create_events_dir+0x54/0x220
 [  175.203234]  create_event_toplevel_files+0x42/0x130
 [  175.206609]  event_trace_add_tracer+0x33/0x180
 [  175.209934]  trace_array_create_dir+0x52/0xf0
 [  175.213222]  trace_array_create+0x361/0x410
 [  175.216461]  instance_mkdir+0x6b/0xb0
 [  175.219592]  tracefs_syscall_mkdir+0x57/0x80
 [  175.222852]  vfs_mkdir+0x275/0x380
 [  175.225949]  do_mkdirat+0x1da/0x210
 [  175.229048]  __x64_sys_mkdir+0x74/0xa0
 [  175.232161]  do_syscall_64+0x3b/0xc0
 [  175.235211]  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
 [  175.238519] 
 [  175.241210] Freed by task 1251:
 [  175.244190]  kasan_save_stack+0x2f/0x50
 [  175.247293]  kasan_set_track+0x21/0x30
 [  175.250375]  kasan_save_free_info+0x27/0x40
 [  175.253517]  __kasan_slab_free+0x106/0x180
 [  175.256637]  __kmem_cache_free+0x149/0x2e0
 [  175.259749]  event_trace_del_tracer+0xcb/0x120
 [  175.262940]  __remove_instance+0x16a/0x340
 [  175.266080]  instance_rmdir+0x77/0xa0
 [  175.269139]  tracefs_syscall_rmdir+0x77/0xc0
 [  175.272323]  vfs_rmdir+0xed/0x2d0
 [  175.275321]  do_rmdir+0x235/0x280
 [  175.278327]  __x64_sys_rmdir+0x5f/0x90
 [  175.281395]  do_syscall_64+0x3b/0xc0
 [  175.284433]  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
 [  175.287725] 
 [  175.290402] The buggy address belongs to the object at ffff888120130ca0
 [  175.290402]  which belongs to the cache kmalloc-16 of size 16
 [  175.297419] The buggy address is located 0 bytes inside of
 [  175.297419]  freed 16-byte region [ffff888120130ca0, ffff888120130cb0)
 [  175.304477] 
 [  175.307155] The buggy address belongs to the physical page:
 [  175.310514] page:000000004dbddbb0 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x120130
 [  175.314581] flags: 0x17ffffc0000800(slab|node=0|zone=2|lastcpupid=0x1fffff)
 [  175.318254] page_type: 0xffffffff()
 [  175.321321] raw: 0017ffffc0000800 ffff8881000423c0 dead000000000122 0000000000000000
 [  175.325190] raw: 0000000000000000 0000000000800080 00000001ffffffff 0000000000000000
 [  175.329080] page dumped because: kasan: bad access detected
 [  175.332608] 
 [  175.335350] Memory state around the buggy address:
 [  175.338752]  ffff888120130b80: 00 00 fc fc 00 05 fc fc 00 00 fc fc 00 02 fc fc
 [  175.342664]  ffff888120130c00: 00 07 fc fc 00 00 fc fc 00 00 fc fc fa fb fc fc
 [  175.346537] >ffff888120130c80: 00 00 fc fc fa fb fc fc 00 00 fc fc 00 00 fc fc
 [  175.350391]                                ^
 [  175.353753]  ffff888120130d00: 00 00 fc fc 00 00 fc fc 00 00 fc fc fa fb fc fc
 [  175.357707]  ffff888120130d80: 00 00 fc fc 00 00 fc fc 00 00 fc fc 00 00 fc fc
 [  175.361621] ==================================================================


In the "Freed by task" above, the event_trace_del_tracer+0xcb is the tail
call from the kfree(ei) in eventfs_remove_events_dir(). It's a tail call
so that screws up the unwind in ORC, as that address returned the next line
after the call to eventfs_remove_events_dir().

I can add this to the commit change log. And also I'll update the change
log of the other patch that shows its KASAN dump.

-- Steve
Steven Rostedt Sept. 7, 2023, 9:56 p.m. UTC | #3
On Thu, 7 Sep 2023 17:48:21 +0000
Ajay Kaher <akaher@vmware.com> wrote:

> Thinking if same problem could happen for sub folder/files of eventfs as
> free_ef() may get call earlier then dput().

Actually, they are not freed at all! Another patch coming.

-- Steve
Masami Hiramatsu (Google) Sept. 8, 2023, 7:45 a.m. UTC | #4
Hi, 

On Wed, 06 Sep 2023 22:47:11 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
> 
> Currently when rmdir on an instance is done, eventfs_remove_events_dir()
> is called and it does a dput on the dentry and then frees the
> eventfs_inode that represents the events directory.
> 
> But there's no protection against a reader reading the top level events
> directory at the same time and we can get a use after free error. Instead,
> use the dput() associated to the dentry to also free the eventfs_inode
> associated to the events directory, as that will get called when the last
> reference to the directory is released.
> 
> Link: https://lore.kernel.org/all/1cb3aee2-19af-c472-e265-05176fe9bd84@huawei.com/
> 
> Cc: Ajay Kaher <akaher@vmware.com>
> Fixes: 5bdcd5f5331a2 eventfs: ("Implement removal of meta data from eventfs")
> Reported-by: Zheng Yejian <zhengyejian1@huawei.com>
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> ---
> Changes since v1: https://lore.kernel.org/linux-trace-kernel/20230905183332.628d7cc0@gandalf.local.home
>  - Removed left over "ei" variable (kernel test robot)
> 
>  fs/tracefs/event_inode.c | 17 ++++++++++++-----
>  fs/tracefs/inode.c       |  2 +-
>  fs/tracefs/internal.h    |  5 +++--
>  3 files changed, 16 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
> index fa1a1679a886..609ccb5b7cfc 100644
> --- a/fs/tracefs/event_inode.c
> +++ b/fs/tracefs/event_inode.c
> @@ -185,17 +185,27 @@ static struct dentry *create_dir(const char *name, struct dentry *parent, void *
>  
>  /**
>   * eventfs_set_ef_status_free - set the ef->status to free
> + * @ti: the tracefs_inode of the dentry
>   * @dentry: dentry who's status to be freed
>   *
>   * eventfs_set_ef_status_free will be called if no more
>   * references remain
>   */
> -void eventfs_set_ef_status_free(struct dentry *dentry)
> +void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry)
>  {
>  	struct tracefs_inode *ti_parent;
> +	struct eventfs_inode *ei;
>  	struct eventfs_file *ef;
>  
>  	mutex_lock(&eventfs_mutex);
> +
> +	/* The top level events directory may be freed by this */
> +	if (unlikely(ti->flags & TRACEFS_EVENT_TOP_INODE)) {
> +		ei = ti->private;
> +		kfree(ei);

Don't we need to clear 'ti->private' here to avoid accessing
(or double free) ti->private somewhare?

> +		goto out;
> +	}
> +
>  	ti_parent = get_tracefs(dentry->d_parent->d_inode);
>  	if (!ti_parent || !(ti_parent->flags & TRACEFS_EVENT_INODE))
>  		goto out;

Here, I guess this "!(ti_parent->flags & TRACEFS_EVENT_INODE)" means this
inode is TRACEFS_EVENT_TOP_INODE, so this check may not be needed,
is this correct?

Thank you,

> @@ -510,7 +520,7 @@ struct dentry *eventfs_create_events_dir(const char *name,
>  	INIT_LIST_HEAD(&ei->e_top_files);
>  
>  	ti = get_tracefs(inode);
> -	ti->flags |= TRACEFS_EVENT_INODE;
> +	ti->flags |= TRACEFS_EVENT_INODE | TRACEFS_EVENT_TOP_INODE;
>  	ti->private = ei;
>  
>  	inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
> @@ -806,7 +816,6 @@ void eventfs_remove(struct eventfs_file *ef)
>  void eventfs_remove_events_dir(struct dentry *dentry)
>  {
>  	struct tracefs_inode *ti;
> -	struct eventfs_inode *ei;
>  
>  	if (!dentry || !dentry->d_inode)
>  		return;
> @@ -815,8 +824,6 @@ void eventfs_remove_events_dir(struct dentry *dentry)
>  	if (!ti || !(ti->flags & TRACEFS_EVENT_INODE))
>  		return;
>  
> -	ei = ti->private;
>  	d_invalidate(dentry);
>  	dput(dentry);
> -	kfree(ei);
>  }
> diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
> index 3b8dd938b1c8..891653ba9cf3 100644
> --- a/fs/tracefs/inode.c
> +++ b/fs/tracefs/inode.c
> @@ -385,7 +385,7 @@ static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode)
>  
>  	ti = get_tracefs(inode);
>  	if (ti && ti->flags & TRACEFS_EVENT_INODE)
> -		eventfs_set_ef_status_free(dentry);
> +		eventfs_set_ef_status_free(ti, dentry);
>  	iput(inode);
>  }
>  
> diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h
> index 69c2b1d87c46..4f2e49e2197b 100644
> --- a/fs/tracefs/internal.h
> +++ b/fs/tracefs/internal.h
> @@ -3,7 +3,8 @@
>  #define _TRACEFS_INTERNAL_H
>  
>  enum {
> -	TRACEFS_EVENT_INODE     = BIT(1),
> +	TRACEFS_EVENT_INODE		= BIT(1),
> +	TRACEFS_EVENT_TOP_INODE		= BIT(2),
>  };
>  
>  struct tracefs_inode {
> @@ -24,6 +25,6 @@ struct inode *tracefs_get_inode(struct super_block *sb);
>  struct dentry *eventfs_start_creating(const char *name, struct dentry *parent);
>  struct dentry *eventfs_failed_creating(struct dentry *dentry);
>  struct dentry *eventfs_end_creating(struct dentry *dentry);
> -void eventfs_set_ef_status_free(struct dentry *dentry);
> +void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry);
>  
>  #endif /* _TRACEFS_INTERNAL_H */
> -- 
> 2.40.1
Steven Rostedt Sept. 9, 2023, 3:09 a.m. UTC | #5
On Fri, 8 Sep 2023 16:45:53 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> Hi, 
> 
> On Wed, 06 Sep 2023 22:47:11 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
> > 
> > Currently when rmdir on an instance is done, eventfs_remove_events_dir()
> > is called and it does a dput on the dentry and then frees the
> > eventfs_inode that represents the events directory.
> > 
> > But there's no protection against a reader reading the top level events
> > directory at the same time and we can get a use after free error. Instead,
> > use the dput() associated to the dentry to also free the eventfs_inode
> > associated to the events directory, as that will get called when the last
> > reference to the directory is released.
> > 
> > Link: https://lore.kernel.org/all/1cb3aee2-19af-c472-e265-05176fe9bd84@huawei.com/
> > 
> > Cc: Ajay Kaher <akaher@vmware.com>
> > Fixes: 5bdcd5f5331a2 eventfs: ("Implement removal of meta data from eventfs")
> > Reported-by: Zheng Yejian <zhengyejian1@huawei.com>
> > Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> > ---
> > Changes since v1: https://lore.kernel.org/linux-trace-kernel/20230905183332.628d7cc0@gandalf.local.home
> >  - Removed left over "ei" variable (kernel test robot)
> > 
> >  fs/tracefs/event_inode.c | 17 ++++++++++++-----
> >  fs/tracefs/inode.c       |  2 +-
> >  fs/tracefs/internal.h    |  5 +++--
> >  3 files changed, 16 insertions(+), 8 deletions(-)
> > 
> > diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
> > index fa1a1679a886..609ccb5b7cfc 100644
> > --- a/fs/tracefs/event_inode.c
> > +++ b/fs/tracefs/event_inode.c
> > @@ -185,17 +185,27 @@ static struct dentry *create_dir(const char *name, struct dentry *parent, void *
> >  
> >  /**
> >   * eventfs_set_ef_status_free - set the ef->status to free
> > + * @ti: the tracefs_inode of the dentry
> >   * @dentry: dentry who's status to be freed
> >   *
> >   * eventfs_set_ef_status_free will be called if no more
> >   * references remain
> >   */
> > -void eventfs_set_ef_status_free(struct dentry *dentry)
> > +void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry)
> >  {
> >  	struct tracefs_inode *ti_parent;
> > +	struct eventfs_inode *ei;
> >  	struct eventfs_file *ef;
> >  
> >  	mutex_lock(&eventfs_mutex);
> > +
> > +	/* The top level events directory may be freed by this */
> > +	if (unlikely(ti->flags & TRACEFS_EVENT_TOP_INODE)) {
> > +		ei = ti->private;
> > +		kfree(ei);  
> 
> Don't we need to clear 'ti->private' here to avoid accessing
> (or double free) ti->private somewhare?

I don't think it's needed but I did add it to

  https://lore.kernel.org/linux-trace-kernel/20230907175859.6fedbaa2@gandalf.local.home/

Which you reviewed.

> 
> > +		goto out;
> > +	}
> > +
> >  	ti_parent = get_tracefs(dentry->d_parent->d_inode);
> >  	if (!ti_parent || !(ti_parent->flags & TRACEFS_EVENT_INODE))
> >  		goto out;  
> 
> Here, I guess this "!(ti_parent->flags & TRACEFS_EVENT_INODE)" means this
> inode is TRACEFS_EVENT_TOP_INODE, so this check may not be needed,
> is this correct?

The check isn't needed but I like to keep it because it will break things
badly if it is every called on something that is not an EVENT_INODE.

We could add a WARN() here if not, but this code is not critical if it is
called without it set.

-- Steve
diff mbox series

Patch

diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index fa1a1679a886..609ccb5b7cfc 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -185,17 +185,27 @@  static struct dentry *create_dir(const char *name, struct dentry *parent, void *
 
 /**
  * eventfs_set_ef_status_free - set the ef->status to free
+ * @ti: the tracefs_inode of the dentry
  * @dentry: dentry who's status to be freed
  *
  * eventfs_set_ef_status_free will be called if no more
  * references remain
  */
-void eventfs_set_ef_status_free(struct dentry *dentry)
+void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry)
 {
 	struct tracefs_inode *ti_parent;
+	struct eventfs_inode *ei;
 	struct eventfs_file *ef;
 
 	mutex_lock(&eventfs_mutex);
+
+	/* The top level events directory may be freed by this */
+	if (unlikely(ti->flags & TRACEFS_EVENT_TOP_INODE)) {
+		ei = ti->private;
+		kfree(ei);
+		goto out;
+	}
+
 	ti_parent = get_tracefs(dentry->d_parent->d_inode);
 	if (!ti_parent || !(ti_parent->flags & TRACEFS_EVENT_INODE))
 		goto out;
@@ -510,7 +520,7 @@  struct dentry *eventfs_create_events_dir(const char *name,
 	INIT_LIST_HEAD(&ei->e_top_files);
 
 	ti = get_tracefs(inode);
-	ti->flags |= TRACEFS_EVENT_INODE;
+	ti->flags |= TRACEFS_EVENT_INODE | TRACEFS_EVENT_TOP_INODE;
 	ti->private = ei;
 
 	inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
@@ -806,7 +816,6 @@  void eventfs_remove(struct eventfs_file *ef)
 void eventfs_remove_events_dir(struct dentry *dentry)
 {
 	struct tracefs_inode *ti;
-	struct eventfs_inode *ei;
 
 	if (!dentry || !dentry->d_inode)
 		return;
@@ -815,8 +824,6 @@  void eventfs_remove_events_dir(struct dentry *dentry)
 	if (!ti || !(ti->flags & TRACEFS_EVENT_INODE))
 		return;
 
-	ei = ti->private;
 	d_invalidate(dentry);
 	dput(dentry);
-	kfree(ei);
 }
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 3b8dd938b1c8..891653ba9cf3 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -385,7 +385,7 @@  static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode)
 
 	ti = get_tracefs(inode);
 	if (ti && ti->flags & TRACEFS_EVENT_INODE)
-		eventfs_set_ef_status_free(dentry);
+		eventfs_set_ef_status_free(ti, dentry);
 	iput(inode);
 }
 
diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h
index 69c2b1d87c46..4f2e49e2197b 100644
--- a/fs/tracefs/internal.h
+++ b/fs/tracefs/internal.h
@@ -3,7 +3,8 @@ 
 #define _TRACEFS_INTERNAL_H
 
 enum {
-	TRACEFS_EVENT_INODE     = BIT(1),
+	TRACEFS_EVENT_INODE		= BIT(1),
+	TRACEFS_EVENT_TOP_INODE		= BIT(2),
 };
 
 struct tracefs_inode {
@@ -24,6 +25,6 @@  struct inode *tracefs_get_inode(struct super_block *sb);
 struct dentry *eventfs_start_creating(const char *name, struct dentry *parent);
 struct dentry *eventfs_failed_creating(struct dentry *dentry);
 struct dentry *eventfs_end_creating(struct dentry *dentry);
-void eventfs_set_ef_status_free(struct dentry *dentry);
+void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry);
 
 #endif /* _TRACEFS_INTERNAL_H */