Message ID | 20250304-work-pidfs-kill_on_last_close-v2-5-44fdacfaa7b7@kernel.org (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | pidfs: provide information after task has been reaped | expand |
On Tue, 2025-03-04 at 10:41 +0100, Christian Brauner wrote: > Record the exit code and cgroupid in do_exit() and stash in struct > pidfs_exit_info so it can be retrieved even after the task has been > reaped. > > Signed-off-by: Christian Brauner <brauner@kernel.org> > --- > fs/internal.h | 1 + > fs/libfs.c | 4 ++-- > fs/pidfs.c | 41 ++++++++++++++++++++++++++++++++++++++++- > include/linux/pidfs.h | 1 + > kernel/exit.c | 2 ++ > 5 files changed, 46 insertions(+), 3 deletions(-) > > diff --git a/fs/internal.h b/fs/internal.h > index e7f02ae1e098..c1e6d8b294cb 100644 > --- a/fs/internal.h > +++ b/fs/internal.h > @@ -325,6 +325,7 @@ struct stashed_operations { > int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, > struct path *path); > void stashed_dentry_prune(struct dentry *dentry); > +struct dentry *stashed_dentry_get(struct dentry **stashed); > /** > * path_mounted - check whether path is mounted > * @path: path to check > diff --git a/fs/libfs.c b/fs/libfs.c > index 8444f5cc4064..cf5a267aafe4 100644 > --- a/fs/libfs.c > +++ b/fs/libfs.c > @@ -2113,7 +2113,7 @@ struct timespec64 simple_inode_init_ts(struct inode *inode) > } > EXPORT_SYMBOL(simple_inode_init_ts); > > -static inline struct dentry *get_stashed_dentry(struct dentry **stashed) > +struct dentry *stashed_dentry_get(struct dentry **stashed) > { > struct dentry *dentry; > > @@ -2215,7 +2215,7 @@ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, > const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info; > > /* See if dentry can be reused. */ > - path->dentry = get_stashed_dentry(stashed); > + path->dentry = stashed_dentry_get(stashed); > if (path->dentry) { > sops->put_data(data); > goto out_path; > diff --git a/fs/pidfs.c b/fs/pidfs.c > index eaecb0a947f0..258e1c13ee56 100644 > --- a/fs/pidfs.c > +++ b/fs/pidfs.c > @@ -32,7 +32,7 @@ static struct kmem_cache *pidfs_cachep __ro_after_init; > */ > struct pidfs_exit_info { > __u64 cgroupid; > - __u64 exit_code; > + __s32 exit_code; ^^^ The above delta should be folded into the previous patch. > }; > > struct pidfs_inode { > @@ -458,6 +458,45 @@ struct pid *pidfd_pid(const struct file *file) > return file_inode(file)->i_private; > } > > +/* > + * We're called from release_task(). We know there's at least one > + * reference to struct pid being held that won't be released until the > + * task has been reaped which cannot happen until we're out of > + * release_task(). > + * > + * If this struct pid is refered to by a pidfd then stashed_dentry_get() > + * will return the dentry and inode for that struct pid. Since we've > + * taken a reference on it there's now an additional reference from the > + * exit path on it. Which is fine. We're going to put it again in a > + * second and we know that the pid is kept alive anyway. > + * > + * Worst case is that we've filled in the info and immediately free the > + * dentry and inode afterwards since the pidfd has been closed. Since > + * pidfs_exit() currently is placed after exit_task_work() we know that > + * it cannot be us aka the exiting task holding a pidfd to ourselves. > + */ That is a subtle interaction. Thanks for the comment! > +void pidfs_exit(struct task_struct *tsk) > +{ > + struct dentry *dentry; > + > + dentry = stashed_dentry_get(&task_pid(tsk)->stashed); > + if (dentry) { > + struct inode *inode = d_inode(dentry); > + struct pidfs_exit_info *exit_info = &pidfs_i(inode)->exit_info; > +#ifdef CONFIG_CGROUPS > + struct cgroup *cgrp; > + > + rcu_read_lock(); > + cgrp = task_dfl_cgroup(tsk); > + exit_info->cgroupid = cgroup_id(cgrp); > + rcu_read_unlock(); > +#endif > + exit_info->exit_code = tsk->exit_code; > + > + dput(dentry); > + } > +} > + > static struct vfsmount *pidfs_mnt __ro_after_init; > > /* > diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h > index 7c830d0dec9a..05e6f8f4a026 100644 > --- a/include/linux/pidfs.h > +++ b/include/linux/pidfs.h > @@ -6,6 +6,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); > void __init pidfs_init(void); > void pidfs_add_pid(struct pid *pid); > void pidfs_remove_pid(struct pid *pid); > +void pidfs_exit(struct task_struct *tsk); > extern const struct dentry_operations pidfs_dentry_operations; > > #endif /* _LINUX_PID_FS_H */ > diff --git a/kernel/exit.c b/kernel/exit.c > index 3485e5fc499e..98d292120296 100644 > --- a/kernel/exit.c > +++ b/kernel/exit.c > @@ -69,6 +69,7 @@ > #include <linux/sysfs.h> > #include <linux/user_events.h> > #include <linux/uaccess.h> > +#include <linux/pidfs.h> > > #include <uapi/linux/wait.h> > > @@ -254,6 +255,7 @@ void release_task(struct task_struct *p) > write_lock_irq(&tasklist_lock); > ptrace_release_task(p); > thread_pid = get_pid(p->thread_pid); > + pidfs_exit(p); > __exit_signal(p); > > /* > Reviewed-by: Jeff Layton <jlayton@kernel.org>
I will read this series later, but I see nothing wrong after a quick glance. Minor nit below... On 03/04, Christian Brauner wrote: > > Record the exit code and cgroupid in do_exit() ^^^^^^^^^^^^ this is no longer true. In release_task(). > @@ -254,6 +255,7 @@ void release_task(struct task_struct *p) > write_lock_irq(&tasklist_lock); > ptrace_release_task(p); > thread_pid = get_pid(p->thread_pid); > + pidfs_exit(p); > __exit_signal(p); And the next patch rightly moves pidfs_exit() up outside of tasklist. Why not call it before write_lock_irq(&tasklist_lock) from the very beginning? Oleg.
On Tue, Mar 04, 2025 at 02:10:18PM +0100, Oleg Nesterov wrote: > I will read this series later, but I see nothing wrong after a quick glance. > > Minor nit below... > > On 03/04, Christian Brauner wrote: > > > > Record the exit code and cgroupid in do_exit() > ^^^^^^^^^^^^ > this is no longer true. In release_task(). Yes, thanks for catching this! > > > @@ -254,6 +255,7 @@ void release_task(struct task_struct *p) > > write_lock_irq(&tasklist_lock); > > ptrace_release_task(p); > > thread_pid = get_pid(p->thread_pid); > > + pidfs_exit(p); > > __exit_signal(p); > > And the next patch rightly moves pidfs_exit() up outside of tasklist. > > Why not call it before write_lock_irq(&tasklist_lock) from the very > beginning? Yes, sorry that was my intention. pidfs_exit() can sleep due to dput(). Thanks!
diff --git a/fs/internal.h b/fs/internal.h index e7f02ae1e098..c1e6d8b294cb 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -325,6 +325,7 @@ struct stashed_operations { int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, struct path *path); void stashed_dentry_prune(struct dentry *dentry); +struct dentry *stashed_dentry_get(struct dentry **stashed); /** * path_mounted - check whether path is mounted * @path: path to check diff --git a/fs/libfs.c b/fs/libfs.c index 8444f5cc4064..cf5a267aafe4 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -2113,7 +2113,7 @@ struct timespec64 simple_inode_init_ts(struct inode *inode) } EXPORT_SYMBOL(simple_inode_init_ts); -static inline struct dentry *get_stashed_dentry(struct dentry **stashed) +struct dentry *stashed_dentry_get(struct dentry **stashed) { struct dentry *dentry; @@ -2215,7 +2215,7 @@ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info; /* See if dentry can be reused. */ - path->dentry = get_stashed_dentry(stashed); + path->dentry = stashed_dentry_get(stashed); if (path->dentry) { sops->put_data(data); goto out_path; diff --git a/fs/pidfs.c b/fs/pidfs.c index eaecb0a947f0..258e1c13ee56 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -32,7 +32,7 @@ static struct kmem_cache *pidfs_cachep __ro_after_init; */ struct pidfs_exit_info { __u64 cgroupid; - __u64 exit_code; + __s32 exit_code; }; struct pidfs_inode { @@ -458,6 +458,45 @@ struct pid *pidfd_pid(const struct file *file) return file_inode(file)->i_private; } +/* + * We're called from release_task(). We know there's at least one + * reference to struct pid being held that won't be released until the + * task has been reaped which cannot happen until we're out of + * release_task(). + * + * If this struct pid is refered to by a pidfd then stashed_dentry_get() + * will return the dentry and inode for that struct pid. Since we've + * taken a reference on it there's now an additional reference from the + * exit path on it. Which is fine. We're going to put it again in a + * second and we know that the pid is kept alive anyway. + * + * Worst case is that we've filled in the info and immediately free the + * dentry and inode afterwards since the pidfd has been closed. Since + * pidfs_exit() currently is placed after exit_task_work() we know that + * it cannot be us aka the exiting task holding a pidfd to ourselves. + */ +void pidfs_exit(struct task_struct *tsk) +{ + struct dentry *dentry; + + dentry = stashed_dentry_get(&task_pid(tsk)->stashed); + if (dentry) { + struct inode *inode = d_inode(dentry); + struct pidfs_exit_info *exit_info = &pidfs_i(inode)->exit_info; +#ifdef CONFIG_CGROUPS + struct cgroup *cgrp; + + rcu_read_lock(); + cgrp = task_dfl_cgroup(tsk); + exit_info->cgroupid = cgroup_id(cgrp); + rcu_read_unlock(); +#endif + exit_info->exit_code = tsk->exit_code; + + dput(dentry); + } +} + static struct vfsmount *pidfs_mnt __ro_after_init; /* diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 7c830d0dec9a..05e6f8f4a026 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -6,6 +6,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); void __init pidfs_init(void); void pidfs_add_pid(struct pid *pid); void pidfs_remove_pid(struct pid *pid); +void pidfs_exit(struct task_struct *tsk); extern const struct dentry_operations pidfs_dentry_operations; #endif /* _LINUX_PID_FS_H */ diff --git a/kernel/exit.c b/kernel/exit.c index 3485e5fc499e..98d292120296 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -69,6 +69,7 @@ #include <linux/sysfs.h> #include <linux/user_events.h> #include <linux/uaccess.h> +#include <linux/pidfs.h> #include <uapi/linux/wait.h> @@ -254,6 +255,7 @@ void release_task(struct task_struct *p) write_lock_irq(&tasklist_lock); ptrace_release_task(p); thread_pid = get_pid(p->thread_pid); + pidfs_exit(p); __exit_signal(p); /*
Record the exit code and cgroupid in do_exit() and stash in struct pidfs_exit_info so it can be retrieved even after the task has been reaped. Signed-off-by: Christian Brauner <brauner@kernel.org> --- fs/internal.h | 1 + fs/libfs.c | 4 ++-- fs/pidfs.c | 41 ++++++++++++++++++++++++++++++++++++++++- include/linux/pidfs.h | 1 + kernel/exit.c | 2 ++ 5 files changed, 46 insertions(+), 3 deletions(-)