diff mbox series

[v2,05/15] pidfs: record exit code and cgroupid at exit

Message ID 20250304-work-pidfs-kill_on_last_close-v2-5-44fdacfaa7b7@kernel.org (mailing list archive)
State New
Headers show
Series pidfs: provide information after task has been reaped | expand

Commit Message

Christian Brauner March 4, 2025, 9:41 a.m. UTC
Record the exit code and cgroupid in do_exit() and stash in struct
pidfs_exit_info so it can be retrieved even after the task has been
reaped.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/internal.h         |  1 +
 fs/libfs.c            |  4 ++--
 fs/pidfs.c            | 41 ++++++++++++++++++++++++++++++++++++++++-
 include/linux/pidfs.h |  1 +
 kernel/exit.c         |  2 ++
 5 files changed, 46 insertions(+), 3 deletions(-)

Comments

Jeff Layton March 4, 2025, 1:05 p.m. UTC | #1
On Tue, 2025-03-04 at 10:41 +0100, Christian Brauner wrote:
> Record the exit code and cgroupid in do_exit() and stash in struct
> pidfs_exit_info so it can be retrieved even after the task has been
> reaped.
> 
> Signed-off-by: Christian Brauner <brauner@kernel.org>
> ---
>  fs/internal.h         |  1 +
>  fs/libfs.c            |  4 ++--
>  fs/pidfs.c            | 41 ++++++++++++++++++++++++++++++++++++++++-
>  include/linux/pidfs.h |  1 +
>  kernel/exit.c         |  2 ++
>  5 files changed, 46 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/internal.h b/fs/internal.h
> index e7f02ae1e098..c1e6d8b294cb 100644
> --- a/fs/internal.h
> +++ b/fs/internal.h
> @@ -325,6 +325,7 @@ struct stashed_operations {
>  int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
>  		      struct path *path);
>  void stashed_dentry_prune(struct dentry *dentry);
> +struct dentry *stashed_dentry_get(struct dentry **stashed);
>  /**
>   * path_mounted - check whether path is mounted
>   * @path: path to check
> diff --git a/fs/libfs.c b/fs/libfs.c
> index 8444f5cc4064..cf5a267aafe4 100644
> --- a/fs/libfs.c
> +++ b/fs/libfs.c
> @@ -2113,7 +2113,7 @@ struct timespec64 simple_inode_init_ts(struct inode *inode)
>  }
>  EXPORT_SYMBOL(simple_inode_init_ts);
>  
> -static inline struct dentry *get_stashed_dentry(struct dentry **stashed)
> +struct dentry *stashed_dentry_get(struct dentry **stashed)
>  {
>  	struct dentry *dentry;
>  
> @@ -2215,7 +2215,7 @@ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
>  	const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info;
>  
>  	/* See if dentry can be reused. */
> -	path->dentry = get_stashed_dentry(stashed);
> +	path->dentry = stashed_dentry_get(stashed);
>  	if (path->dentry) {
>  		sops->put_data(data);
>  		goto out_path;
> diff --git a/fs/pidfs.c b/fs/pidfs.c
> index eaecb0a947f0..258e1c13ee56 100644
> --- a/fs/pidfs.c
> +++ b/fs/pidfs.c
> @@ -32,7 +32,7 @@ static struct kmem_cache *pidfs_cachep __ro_after_init;
>   */
>  struct pidfs_exit_info {
>  	__u64 cgroupid;
> -	__u64 exit_code;
> +	__s32 exit_code;

^^^
The above delta should be folded into the previous patch.

>  };
>  
>  struct pidfs_inode {
> @@ -458,6 +458,45 @@ struct pid *pidfd_pid(const struct file *file)
>  	return file_inode(file)->i_private;
>  }
>  
> +/*
> + * We're called from release_task(). We know there's at least one
> + * reference to struct pid being held that won't be released until the
> + * task has been reaped which cannot happen until we're out of
> + * release_task().
> + *
> + * If this struct pid is refered to by a pidfd then stashed_dentry_get()
> + * will return the dentry and inode for that struct pid. Since we've
> + * taken a reference on it there's now an additional reference from the
> + * exit path on it. Which is fine. We're going to put it again in a
> + * second and we know that the pid is kept alive anyway.
> + *
> + * Worst case is that we've filled in the info and immediately free the
> + * dentry and inode afterwards since the pidfd has been closed. Since
> + * pidfs_exit() currently is placed after exit_task_work() we know that
> + * it cannot be us aka the exiting task holding a pidfd to ourselves.
> + */

That is a subtle interaction. Thanks for the comment!

> +void pidfs_exit(struct task_struct *tsk)
> +{
> +	struct dentry *dentry;
> +
> +	dentry = stashed_dentry_get(&task_pid(tsk)->stashed);
> +	if (dentry) {
> +		struct inode *inode = d_inode(dentry);
> +		struct pidfs_exit_info *exit_info = &pidfs_i(inode)->exit_info;
> +#ifdef CONFIG_CGROUPS
> +		struct cgroup *cgrp;
> +
> +		rcu_read_lock();
> +		cgrp = task_dfl_cgroup(tsk);
> +		exit_info->cgroupid = cgroup_id(cgrp);
> +		rcu_read_unlock();
> +#endif
> +		exit_info->exit_code = tsk->exit_code;
> +
> +		dput(dentry);
> +	}
> +}
> +
>  static struct vfsmount *pidfs_mnt __ro_after_init;
>  
>  /*
> diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
> index 7c830d0dec9a..05e6f8f4a026 100644
> --- a/include/linux/pidfs.h
> +++ b/include/linux/pidfs.h
> @@ -6,6 +6,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
>  void __init pidfs_init(void);
>  void pidfs_add_pid(struct pid *pid);
>  void pidfs_remove_pid(struct pid *pid);
> +void pidfs_exit(struct task_struct *tsk);
>  extern const struct dentry_operations pidfs_dentry_operations;
>  
>  #endif /* _LINUX_PID_FS_H */
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 3485e5fc499e..98d292120296 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -69,6 +69,7 @@
>  #include <linux/sysfs.h>
>  #include <linux/user_events.h>
>  #include <linux/uaccess.h>
> +#include <linux/pidfs.h>
>  
>  #include <uapi/linux/wait.h>
>  
> @@ -254,6 +255,7 @@ void release_task(struct task_struct *p)
>  	write_lock_irq(&tasklist_lock);
>  	ptrace_release_task(p);
>  	thread_pid = get_pid(p->thread_pid);
> +	pidfs_exit(p);
>  	__exit_signal(p);
>  
>  	/*
> 

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Oleg Nesterov March 4, 2025, 1:10 p.m. UTC | #2
I will read this series later, but I see nothing wrong after a quick glance.

Minor nit below...

On 03/04, Christian Brauner wrote:
>
> Record the exit code and cgroupid in do_exit()
                                    ^^^^^^^^^^^^
this is no longer true. In release_task().

> @@ -254,6 +255,7 @@ void release_task(struct task_struct *p)
>  	write_lock_irq(&tasklist_lock);
>  	ptrace_release_task(p);
>  	thread_pid = get_pid(p->thread_pid);
> +	pidfs_exit(p);
>  	__exit_signal(p);

And the next patch rightly moves pidfs_exit() up outside of tasklist.

Why not call it before write_lock_irq(&tasklist_lock) from the very
beginning?

Oleg.
Christian Brauner March 4, 2025, 7:10 p.m. UTC | #3
On Tue, Mar 04, 2025 at 02:10:18PM +0100, Oleg Nesterov wrote:
> I will read this series later, but I see nothing wrong after a quick glance.
> 
> Minor nit below...
> 
> On 03/04, Christian Brauner wrote:
> >
> > Record the exit code and cgroupid in do_exit()
>                                     ^^^^^^^^^^^^
> this is no longer true. In release_task().

Yes, thanks for catching this!

> 
> > @@ -254,6 +255,7 @@ void release_task(struct task_struct *p)
> >  	write_lock_irq(&tasklist_lock);
> >  	ptrace_release_task(p);
> >  	thread_pid = get_pid(p->thread_pid);
> > +	pidfs_exit(p);
> >  	__exit_signal(p);
> 
> And the next patch rightly moves pidfs_exit() up outside of tasklist.
> 
> Why not call it before write_lock_irq(&tasklist_lock) from the very
> beginning?

Yes, sorry that was my intention. pidfs_exit() can sleep due to dput().
Thanks!
diff mbox series

Patch

diff --git a/fs/internal.h b/fs/internal.h
index e7f02ae1e098..c1e6d8b294cb 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -325,6 +325,7 @@  struct stashed_operations {
 int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
 		      struct path *path);
 void stashed_dentry_prune(struct dentry *dentry);
+struct dentry *stashed_dentry_get(struct dentry **stashed);
 /**
  * path_mounted - check whether path is mounted
  * @path: path to check
diff --git a/fs/libfs.c b/fs/libfs.c
index 8444f5cc4064..cf5a267aafe4 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -2113,7 +2113,7 @@  struct timespec64 simple_inode_init_ts(struct inode *inode)
 }
 EXPORT_SYMBOL(simple_inode_init_ts);
 
-static inline struct dentry *get_stashed_dentry(struct dentry **stashed)
+struct dentry *stashed_dentry_get(struct dentry **stashed)
 {
 	struct dentry *dentry;
 
@@ -2215,7 +2215,7 @@  int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
 	const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info;
 
 	/* See if dentry can be reused. */
-	path->dentry = get_stashed_dentry(stashed);
+	path->dentry = stashed_dentry_get(stashed);
 	if (path->dentry) {
 		sops->put_data(data);
 		goto out_path;
diff --git a/fs/pidfs.c b/fs/pidfs.c
index eaecb0a947f0..258e1c13ee56 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -32,7 +32,7 @@  static struct kmem_cache *pidfs_cachep __ro_after_init;
  */
 struct pidfs_exit_info {
 	__u64 cgroupid;
-	__u64 exit_code;
+	__s32 exit_code;
 };
 
 struct pidfs_inode {
@@ -458,6 +458,45 @@  struct pid *pidfd_pid(const struct file *file)
 	return file_inode(file)->i_private;
 }
 
+/*
+ * We're called from release_task(). We know there's at least one
+ * reference to struct pid being held that won't be released until the
+ * task has been reaped which cannot happen until we're out of
+ * release_task().
+ *
+ * If this struct pid is refered to by a pidfd then stashed_dentry_get()
+ * will return the dentry and inode for that struct pid. Since we've
+ * taken a reference on it there's now an additional reference from the
+ * exit path on it. Which is fine. We're going to put it again in a
+ * second and we know that the pid is kept alive anyway.
+ *
+ * Worst case is that we've filled in the info and immediately free the
+ * dentry and inode afterwards since the pidfd has been closed. Since
+ * pidfs_exit() currently is placed after exit_task_work() we know that
+ * it cannot be us aka the exiting task holding a pidfd to ourselves.
+ */
+void pidfs_exit(struct task_struct *tsk)
+{
+	struct dentry *dentry;
+
+	dentry = stashed_dentry_get(&task_pid(tsk)->stashed);
+	if (dentry) {
+		struct inode *inode = d_inode(dentry);
+		struct pidfs_exit_info *exit_info = &pidfs_i(inode)->exit_info;
+#ifdef CONFIG_CGROUPS
+		struct cgroup *cgrp;
+
+		rcu_read_lock();
+		cgrp = task_dfl_cgroup(tsk);
+		exit_info->cgroupid = cgroup_id(cgrp);
+		rcu_read_unlock();
+#endif
+		exit_info->exit_code = tsk->exit_code;
+
+		dput(dentry);
+	}
+}
+
 static struct vfsmount *pidfs_mnt __ro_after_init;
 
 /*
diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
index 7c830d0dec9a..05e6f8f4a026 100644
--- a/include/linux/pidfs.h
+++ b/include/linux/pidfs.h
@@ -6,6 +6,7 @@  struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
 void __init pidfs_init(void);
 void pidfs_add_pid(struct pid *pid);
 void pidfs_remove_pid(struct pid *pid);
+void pidfs_exit(struct task_struct *tsk);
 extern const struct dentry_operations pidfs_dentry_operations;
 
 #endif /* _LINUX_PID_FS_H */
diff --git a/kernel/exit.c b/kernel/exit.c
index 3485e5fc499e..98d292120296 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -69,6 +69,7 @@ 
 #include <linux/sysfs.h>
 #include <linux/user_events.h>
 #include <linux/uaccess.h>
+#include <linux/pidfs.h>
 
 #include <uapi/linux/wait.h>
 
@@ -254,6 +255,7 @@  void release_task(struct task_struct *p)
 	write_lock_irq(&tasklist_lock);
 	ptrace_release_task(p);
 	thread_pid = get_pid(p->thread_pid);
+	pidfs_exit(p);
 	__exit_signal(p);
 
 	/*