diff mbox series

[v2,06/15] pidfs: allow to retrieve exit information

Message ID 20250304-work-pidfs-kill_on_last_close-v2-6-44fdacfaa7b7@kernel.org (mailing list archive)
State New
Headers show
Series pidfs: provide information after task has been reaped | expand

Commit Message

Christian Brauner March 4, 2025, 9:41 a.m. UTC
Some tools like systemd's jounral need to retrieve the exit and cgroup
information after a process has already been reaped. This can e.g.,
happen when retrieving a pidfd via SCM_PIDFD or SCM_PEERPIDFD.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/pidfs.c                 | 88 ++++++++++++++++++++++++++++++++++++----------
 include/uapi/linux/pidfd.h |  3 +-
 kernel/exit.c              |  2 +-
 3 files changed, 73 insertions(+), 20 deletions(-)

Comments

Jeff Layton March 4, 2025, 1:27 p.m. UTC | #1
On Tue, 2025-03-04 at 10:41 +0100, Christian Brauner wrote:
> Some tools like systemd's jounral need to retrieve the exit and cgroup
> information after a process has already been reaped. This can e.g.,
> happen when retrieving a pidfd via SCM_PIDFD or SCM_PEERPIDFD.
> 
> Signed-off-by: Christian Brauner <brauner@kernel.org>
> ---
>  fs/pidfs.c                 | 88 ++++++++++++++++++++++++++++++++++++----------
>  include/uapi/linux/pidfd.h |  3 +-
>  kernel/exit.c              |  2 +-
>  3 files changed, 73 insertions(+), 20 deletions(-)
> 
> diff --git a/fs/pidfs.c b/fs/pidfs.c
> index 258e1c13ee56..11744d7fe177 100644
> --- a/fs/pidfs.c
> +++ b/fs/pidfs.c
> @@ -36,7 +36,8 @@ struct pidfs_exit_info {
>  };
>  
>  struct pidfs_inode {
> -	struct pidfs_exit_info exit_info;
> +	struct pidfs_exit_info __pei;
> +	struct pidfs_exit_info *exit_info;
>  	struct inode vfs_inode;
>  };
>  
> @@ -228,17 +229,28 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
>  	return poll_flags;
>  }
>  
> -static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg)
> +static inline bool current_in_pidns(struct pid *pid)
> +{
> +	const struct pid_namespace *ns = task_active_pid_ns(current);
> +
> +	if (ns->level <= pid->level)
> +		return pid->numbers[ns->level].ns == ns;
> +
> +	return false;
> +}
> +
> +static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
>  {
>  	struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
> +	struct pid *pid = pidfd_pid(file);
>  	size_t usize = _IOC_SIZE(cmd);
>  	struct pidfd_info kinfo = {};
> +	struct pidfs_exit_info *exit_info;
> +	struct inode *inode = file_inode(file);
>  	struct user_namespace *user_ns;
> +	struct task_struct *task;
>  	const struct cred *c;
>  	__u64 mask;
> -#ifdef CONFIG_CGROUPS
> -	struct cgroup *cgrp;
> -#endif
>  
>  	if (!uinfo)
>  		return -EINVAL;
> @@ -248,6 +260,37 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
>  	if (copy_from_user(&mask, &uinfo->mask, sizeof(mask)))
>  		return -EFAULT;
>  
> +	task = get_pid_task(pid, PIDTYPE_PID);
> +	if (!task) {
> +		if (!(mask & PIDFD_INFO_EXIT))
> +			return -ESRCH;
> +
> +		if (!current_in_pidns(pid))
> +			return -ESRCH;
> +	}
> +
> +	if (mask & PIDFD_INFO_EXIT) {
> +		exit_info = READ_ONCE(pidfs_i(inode)->exit_info);
> +		if (exit_info) {
> +#ifdef CONFIG_CGROUPS
> +			kinfo.cgroupid = exit_info->cgroupid;
> +			kinfo.mask |= PIDFD_INFO_EXIT | PIDFD_INFO_CGROUPID;
> +#endif
> +			kinfo.exit_code = exit_info->exit_code;
> +		}
> +	}
> +
> +	/*
> +	 * If the task has already been reaped only exit information
> +	 * can be provided. It's entirely possible that the task has
> +	 * already been reaped but we managed to grab a reference to it
> +	 * before that. So a full set of information about @task doesn't
> +	 * mean it hasn't been waited upon. Similarly, a full set of
> +	 * information doesn't mean that the task hasn't already exited.
> +	 */
> +	if (!task)
> +		goto copy_out;
> +
>  	c = get_task_cred(task);
>  	if (!c)
>  		return -ESRCH;
> @@ -267,11 +310,15 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
>  	put_cred(c);
>  
>  #ifdef CONFIG_CGROUPS
> -	rcu_read_lock();
> -	cgrp = task_dfl_cgroup(task);
> -	kinfo.cgroupid = cgroup_id(cgrp);
> -	kinfo.mask |= PIDFD_INFO_CGROUPID;
> -	rcu_read_unlock();
> +	if (!kinfo.cgroupid) {
> +		struct cgroup *cgrp;
> +
> +		rcu_read_lock();
> +		cgrp = task_dfl_cgroup(task);
> +		kinfo.cgroupid = cgroup_id(cgrp);
> +		kinfo.mask |= PIDFD_INFO_CGROUPID;
> +		rcu_read_unlock();
> +	}
>  #endif
>  
>  	/*
> @@ -291,6 +338,7 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
>  	if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1))
>  		return -ESRCH;
>  
> +copy_out:
>  	/*
>  	 * If userspace and the kernel have the same struct size it can just
>  	 * be copied. If userspace provides an older struct, only the bits that
> @@ -325,7 +373,6 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
>  {
>  	struct task_struct *task __free(put_task) = NULL;
>  	struct nsproxy *nsp __free(put_nsproxy) = NULL;
> -	struct pid *pid = pidfd_pid(file);
>  	struct ns_common *ns_common = NULL;
>  	struct pid_namespace *pid_ns;
>  
> @@ -340,13 +387,13 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
>  		return put_user(file_inode(file)->i_generation, argp);
>  	}
>  
> -	task = get_pid_task(pid, PIDTYPE_PID);
> -	if (!task)
> -		return -ESRCH;
> -
>  	/* Extensible IOCTL that does not open namespace FDs, take a shortcut */
>  	if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO))
> -		return pidfd_info(task, cmd, arg);
> +		return pidfd_info(file, cmd, arg);
> +
> +	task = get_pid_task(pidfd_pid(file), PIDTYPE_PID);
> +	if (!task)
> +		return -ESRCH;
>  
>  	if (arg)
>  		return -EINVAL;
> @@ -479,10 +526,12 @@ void pidfs_exit(struct task_struct *tsk)
>  {
>  	struct dentry *dentry;
>  
> +	might_sleep();
> +
>  	dentry = stashed_dentry_get(&task_pid(tsk)->stashed);
>  	if (dentry) {
>  		struct inode *inode = d_inode(dentry);
> -		struct pidfs_exit_info *exit_info = &pidfs_i(inode)->exit_info;
> +		struct pidfs_exit_info *exit_info = &pidfs_i(inode)->__pei;
>  #ifdef CONFIG_CGROUPS
>  		struct cgroup *cgrp;
>  
> @@ -493,6 +542,8 @@ void pidfs_exit(struct task_struct *tsk)
>  #endif
>  		exit_info->exit_code = tsk->exit_code;
>  
> +		/* Ensure that PIDFD_GET_INFO sees either all or nothing. */
> +		smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei);
>  		dput(dentry);
>  	}
>  }
> @@ -560,7 +611,8 @@ static struct inode *pidfs_alloc_inode(struct super_block *sb)
>  	if (!pi)
>  		return NULL;
>  
> -	memset(&pi->exit_info, 0, sizeof(pi->exit_info));
> +	memset(&pi->__pei, 0, sizeof(pi->__pei));
> +	pi->exit_info = NULL;
>  
>  	return &pi->vfs_inode;
>  }
> diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
> index e0abd0b18841..e5966f1a7743 100644
> --- a/include/uapi/linux/pidfd.h
> +++ b/include/uapi/linux/pidfd.h
> @@ -20,6 +20,7 @@
>  #define PIDFD_INFO_PID			(1UL << 0) /* Always returned, even if not requested */
>  #define PIDFD_INFO_CREDS		(1UL << 1) /* Always returned, even if not requested */
>  #define PIDFD_INFO_CGROUPID		(1UL << 2) /* Always returned if available, even if not requested */
> +#define PIDFD_INFO_EXIT			(1UL << 3) /* Always returned if available, even if not requested */
>  
>  #define PIDFD_INFO_SIZE_VER0		64 /* sizeof first published struct */
>  
> @@ -86,7 +87,7 @@ struct pidfd_info {
>  	__u32 sgid;
>  	__u32 fsuid;
>  	__u32 fsgid;
> -	__u32 spare0[1];
> +	__s32 exit_code;
>  };
>  
>  #define PIDFS_IOCTL_MAGIC 0xFF
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 98d292120296..9916305e34d3 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -250,12 +250,12 @@ void release_task(struct task_struct *p)
>  	dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
>  	rcu_read_unlock();
>  
> +	pidfs_exit(p);
>  	cgroup_release(p);
>  
>  	write_lock_irq(&tasklist_lock);
>  	ptrace_release_task(p);
>  	thread_pid = get_pid(p->thread_pid);
> -	pidfs_exit(p);

Why move this after you just added it?

>  	__exit_signal(p);
>  
>  	/*
> 

Everything else looks fine though. Assuming you fix the nit above, you
can add:

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Oleg Nesterov March 4, 2025, 5:22 p.m. UTC | #2
On 03/04, Christian Brauner wrote:
>
> @@ -248,6 +260,37 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
>  	if (copy_from_user(&mask, &uinfo->mask, sizeof(mask)))
>  		return -EFAULT;
>
> +	task = get_pid_task(pid, PIDTYPE_PID);
> +	if (!task) {
> +		if (!(mask & PIDFD_INFO_EXIT))
> +			return -ESRCH;
> +
> +		if (!current_in_pidns(pid))
> +			return -ESRCH;
> +	}
> +
> +	if (mask & PIDFD_INFO_EXIT) {
> +		exit_info = READ_ONCE(pidfs_i(inode)->exit_info);
> +		if (exit_info) {
> +#ifdef CONFIG_CGROUPS
> +			kinfo.cgroupid = exit_info->cgroupid;
> +			kinfo.mask |= PIDFD_INFO_EXIT | PIDFD_INFO_CGROUPID;
> +#endif
> +			kinfo.exit_code = exit_info->exit_code;
> +		}

Confused... so, without CONFIG_CGROUPS pidfd_info() will never report
PIDFD_INFO_EXIT in kinfo.mask ?

> --- a/include/uapi/linux/pidfd.h
> +++ b/include/uapi/linux/pidfd.h
> @@ -20,6 +20,7 @@
>  #define PIDFD_INFO_PID			(1UL << 0) /* Always returned, even if not requested */
>  #define PIDFD_INFO_CREDS		(1UL << 1) /* Always returned, even if not requested */
>  #define PIDFD_INFO_CGROUPID		(1UL << 2) /* Always returned if available, even if not requested */
> +#define PIDFD_INFO_EXIT			(1UL << 3) /* Always returned if available, even if not requested */
                                                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The comment doesn't match the "if (mask & PIDFD_INFO_EXIT)" check above...

Oleg.
Christian Brauner March 4, 2025, 5:23 p.m. UTC | #3
On Tue, Mar 04, 2025 at 08:27:29AM -0500, Jeff Layton wrote:
> On Tue, 2025-03-04 at 10:41 +0100, Christian Brauner wrote:
> > Some tools like systemd's jounral need to retrieve the exit and cgroup
> > information after a process has already been reaped. This can e.g.,
> > happen when retrieving a pidfd via SCM_PIDFD or SCM_PEERPIDFD.
> > 
> > Signed-off-by: Christian Brauner <brauner@kernel.org>
> > ---
> >  fs/pidfs.c                 | 88 ++++++++++++++++++++++++++++++++++++----------
> >  include/uapi/linux/pidfd.h |  3 +-
> >  kernel/exit.c              |  2 +-
> >  3 files changed, 73 insertions(+), 20 deletions(-)
> > 
> > diff --git a/fs/pidfs.c b/fs/pidfs.c
> > index 258e1c13ee56..11744d7fe177 100644
> > --- a/fs/pidfs.c
> > +++ b/fs/pidfs.c
> > @@ -36,7 +36,8 @@ struct pidfs_exit_info {
> >  };
> >  
> >  struct pidfs_inode {
> > -	struct pidfs_exit_info exit_info;
> > +	struct pidfs_exit_info __pei;
> > +	struct pidfs_exit_info *exit_info;
> >  	struct inode vfs_inode;
> >  };
> >  
> > @@ -228,17 +229,28 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
> >  	return poll_flags;
> >  }
> >  
> > -static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg)
> > +static inline bool current_in_pidns(struct pid *pid)
> > +{
> > +	const struct pid_namespace *ns = task_active_pid_ns(current);
> > +
> > +	if (ns->level <= pid->level)
> > +		return pid->numbers[ns->level].ns == ns;
> > +
> > +	return false;
> > +}
> > +
> > +static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
> >  {
> >  	struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
> > +	struct pid *pid = pidfd_pid(file);
> >  	size_t usize = _IOC_SIZE(cmd);
> >  	struct pidfd_info kinfo = {};
> > +	struct pidfs_exit_info *exit_info;
> > +	struct inode *inode = file_inode(file);
> >  	struct user_namespace *user_ns;
> > +	struct task_struct *task;
> >  	const struct cred *c;
> >  	__u64 mask;
> > -#ifdef CONFIG_CGROUPS
> > -	struct cgroup *cgrp;
> > -#endif
> >  
> >  	if (!uinfo)
> >  		return -EINVAL;
> > @@ -248,6 +260,37 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
> >  	if (copy_from_user(&mask, &uinfo->mask, sizeof(mask)))
> >  		return -EFAULT;
> >  
> > +	task = get_pid_task(pid, PIDTYPE_PID);
> > +	if (!task) {
> > +		if (!(mask & PIDFD_INFO_EXIT))
> > +			return -ESRCH;
> > +
> > +		if (!current_in_pidns(pid))
> > +			return -ESRCH;
> > +	}
> > +
> > +	if (mask & PIDFD_INFO_EXIT) {
> > +		exit_info = READ_ONCE(pidfs_i(inode)->exit_info);
> > +		if (exit_info) {
> > +#ifdef CONFIG_CGROUPS
> > +			kinfo.cgroupid = exit_info->cgroupid;
> > +			kinfo.mask |= PIDFD_INFO_EXIT | PIDFD_INFO_CGROUPID;
> > +#endif
> > +			kinfo.exit_code = exit_info->exit_code;
> > +		}
> > +	}
> > +
> > +	/*
> > +	 * If the task has already been reaped only exit information
> > +	 * can be provided. It's entirely possible that the task has
> > +	 * already been reaped but we managed to grab a reference to it
> > +	 * before that. So a full set of information about @task doesn't
> > +	 * mean it hasn't been waited upon. Similarly, a full set of
> > +	 * information doesn't mean that the task hasn't already exited.
> > +	 */
> > +	if (!task)
> > +		goto copy_out;
> > +
> >  	c = get_task_cred(task);
> >  	if (!c)
> >  		return -ESRCH;
> > @@ -267,11 +310,15 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
> >  	put_cred(c);
> >  
> >  #ifdef CONFIG_CGROUPS
> > -	rcu_read_lock();
> > -	cgrp = task_dfl_cgroup(task);
> > -	kinfo.cgroupid = cgroup_id(cgrp);
> > -	kinfo.mask |= PIDFD_INFO_CGROUPID;
> > -	rcu_read_unlock();
> > +	if (!kinfo.cgroupid) {
> > +		struct cgroup *cgrp;
> > +
> > +		rcu_read_lock();
> > +		cgrp = task_dfl_cgroup(task);
> > +		kinfo.cgroupid = cgroup_id(cgrp);
> > +		kinfo.mask |= PIDFD_INFO_CGROUPID;
> > +		rcu_read_unlock();
> > +	}
> >  #endif
> >  
> >  	/*
> > @@ -291,6 +338,7 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
> >  	if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1))
> >  		return -ESRCH;
> >  
> > +copy_out:
> >  	/*
> >  	 * If userspace and the kernel have the same struct size it can just
> >  	 * be copied. If userspace provides an older struct, only the bits that
> > @@ -325,7 +373,6 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> >  {
> >  	struct task_struct *task __free(put_task) = NULL;
> >  	struct nsproxy *nsp __free(put_nsproxy) = NULL;
> > -	struct pid *pid = pidfd_pid(file);
> >  	struct ns_common *ns_common = NULL;
> >  	struct pid_namespace *pid_ns;
> >  
> > @@ -340,13 +387,13 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> >  		return put_user(file_inode(file)->i_generation, argp);
> >  	}
> >  
> > -	task = get_pid_task(pid, PIDTYPE_PID);
> > -	if (!task)
> > -		return -ESRCH;
> > -
> >  	/* Extensible IOCTL that does not open namespace FDs, take a shortcut */
> >  	if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO))
> > -		return pidfd_info(task, cmd, arg);
> > +		return pidfd_info(file, cmd, arg);
> > +
> > +	task = get_pid_task(pidfd_pid(file), PIDTYPE_PID);
> > +	if (!task)
> > +		return -ESRCH;
> >  
> >  	if (arg)
> >  		return -EINVAL;
> > @@ -479,10 +526,12 @@ void pidfs_exit(struct task_struct *tsk)
> >  {
> >  	struct dentry *dentry;
> >  
> > +	might_sleep();
> > +
> >  	dentry = stashed_dentry_get(&task_pid(tsk)->stashed);
> >  	if (dentry) {
> >  		struct inode *inode = d_inode(dentry);
> > -		struct pidfs_exit_info *exit_info = &pidfs_i(inode)->exit_info;
> > +		struct pidfs_exit_info *exit_info = &pidfs_i(inode)->__pei;
> >  #ifdef CONFIG_CGROUPS
> >  		struct cgroup *cgrp;
> >  
> > @@ -493,6 +542,8 @@ void pidfs_exit(struct task_struct *tsk)
> >  #endif
> >  		exit_info->exit_code = tsk->exit_code;
> >  
> > +		/* Ensure that PIDFD_GET_INFO sees either all or nothing. */
> > +		smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei);
> >  		dput(dentry);
> >  	}
> >  }
> > @@ -560,7 +611,8 @@ static struct inode *pidfs_alloc_inode(struct super_block *sb)
> >  	if (!pi)
> >  		return NULL;
> >  
> > -	memset(&pi->exit_info, 0, sizeof(pi->exit_info));
> > +	memset(&pi->__pei, 0, sizeof(pi->__pei));
> > +	pi->exit_info = NULL;
> >  
> >  	return &pi->vfs_inode;
> >  }
> > diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
> > index e0abd0b18841..e5966f1a7743 100644
> > --- a/include/uapi/linux/pidfd.h
> > +++ b/include/uapi/linux/pidfd.h
> > @@ -20,6 +20,7 @@
> >  #define PIDFD_INFO_PID			(1UL << 0) /* Always returned, even if not requested */
> >  #define PIDFD_INFO_CREDS		(1UL << 1) /* Always returned, even if not requested */
> >  #define PIDFD_INFO_CGROUPID		(1UL << 2) /* Always returned if available, even if not requested */
> > +#define PIDFD_INFO_EXIT			(1UL << 3) /* Always returned if available, even if not requested */
> >  
> >  #define PIDFD_INFO_SIZE_VER0		64 /* sizeof first published struct */
> >  
> > @@ -86,7 +87,7 @@ struct pidfd_info {
> >  	__u32 sgid;
> >  	__u32 fsuid;
> >  	__u32 fsgid;
> > -	__u32 spare0[1];
> > +	__s32 exit_code;
> >  };
> >  
> >  #define PIDFS_IOCTL_MAGIC 0xFF
> > diff --git a/kernel/exit.c b/kernel/exit.c
> > index 98d292120296..9916305e34d3 100644
> > --- a/kernel/exit.c
> > +++ b/kernel/exit.c
> > @@ -250,12 +250,12 @@ void release_task(struct task_struct *p)
> >  	dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
> >  	rcu_read_unlock();
> >  
> > +	pidfs_exit(p);
> >  	cgroup_release(p);
> >  
> >  	write_lock_irq(&tasklist_lock);
> >  	ptrace_release_task(p);
> >  	thread_pid = get_pid(p->thread_pid);
> > -	pidfs_exit(p);
> 
> Why move this after you just added it?

Ah sorry that should've been done in the previous patch. pidfs_exit()
needs to be called outside of tasklist_lock as it can sleep due to dput().

> 
> >  	__exit_signal(p);
> >  
> >  	/*
> > 
> 
> Everything else looks fine though. Assuming you fix the nit above, you
> can add:
> 
> Reviewed-by: Jeff Layton <jlayton@kernel.org>
Oleg Nesterov March 4, 2025, 5:34 p.m. UTC | #4
On 03/04, Christian Brauner wrote:
>
> +	task = get_pid_task(pid, PIDTYPE_PID);
> +	if (!task) {
> +		if (!(mask & PIDFD_INFO_EXIT))
> +			return -ESRCH;
> +
> +		if (!current_in_pidns(pid))
> +			return -ESRCH;

Damn ;) could you explain the current_in_pidns() check to me ?
I am puzzled...

Oleg.
Christian Brauner March 4, 2025, 8:09 p.m. UTC | #5
On Tue, Mar 04, 2025 at 06:34:56PM +0100, Oleg Nesterov wrote:
> On 03/04, Christian Brauner wrote:
> >
> > +	task = get_pid_task(pid, PIDTYPE_PID);
> > +	if (!task) {
> > +		if (!(mask & PIDFD_INFO_EXIT))
> > +			return -ESRCH;
> > +
> > +		if (!current_in_pidns(pid))
> > +			return -ESRCH;
> 
> Damn ;) could you explain the current_in_pidns() check to me ?
> I am puzzled...

So we currently restrict interactions with pidfd by pid namespace
hierarchy. Meaning that we ensure that the pidfd is part of the caller's
pid namespace hierarchy. So this check is similar to:

pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
{
        struct upid *upid;
        pid_t nr = 0;

        if (pid && ns->level <= pid->level) {
                upid = &pid->numbers[ns->level];
                if (upid->ns == ns)
                        nr = upid->nr;
        }
        return nr;
}

IOW, we verify that the caller's pid namespace can be found within the
pid namespace hierarchy that the struct pid had a pid numbers allocated
in.

Only that by the time we perform this check the pid numbers have already
been freed so we can't use that function directly. But the pid namespace
hierarchy is still alive as that won't be released until the pidfd has
put the reference on struct @pid. So we can use current_in_pidns().

Is that understandable?
Christian Brauner March 4, 2025, 8:16 p.m. UTC | #6
On Tue, Mar 04, 2025 at 06:22:55PM +0100, Oleg Nesterov wrote:
> On 03/04, Christian Brauner wrote:
> >
> > @@ -248,6 +260,37 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
> >  	if (copy_from_user(&mask, &uinfo->mask, sizeof(mask)))
> >  		return -EFAULT;
> >
> > +	task = get_pid_task(pid, PIDTYPE_PID);
> > +	if (!task) {
> > +		if (!(mask & PIDFD_INFO_EXIT))
> > +			return -ESRCH;
> > +
> > +		if (!current_in_pidns(pid))
> > +			return -ESRCH;
> > +	}
> > +
> > +	if (mask & PIDFD_INFO_EXIT) {
> > +		exit_info = READ_ONCE(pidfs_i(inode)->exit_info);
> > +		if (exit_info) {
> > +#ifdef CONFIG_CGROUPS
> > +			kinfo.cgroupid = exit_info->cgroupid;
> > +			kinfo.mask |= PIDFD_INFO_EXIT | PIDFD_INFO_CGROUPID;
> > +#endif
> > +			kinfo.exit_code = exit_info->exit_code;
> > +		}
> 
> Confused... so, without CONFIG_CGROUPS pidfd_info() will never report
> PIDFD_INFO_EXIT in kinfo.mask ?

Fixed.

> 
> > --- a/include/uapi/linux/pidfd.h
> > +++ b/include/uapi/linux/pidfd.h
> > @@ -20,6 +20,7 @@
> >  #define PIDFD_INFO_PID			(1UL << 0) /* Always returned, even if not requested */
> >  #define PIDFD_INFO_CREDS		(1UL << 1) /* Always returned, even if not requested */
> >  #define PIDFD_INFO_CGROUPID		(1UL << 2) /* Always returned if available, even if not requested */
> > +#define PIDFD_INFO_EXIT			(1UL << 3) /* Always returned if available, even if not requested */
>                                                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> 
> The comment doesn't match the "if (mask & PIDFD_INFO_EXIT)" check above...

Also fixed.
Oleg Nesterov March 4, 2025, 9:47 p.m. UTC | #7
On 03/04, Christian Brauner wrote:
>
> On Tue, Mar 04, 2025 at 06:34:56PM +0100, Oleg Nesterov wrote:
> > On 03/04, Christian Brauner wrote:
> > >
> > > +	task = get_pid_task(pid, PIDTYPE_PID);
> > > +	if (!task) {
> > > +		if (!(mask & PIDFD_INFO_EXIT))
> > > +			return -ESRCH;
> > > +
> > > +		if (!current_in_pidns(pid))
> > > +			return -ESRCH;
> >
> > Damn ;) could you explain the current_in_pidns() check to me ?
> > I am puzzled...
>
> So we currently restrict interactions with pidfd by pid namespace
> hierarchy. Meaning that we ensure that the pidfd is part of the caller's
> pid namespace hierarchy.

Well this is clear... but sorry I still can't understand.

Why do we check current_in_pidns() only if get_pid_task(PIDTYPE_PID)
returns NULL?

And, unless (quite possibly) I am totally confused, if task != NULL
but current_in_pidns() would return false, then

	kinfo.pid = task_pid_vnr(task);

below will set kinfo.pid = 0, and pidfd_info() will return -ESRCH anyway?

> So this check is similar to:
>
> pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
> {
>         struct upid *upid;
>         pid_t nr = 0;
>
>         if (pid && ns->level <= pid->level) {
>                 upid = &pid->numbers[ns->level];
>                 if (upid->ns == ns)
>                         nr = upid->nr;
>         }
>         return nr;
> }
>
> Only that by the time we perform this check the pid numbers have already
> been freed so we can't use that function directly.

Confused again... Yes, the [u]pid numbers can be already "freed" in that
upid->nr can be already idr_remove()'ed, but

> But the pid namespace
> hierarchy is still alive as that won't be released until the pidfd has
> put the reference on struct @pid.

Yes, so I still don't undestand, sorry :/

IOW. Why not check current_in_pidns() at the start? and do
task = get_pid_task() later, right before

	if (!task)
		goto copy_out;

?

Oleg.
Christian Brauner March 5, 2025, 8:54 a.m. UTC | #8
On Tue, Mar 04, 2025 at 10:47:11PM +0100, Oleg Nesterov wrote:
> On 03/04, Christian Brauner wrote:
> >
> > On Tue, Mar 04, 2025 at 06:34:56PM +0100, Oleg Nesterov wrote:
> > > On 03/04, Christian Brauner wrote:
> > > >
> > > > +	task = get_pid_task(pid, PIDTYPE_PID);
> > > > +	if (!task) {
> > > > +		if (!(mask & PIDFD_INFO_EXIT))
> > > > +			return -ESRCH;
> > > > +
> > > > +		if (!current_in_pidns(pid))
> > > > +			return -ESRCH;
> > >
> > > Damn ;) could you explain the current_in_pidns() check to me ?
> > > I am puzzled...
> >
> > So we currently restrict interactions with pidfd by pid namespace
> > hierarchy. Meaning that we ensure that the pidfd is part of the caller's
> > pid namespace hierarchy.
> 
> Well this is clear... but sorry I still can't understand.

Ok, it sounded like you wanted me to explain what current_in_pidns()
does not why it's placed where it is placed. :)

> 
> Why do we check current_in_pidns() only if get_pid_task(PIDTYPE_PID)
> returns NULL?

Because if task != NULL we already catch it kinfo.pid and since we can't
skip the call to task_pid_vnr() it seemed redundant to do that check
twice. But if we do it before get_task_pid() it makes more sense ofc.

> 
> And, unless (quite possibly) I am totally confused, if task != NULL
> but current_in_pidns() would return false, then
> 
> 	kinfo.pid = task_pid_vnr(task);
> 
> below will set kinfo.pid = 0, and pidfd_info() will return -ESRCH anyway?

Yes.

> 
> > So this check is similar to:
> >
> > pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
> > {
> >         struct upid *upid;
> >         pid_t nr = 0;
> >
> >         if (pid && ns->level <= pid->level) {
> >                 upid = &pid->numbers[ns->level];
> >                 if (upid->ns == ns)
> >                         nr = upid->nr;
> >         }
> >         return nr;
> > }
> >
> > Only that by the time we perform this check the pid numbers have already
> > been freed so we can't use that function directly.
> 
> Confused again... Yes, the [u]pid numbers can be already "freed" in that
> upid->nr can be already idr_remove()'ed, but
> 
> > But the pid namespace
> > hierarchy is still alive as that won't be released until the pidfd has
> > put the reference on struct @pid.
> 
> Yes, so I still don't undestand, sorry :/
> 
> IOW. Why not check current_in_pidns() at the start? and do
> task = get_pid_task() later, right before

Sure, that's a good suggestion.

> 
> 	if (!task)
> 		goto copy_out;
> 
> ?
> 
> Oleg.
>
diff mbox series

Patch

diff --git a/fs/pidfs.c b/fs/pidfs.c
index 258e1c13ee56..11744d7fe177 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -36,7 +36,8 @@  struct pidfs_exit_info {
 };
 
 struct pidfs_inode {
-	struct pidfs_exit_info exit_info;
+	struct pidfs_exit_info __pei;
+	struct pidfs_exit_info *exit_info;
 	struct inode vfs_inode;
 };
 
@@ -228,17 +229,28 @@  static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
 	return poll_flags;
 }
 
-static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg)
+static inline bool current_in_pidns(struct pid *pid)
+{
+	const struct pid_namespace *ns = task_active_pid_ns(current);
+
+	if (ns->level <= pid->level)
+		return pid->numbers[ns->level].ns == ns;
+
+	return false;
+}
+
+static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
+	struct pid *pid = pidfd_pid(file);
 	size_t usize = _IOC_SIZE(cmd);
 	struct pidfd_info kinfo = {};
+	struct pidfs_exit_info *exit_info;
+	struct inode *inode = file_inode(file);
 	struct user_namespace *user_ns;
+	struct task_struct *task;
 	const struct cred *c;
 	__u64 mask;
-#ifdef CONFIG_CGROUPS
-	struct cgroup *cgrp;
-#endif
 
 	if (!uinfo)
 		return -EINVAL;
@@ -248,6 +260,37 @@  static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
 	if (copy_from_user(&mask, &uinfo->mask, sizeof(mask)))
 		return -EFAULT;
 
+	task = get_pid_task(pid, PIDTYPE_PID);
+	if (!task) {
+		if (!(mask & PIDFD_INFO_EXIT))
+			return -ESRCH;
+
+		if (!current_in_pidns(pid))
+			return -ESRCH;
+	}
+
+	if (mask & PIDFD_INFO_EXIT) {
+		exit_info = READ_ONCE(pidfs_i(inode)->exit_info);
+		if (exit_info) {
+#ifdef CONFIG_CGROUPS
+			kinfo.cgroupid = exit_info->cgroupid;
+			kinfo.mask |= PIDFD_INFO_EXIT | PIDFD_INFO_CGROUPID;
+#endif
+			kinfo.exit_code = exit_info->exit_code;
+		}
+	}
+
+	/*
+	 * If the task has already been reaped only exit information
+	 * can be provided. It's entirely possible that the task has
+	 * already been reaped but we managed to grab a reference to it
+	 * before that. So a full set of information about @task doesn't
+	 * mean it hasn't been waited upon. Similarly, a full set of
+	 * information doesn't mean that the task hasn't already exited.
+	 */
+	if (!task)
+		goto copy_out;
+
 	c = get_task_cred(task);
 	if (!c)
 		return -ESRCH;
@@ -267,11 +310,15 @@  static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
 	put_cred(c);
 
 #ifdef CONFIG_CGROUPS
-	rcu_read_lock();
-	cgrp = task_dfl_cgroup(task);
-	kinfo.cgroupid = cgroup_id(cgrp);
-	kinfo.mask |= PIDFD_INFO_CGROUPID;
-	rcu_read_unlock();
+	if (!kinfo.cgroupid) {
+		struct cgroup *cgrp;
+
+		rcu_read_lock();
+		cgrp = task_dfl_cgroup(task);
+		kinfo.cgroupid = cgroup_id(cgrp);
+		kinfo.mask |= PIDFD_INFO_CGROUPID;
+		rcu_read_unlock();
+	}
 #endif
 
 	/*
@@ -291,6 +338,7 @@  static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
 	if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1))
 		return -ESRCH;
 
+copy_out:
 	/*
 	 * If userspace and the kernel have the same struct size it can just
 	 * be copied. If userspace provides an older struct, only the bits that
@@ -325,7 +373,6 @@  static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct task_struct *task __free(put_task) = NULL;
 	struct nsproxy *nsp __free(put_nsproxy) = NULL;
-	struct pid *pid = pidfd_pid(file);
 	struct ns_common *ns_common = NULL;
 	struct pid_namespace *pid_ns;
 
@@ -340,13 +387,13 @@  static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		return put_user(file_inode(file)->i_generation, argp);
 	}
 
-	task = get_pid_task(pid, PIDTYPE_PID);
-	if (!task)
-		return -ESRCH;
-
 	/* Extensible IOCTL that does not open namespace FDs, take a shortcut */
 	if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO))
-		return pidfd_info(task, cmd, arg);
+		return pidfd_info(file, cmd, arg);
+
+	task = get_pid_task(pidfd_pid(file), PIDTYPE_PID);
+	if (!task)
+		return -ESRCH;
 
 	if (arg)
 		return -EINVAL;
@@ -479,10 +526,12 @@  void pidfs_exit(struct task_struct *tsk)
 {
 	struct dentry *dentry;
 
+	might_sleep();
+
 	dentry = stashed_dentry_get(&task_pid(tsk)->stashed);
 	if (dentry) {
 		struct inode *inode = d_inode(dentry);
-		struct pidfs_exit_info *exit_info = &pidfs_i(inode)->exit_info;
+		struct pidfs_exit_info *exit_info = &pidfs_i(inode)->__pei;
 #ifdef CONFIG_CGROUPS
 		struct cgroup *cgrp;
 
@@ -493,6 +542,8 @@  void pidfs_exit(struct task_struct *tsk)
 #endif
 		exit_info->exit_code = tsk->exit_code;
 
+		/* Ensure that PIDFD_GET_INFO sees either all or nothing. */
+		smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei);
 		dput(dentry);
 	}
 }
@@ -560,7 +611,8 @@  static struct inode *pidfs_alloc_inode(struct super_block *sb)
 	if (!pi)
 		return NULL;
 
-	memset(&pi->exit_info, 0, sizeof(pi->exit_info));
+	memset(&pi->__pei, 0, sizeof(pi->__pei));
+	pi->exit_info = NULL;
 
 	return &pi->vfs_inode;
 }
diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
index e0abd0b18841..e5966f1a7743 100644
--- a/include/uapi/linux/pidfd.h
+++ b/include/uapi/linux/pidfd.h
@@ -20,6 +20,7 @@ 
 #define PIDFD_INFO_PID			(1UL << 0) /* Always returned, even if not requested */
 #define PIDFD_INFO_CREDS		(1UL << 1) /* Always returned, even if not requested */
 #define PIDFD_INFO_CGROUPID		(1UL << 2) /* Always returned if available, even if not requested */
+#define PIDFD_INFO_EXIT			(1UL << 3) /* Always returned if available, even if not requested */
 
 #define PIDFD_INFO_SIZE_VER0		64 /* sizeof first published struct */
 
@@ -86,7 +87,7 @@  struct pidfd_info {
 	__u32 sgid;
 	__u32 fsuid;
 	__u32 fsgid;
-	__u32 spare0[1];
+	__s32 exit_code;
 };
 
 #define PIDFS_IOCTL_MAGIC 0xFF
diff --git a/kernel/exit.c b/kernel/exit.c
index 98d292120296..9916305e34d3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -250,12 +250,12 @@  void release_task(struct task_struct *p)
 	dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
 	rcu_read_unlock();
 
+	pidfs_exit(p);
 	cgroup_release(p);
 
 	write_lock_irq(&tasklist_lock);
 	ptrace_release_task(p);
 	thread_pid = get_pid(p->thread_pid);
-	pidfs_exit(p);
 	__exit_signal(p);
 
 	/*