diff mbox series

[RFC] pidfs: ensure consistent ENOENT/ESRCH reporting

Message ID 20250409-rohstoff-ungnade-d1afa571f32c@brauner (mailing list archive)
State New
Headers show
Series [RFC] pidfs: ensure consistent ENOENT/ESRCH reporting | expand

Commit Message

Christian Brauner April 9, 2025, 6:18 p.m. UTC
In a prior patch series we tried to cleanly differentiate between:

(1) The task has already been reaped.
(2) The caller requested a pidfd for a thread-group leader but the pid
    actually references a struct pid that isn't used as a thread-group
    leader.

as this was causing issues for non-threaded workloads.

But there are cases where the current simple logic is wrong. Specifically,
if the pid was a leader pid and the check races with __unhash_process().

Stabilize this by using a sequence counter associated with tasklist_lock
and retry while we're in __unhash_process(). The seqcounter might be
useful independent of pidfs.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/pid.h |  1 +
 kernel/exit.c       | 11 +++++++++++
 kernel/fork.c       | 22 ++++++++++++----------
 kernel/pid.c        |  1 +
 4 files changed, 25 insertions(+), 10 deletions(-)

Comments

Oleg Nesterov April 9, 2025, 6:40 p.m. UTC | #1
Christian,

I will actually read your patch tomorrow, but at first glance

On 04/09, Christian Brauner wrote:
>
> The seqcounter might be
> useful independent of pidfs.

Are you sure? ;) to me the new pid->pid_seq needs more justification...

Again, can't we use pid->wait_pidfd->lock if we want to avoid the
(minor) problem with the wrong ENOENT?

or even signal->siglock, although in this case we will need
pid_task() + lock_task_sighand()...

Oleg.

> Signed-off-by: Christian Brauner <brauner@kernel.org>
> ---
>  include/linux/pid.h |  1 +
>  kernel/exit.c       | 11 +++++++++++
>  kernel/fork.c       | 22 ++++++++++++----------
>  kernel/pid.c        |  1 +
>  4 files changed, 25 insertions(+), 10 deletions(-)
> 
> diff --git a/include/linux/pid.h b/include/linux/pid.h
> index 311ecebd7d56..b54a4c1ef602 100644
> --- a/include/linux/pid.h
> +++ b/include/linux/pid.h
> @@ -65,6 +65,7 @@ struct pid
>  	struct hlist_head inodes;
>  	/* wait queue for pidfd notifications */
>  	wait_queue_head_t wait_pidfd;
> +	seqcount_rwlock_t pid_seq;
>  	struct rcu_head rcu;
>  	struct upid numbers[];
>  };
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 1b51dc099f1e..8050572fe682 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -133,17 +133,28 @@ struct release_task_post {
>  static void __unhash_process(struct release_task_post *post, struct task_struct *p,
>  			     bool group_dead)
>  {
> +	struct pid *pid;
> +
> +	lockdep_assert_held_write(&tasklist_lock);
> +
>  	nr_threads--;
> +
> +	pid = task_pid(p);
> +	raw_write_seqcount_begin(&pid->pid_seq);
>  	detach_pid(post->pids, p, PIDTYPE_PID);
>  	if (group_dead) {
>  		detach_pid(post->pids, p, PIDTYPE_TGID);
>  		detach_pid(post->pids, p, PIDTYPE_PGID);
>  		detach_pid(post->pids, p, PIDTYPE_SID);
> +	}
> +	raw_write_seqcount_end(&pid->pid_seq);
>  
> +	if (group_dead) {
>  		list_del_rcu(&p->tasks);
>  		list_del_init(&p->sibling);
>  		__this_cpu_dec(process_counts);
>  	}
> +
>  	list_del_rcu(&p->thread_node);
>  }
>  
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 4a2080b968c8..1480bf6f5f38 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -2109,24 +2109,26 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
>  int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
>  {
>  	int err = 0;
> +	unsigned int seq;
>  
> -	if (!(flags & PIDFD_THREAD)) {
> +	do {
> +		seq = raw_seqcount_begin(&pid->pid_seq);
>  		/*
>  		 * If this is struct pid isn't used as a thread-group
>  		 * leader pid but the caller requested to create a
>  		 * thread-group leader pidfd then report ENOENT to the
>  		 * caller as a hint.
>  		 */
> -		if (!pid_has_task(pid, PIDTYPE_TGID))
> +		if (!(flags & PIDFD_THREAD) && !pid_has_task(pid, PIDTYPE_TGID))
>  			err = -ENOENT;
> -	}
> -
> -	/*
> -	 * If this wasn't a thread-group leader struct pid or the task
> -	 * got reaped in the meantime report -ESRCH to userspace.
> -	 */
> -	if (!pid_has_task(pid, PIDTYPE_PID))
> -		err = -ESRCH;
> +		/*
> +		 * If this wasn't a thread-group leader struct pid or
> +		 * the task got reaped in the meantime report -ESRCH to
> +		 * userspace.
> +		 */
> +		if (!pid_has_task(pid, PIDTYPE_PID))
> +			err = -ESRCH;
> +	} while (read_seqcount_retry(&pid->pid_seq, seq));
>  	if (err)
>  		return err;
>  
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 4ac2ce46817f..bbca61f62faa 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -271,6 +271,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
>  	upid = pid->numbers + ns->level;
>  	idr_preload(GFP_KERNEL);
>  	spin_lock(&pidmap_lock);
> +	seqcount_rwlock_init(&pid->pid_seq, &tasklist_lock);
>  	if (!(ns->pid_allocated & PIDNS_ADDING))
>  		goto out_unlock;
>  	pidfs_add_pid(pid);
> -- 
> 2.47.2
>
Oleg Nesterov April 10, 2025, 10:18 a.m. UTC | #2
On 04/09, Oleg Nesterov wrote:
>
> Christian,
>
> I will actually read your patch tomorrow, but at first glance
>
> On 04/09, Christian Brauner wrote:
> >
> > The seqcounter might be
> > useful independent of pidfs.
>
> Are you sure? ;) to me the new pid->pid_seq needs more justification...
>
> Again, can't we use pid->wait_pidfd->lock if we want to avoid the
> (minor) problem with the wrong ENOENT?

I mean

	int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
	{
		int err = 0;

		spin_lock_irq(&pid->wait_pidfd->lock);

		if (!pid_has_task(pid, PIDTYPE_PID))
			err = -ESRCH;
		else if (!(flags & PIDFD_THREAD) && !pid_has_task(pid, PIDTYPE_TGID))
			err = -ENOENT;

		spin_unlock_irq(&pid->wait_pidfd->lock);

		return err ?: __pidfd_prepare(pid, flags, ret);
	}

To remind, detach_pid(pid, PIDTYPE_PID) does wake_up_all(&pid->wait_pidfd) and
takes pid->wait_pidfd->lock.

So if pid_has_task(PIDTYPE_PID) succeeds, __unhash_process() -> detach_pid(TGID)
is not possible until we drop pid->wait_pidfd->lock.

If detach_pid(PIDTYPE_PID) was already called and has passed wake_up_all(),
pid_has_task(PIDTYPE_PID) can't succeed.

Oleg.
Christian Brauner April 10, 2025, 10:43 a.m. UTC | #3
On Thu, Apr 10, 2025 at 12:18:01PM +0200, Oleg Nesterov wrote:
> On 04/09, Oleg Nesterov wrote:
> >
> > Christian,
> >
> > I will actually read your patch tomorrow, but at first glance
> >
> > On 04/09, Christian Brauner wrote:
> > >
> > > The seqcounter might be
> > > useful independent of pidfs.
> >
> > Are you sure? ;) to me the new pid->pid_seq needs more justification...

Yeah, pretty much. I'd make use of this in other cases where we need to
detect concurrent changes to struct pid without having to take any
locks. Multi-threaded exec in de_exec() comes to mind as well.

> > Again, can't we use pid->wait_pidfd->lock if we want to avoid the
> > (minor) problem with the wrong ENOENT?
> 
> I mean
> 
> 	int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
> 	{
> 		int err = 0;
> 
> 		spin_lock_irq(&pid->wait_pidfd->lock);
> 
> 		if (!pid_has_task(pid, PIDTYPE_PID))
> 			err = -ESRCH;
> 		else if (!(flags & PIDFD_THREAD) && !pid_has_task(pid, PIDTYPE_TGID))
> 			err = -ENOENT;
> 
> 		spin_unlock_irq(&pid->wait_pidfd->lock);
> 
> 		return err ?: __pidfd_prepare(pid, flags, ret);
> 	}
> 
> To remind, detach_pid(pid, PIDTYPE_PID) does wake_up_all(&pid->wait_pidfd) and
> takes pid->wait_pidfd->lock.
> 
> So if pid_has_task(PIDTYPE_PID) succeeds, __unhash_process() -> detach_pid(TGID)
> is not possible until we drop pid->wait_pidfd->lock.
> 
> If detach_pid(PIDTYPE_PID) was already called and have passed wake_up_all(),
> pid_has_task(PIDTYPE_PID) can't succeed.

I know. I was trying to avoid having to take the lock and just make this
lockless. But if you think we should use this lock here instead I'm
willing to do this. I just find the sequence counter more elegant than
the spin_lock_irq().

And note that it doesn't grow struct pid. There's a 4 byte hole I would
place it into just before struct dentry *. So there's no downside to
this imho and it would give pidfds a reliable way to detect relevant
concurrent changes locklessly without penalizing other critical paths
(e.g., under tasklist_lock) in the kernel.
Oleg Nesterov April 10, 2025, 1:10 p.m. UTC | #4
On 04/10, Christian Brauner wrote:
>
> On Thu, Apr 10, 2025 at 12:18:01PM +0200, Oleg Nesterov wrote:
> > On 04/09, Oleg Nesterov wrote:
> > >
> > > On 04/09, Christian Brauner wrote:
> > > >
> > > > The seqcounter might be
> > > > useful independent of pidfs.
> > >
> > > Are you sure? ;) to me the new pid->pid_seq needs more justification...
>
> Yeah, pretty much. I'd make use of this in other cases where we need to
> detect concurrent changes to struct pid without having to take any
> locks. Multi-threaded exec in de_exec() comes to mind as well.

Perhaps you are right, but so far I am still not sure it makes sense.
And we can always add it later if we have another (more convincing)
use-case.

> > To remind, detach_pid(pid, PIDTYPE_PID) does wake_up_all(&pid->wait_pidfd) and
> > takes pid->wait_pidfd->lock.
> >
> > So if pid_has_task(PIDTYPE_PID) succeeds, __unhash_process() -> detach_pid(TGID)
> > is not possible until we drop pid->wait_pidfd->lock.
> >
> > If detach_pid(PIDTYPE_PID) was already called and have passed wake_up_all(),
> > pid_has_task(PIDTYPE_PID) can't succeed.
>
> I know. I was trying to avoid having to take the lock and just make this
> lockless. But if you think we should use this lock here instead I'm
> willing to do this. I just find the sequence counter more elegant than
> the spin_lock_irq().

This is subjective, and quite possibly I am wrong. But yes, I'd prefer
to (ab)use pid->wait_pidfd->lock in pidfd_prepare() for now and not
penalize __unhash_process(). Simply because this is simpler.

If you really dislike taking wait_pidfd->lock, we can add mb() into
__unhash_process() or even smp_mb__after_spinlock() into __change_pid(),
but this will need a lengthy comment...



As for your patch... it doesn't apply on top of 3/4, but I guess it
is clear what it does, and (unfortunately ;) it looks correct, so
I won't insist too much. See a couple of nits below.

> this imho and it would give pidfds a reliable way to detect relevant
> concurrent changes locklessly without penalizing other critical paths
> (e.g., under tasklist_lock) in the kernel.

Can't resist... Note that raw_seqcount_begin() in pidfd_prepare() will
take/drop tasklist_lock if it races with __unhash_process() on PREEMPT_RT.
Yes, this is unlikely case, but still...

Now. Unless I misread your patch, pidfd_prepare() does "err = 0" only
once before the main loop. And this is correct. But this means that
we do not need the do/while loop.

If read_seqcount_retry() returns true, we can safely return -ESRCH. So
we can do

	seq = raw_seqcount_begin(&pid->pid_seq);

	if (!PIDFD_THREAD && !pid_has_task(PIDTYPE_TGID))
		err = -ENOENT;

	if (!pid_has_task(PIDTYPE_PID))
		err = -ESRCH;

	if (read_seqcount_retry(&pid->pid_seq, seq))
		err = -ESRCH;

In fact we don't even need raw_seqcount_begin(), we could use
raw_seqcount_try_begin().

And why seqcount_rwlock_t? A plain seqcount_t can equally work.


And, if we use seqcount_rwlock_t,

	lockdep_assert_held_write(&tasklist_lock);
	...
	raw_write_seqcount_begin(&pid->pid_seq);

in __unhash_process() looks a bit strange. I'd suggest to use
write_seqcount_begin() which does seqprop_assert() and kill
lockdep_assert_held_write().

Oleg.
Christian Brauner April 11, 2025, 11:08 a.m. UTC | #5
On Thu, Apr 10, 2025 at 10:24:22PM +0200, Christian Brauner wrote:
> On Thu, Apr 10, 2025 at 10:05:58PM +0200, Christian Brauner wrote:
> > On Thu, Apr 10, 2025 at 03:10:09PM +0200, Oleg Nesterov wrote:
> > > On 04/10, Christian Brauner wrote:
> > > >
> > > > On Thu, Apr 10, 2025 at 12:18:01PM +0200, Oleg Nesterov wrote:
> > > > > On 04/09, Oleg Nesterov wrote:
> > > > > >
> > > > > > On 04/09, Christian Brauner wrote:
> > > > > > >
> > > > > > > The seqcounter might be
> > > > > > > useful independent of pidfs.
> > > > > >
> > > > > > Are you sure? ;) to me the new pid->pid_seq needs more justification...
> > > >
> > > > Yeah, pretty much. I'd make use of this in other cases where we need to
> > > > detect concurrent changes to struct pid without having to take any
> > > > locks. Multi-threaded exec in de_exec() comes to mind as well.
> > > 
> > > Perhaps you are right, but so far I am still not sure it makes sense.
> > > And we can always add it later if we have another (more convincing)
> > > use-case.
> > > 
> > > > > To remind, detach_pid(pid, PIDTYPE_PID) does wake_up_all(&pid->wait_pidfd) and
> > > > > takes pid->wait_pidfd->lock.
> > > > >
> > > > > So if pid_has_task(PIDTYPE_PID) succeeds, __unhash_process() -> detach_pid(TGID)
> > > > > is not possible until we drop pid->wait_pidfd->lock.
> > > > >
> > > > > If detach_pid(PIDTYPE_PID) was already called and have passed wake_up_all(),
> > > > > pid_has_task(PIDTYPE_PID) can't succeed.
> > > >
> > > > I know. I was trying to avoid having to take the lock and just make this
> > > > lockless. But if you think we should use this lock here instead I'm
> > > > willing to do this. I just find the sequence counter more elegant than
> > > > the spin_lock_irq().
> > > 
> > > This is subjective, and quite possibly I am wrong. But yes, I'd prefer
> > > to (ab)use pid->wait_pidfd->lock in pidfd_prepare() for now and not
> > > penalize __unhash_process(). Simply because this is simpler.
> 
> Looking close at this. Why is:
> 
>         if (type == PIDTYPE_PID) {
>                 WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID));
>                 wake_up_all(&pid->wait_pidfd);
>         }
> 
> located in __change_pid()? The only valid call to __change_pid() with a NULL
> argument and PIDTYPE_PID is from __unhash_process(), no?

We used to perform free_pid() directly from __change_pid() so prior to
v6.15 changes it wasn't possible. Now that we free the pids separately let's
just move the notification into __unhash_process(). I have a patch ready
for this.

> 
> So why isn't this in __unhash_process() where it's immediately obvious
> that it's the only valid place this can currently be called from?
> 
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 1b51dc099f1e..d92e8bee0ab7 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -135,6 +135,7 @@ static void __unhash_process(struct release_task_post *post, struct task_struct
>  {
>         nr_threads--;
>         detach_pid(post->pids, p, PIDTYPE_PID);
> +       wake_up_all(&post->pids[PIDTYPE_PID]->wait_pidfd);
>         if (group_dead) {
>                 detach_pid(post->pids, p, PIDTYPE_TGID);
>                 detach_pid(post->pids, p, PIDTYPE_PGID);
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 4ac2ce46817f..26f1e136f017 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -359,11 +359,6 @@ static void __change_pid(struct pid **pids, struct task_struct *task,
>         hlist_del_rcu(&task->pid_links[type]);
>         *pid_ptr = new;
> 
> -       if (type == PIDTYPE_PID) {
> -               WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID));
> -               wake_up_all(&pid->wait_pidfd);
> -       }
> -
>         for (tmp = PIDTYPE_MAX; --tmp >= 0; )
>                 if (pid_has_task(pid, tmp))
>                         return;
> 
> I'm probably missing something obvious.
Oleg Nesterov April 11, 2025, 11:25 a.m. UTC | #6
On 04/11, Christian Brauner wrote:
>
> > Looking close at this. Why is:
> >
> >         if (type == PIDTYPE_PID) {
> >                 WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID));
> >                 wake_up_all(&pid->wait_pidfd);
> >         }
> >
> > located in __change_pid()? The only valid call to __change_pid() with a NULL
> > argument and PIDTYPE_PID is from __unhash_process(), no?
>
> We used to perform free_pid() directly from __change_pid() so prior to
> v6.15 changes it wasn't possible.

Yes, exactly ;)

> Now that we free the pids separately let's
> just move the notification into __unhash_process(). I have a patch ready
> for this.

Agreed,

Acked-by: Oleg Nesterov <oleg@redhat.com>
Oleg Nesterov April 11, 2025, 11:41 a.m. UTC | #7
On 04/11, Oleg Nesterov wrote:
>
> On 04/11, Christian Brauner wrote:
> >
> > > Looking close at this. Why is:
> > >
> > >         if (type == PIDTYPE_PID) {
> > >                 WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID));
> > >                 wake_up_all(&pid->wait_pidfd);
> > >         }
> > >
> > > located in __change_pid()? The only valid call to __change_pid() with a NULL
> > > argument and PIDTYPE_PID is from __unhash_process(), no?
> >
> > We used to perform free_pid() directly from __change_pid() so prior to
> > v6.15 changes it wasn't possible.
>
> Yes, exactly ;)

To clarify, it was actually possible because the caller, release_task(),
does

	thread_pid = get_pid(p->thread_pid);

before __exit_signal() and detach_pid(PIDTYPE_PID) uses the same
task_struct->thread_pid. But I didn't want to rely on this fact.

And it seems we can do another cleanup... We can kill the no longer
needed get_pid/put_pid in release_task(). I'll send the patch.

> > Now that we free the pids separately let's
> > just move the notification into __unhash_process(). I have a patch ready
> > for this.
>
> Agreed,
>
> Acked-by: Oleg Nesterov <oleg@redhat.com>
diff mbox series

Patch

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 311ecebd7d56..b54a4c1ef602 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -65,6 +65,7 @@  struct pid
 	struct hlist_head inodes;
 	/* wait queue for pidfd notifications */
 	wait_queue_head_t wait_pidfd;
+	seqcount_rwlock_t pid_seq;
 	struct rcu_head rcu;
 	struct upid numbers[];
 };
diff --git a/kernel/exit.c b/kernel/exit.c
index 1b51dc099f1e..8050572fe682 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -133,17 +133,28 @@  struct release_task_post {
 static void __unhash_process(struct release_task_post *post, struct task_struct *p,
 			     bool group_dead)
 {
+	struct pid *pid;
+
+	lockdep_assert_held_write(&tasklist_lock);
+
 	nr_threads--;
+
+	pid = task_pid(p);
+	raw_write_seqcount_begin(&pid->pid_seq);
 	detach_pid(post->pids, p, PIDTYPE_PID);
 	if (group_dead) {
 		detach_pid(post->pids, p, PIDTYPE_TGID);
 		detach_pid(post->pids, p, PIDTYPE_PGID);
 		detach_pid(post->pids, p, PIDTYPE_SID);
+	}
+	raw_write_seqcount_end(&pid->pid_seq);
 
+	if (group_dead) {
 		list_del_rcu(&p->tasks);
 		list_del_init(&p->sibling);
 		__this_cpu_dec(process_counts);
 	}
+
 	list_del_rcu(&p->thread_node);
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 4a2080b968c8..1480bf6f5f38 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2109,24 +2109,26 @@  static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
 int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
 {
 	int err = 0;
+	unsigned int seq;
 
-	if (!(flags & PIDFD_THREAD)) {
+	do {
+		seq = raw_seqcount_begin(&pid->pid_seq);
 		/*
 		 * If this is struct pid isn't used as a thread-group
 		 * leader pid but the caller requested to create a
 		 * thread-group leader pidfd then report ENOENT to the
 		 * caller as a hint.
 		 */
-		if (!pid_has_task(pid, PIDTYPE_TGID))
+		if (!(flags & PIDFD_THREAD) && !pid_has_task(pid, PIDTYPE_TGID))
 			err = -ENOENT;
-	}
-
-	/*
-	 * If this wasn't a thread-group leader struct pid or the task
-	 * got reaped in the meantime report -ESRCH to userspace.
-	 */
-	if (!pid_has_task(pid, PIDTYPE_PID))
-		err = -ESRCH;
+		/*
+		 * If this wasn't a thread-group leader struct pid or
+		 * the task got reaped in the meantime report -ESRCH to
+		 * userspace.
+		 */
+		if (!pid_has_task(pid, PIDTYPE_PID))
+			err = -ESRCH;
+	} while (read_seqcount_retry(&pid->pid_seq, seq));
 	if (err)
 		return err;
 
diff --git a/kernel/pid.c b/kernel/pid.c
index 4ac2ce46817f..bbca61f62faa 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -271,6 +271,7 @@  struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 	upid = pid->numbers + ns->level;
 	idr_preload(GFP_KERNEL);
 	spin_lock(&pidmap_lock);
+	seqcount_rwlock_init(&pid->pid_seq, &tasklist_lock);
 	if (!(ns->pid_allocated & PIDNS_ADDING))
 		goto out_unlock;
 	pidfs_add_pid(pid);