diff mbox series

[v3,13/16] exit: Factor thread_group_exited out of pidfd_poll

Message ID 20200702164140.4468-13-ebiederm@xmission.com (mailing list archive)
State New, archived
Headers show
Series Make the user mode driver code a better citizen | expand

Commit Message

Eric W. Biederman July 2, 2020, 4:41 p.m. UTC
Create an independent helper thread_group_exited report return true
when all threads have passed exit_notify in do_exit.  AKA all of the
threads are at least zombies and might be dead or completely gone.

Create this helper by taking the logic out of pidfd_poll where
it is already tested, and adding a missing READ_ONCE on
the read of task->exit_state.

I will be changing the user mode driver code to use this same logic
to know when a user mode driver needs to be restarted.

Place the new helper thread_group_exited in kernel/exit.c and
EXPORT it so it can be used by modules.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/sched/signal.h |  2 ++
 kernel/exit.c                | 24 ++++++++++++++++++++++++
 kernel/fork.c                |  6 +-----
 3 files changed, 27 insertions(+), 5 deletions(-)

Comments

Alexei Starovoitov July 3, 2020, 8:30 p.m. UTC | #1
On Thu, Jul 02, 2020 at 11:41:37AM -0500, Eric W. Biederman wrote:
> Create an independent helper thread_group_exited report return true
> when all threads have passed exit_notify in do_exit.  AKA all of the
> threads are at least zombies and might be dead or completely gone.
> 
> Create this helper by taking the logic out of pidfd_poll where
> it is already tested, and adding a missing READ_ONCE on
> the read of task->exit_state.
> 
> I will be changing the user mode driver code to use this same logic
> to know when a user mode driver needs to be restarted.
> 
> Place the new helper thread_group_exited in kernel/exit.c and
> EXPORT it so it can be used by modules.
> 
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> ---
>  include/linux/sched/signal.h |  2 ++
>  kernel/exit.c                | 24 ++++++++++++++++++++++++
>  kernel/fork.c                |  6 +-----
>  3 files changed, 27 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> index 0ee5e696c5d8..1bad18a1d8ba 100644
> --- a/include/linux/sched/signal.h
> +++ b/include/linux/sched/signal.h
> @@ -674,6 +674,8 @@ static inline int thread_group_empty(struct task_struct *p)
>  #define delay_group_leader(p) \
>  		(thread_group_leader(p) && !thread_group_empty(p))
>  
> +extern bool thread_group_exited(struct pid *pid);
> +
>  extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
>  							unsigned long *flags);
>  
> diff --git a/kernel/exit.c b/kernel/exit.c
> index d3294b611df1..a7f112feb0f6 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -1713,6 +1713,30 @@ COMPAT_SYSCALL_DEFINE5(waitid,
>  }
>  #endif
>  
> +/**
> + * thread_group_exited - check that a thread group has exited
> + * @pid: tgid of thread group to be checked.
> + *
> + * Test if thread group is has exited (all threads are zombies, dead
> + * or completely gone).
> + *
> + * Return: true if the thread group has exited. false otherwise.
> + */
> +bool thread_group_exited(struct pid *pid)
> +{
> +	struct task_struct *task;
> +	bool exited;
> +
> +	rcu_read_lock();
> +	task = pid_task(pid, PIDTYPE_PID);
> +	exited = !task ||
> +		(READ_ONCE(task->exit_state) && thread_group_empty(task));
> +	rcu_read_unlock();
> +
> +	return exited;
> +}

I'm not sure why you think READ_ONCE was missing.
It's different in wait_consider_task() where READ_ONCE is needed because
of multiple checks. Here it's done once.

The rest all looks good to me. Tested with and without bpf_preload patches.
Feel free to create a frozen branch with this set.

btw I'll be offline starting tomorrow for a week.
Will catch up with threads afterwards.
Eric W. Biederman July 3, 2020, 9:37 p.m. UTC | #2
Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Thu, Jul 02, 2020 at 11:41:37AM -0500, Eric W. Biederman wrote:
>> Create an independent helper thread_group_exited report return true
>> when all threads have passed exit_notify in do_exit.  AKA all of the
>> threads are at least zombies and might be dead or completely gone.
>> 
>> Create this helper by taking the logic out of pidfd_poll where
>> it is already tested, and adding a missing READ_ONCE on
>> the read of task->exit_state.
>> 
>> I will be changing the user mode driver code to use this same logic
>> to know when a user mode driver needs to be restarted.
>> 
>> Place the new helper thread_group_exited in kernel/exit.c and
>> EXPORT it so it can be used by modules.
>> 
>> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
>> ---
>>  include/linux/sched/signal.h |  2 ++
>>  kernel/exit.c                | 24 ++++++++++++++++++++++++
>>  kernel/fork.c                |  6 +-----
>>  3 files changed, 27 insertions(+), 5 deletions(-)
>> 
>> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
>> index 0ee5e696c5d8..1bad18a1d8ba 100644
>> --- a/include/linux/sched/signal.h
>> +++ b/include/linux/sched/signal.h
>> @@ -674,6 +674,8 @@ static inline int thread_group_empty(struct task_struct *p)
>>  #define delay_group_leader(p) \
>>  		(thread_group_leader(p) && !thread_group_empty(p))
>>  
>> +extern bool thread_group_exited(struct pid *pid);
>> +
>>  extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
>>  							unsigned long *flags);
>>  
>> diff --git a/kernel/exit.c b/kernel/exit.c
>> index d3294b611df1..a7f112feb0f6 100644
>> --- a/kernel/exit.c
>> +++ b/kernel/exit.c
>> @@ -1713,6 +1713,30 @@ COMPAT_SYSCALL_DEFINE5(waitid,
>>  }
>>  #endif
>>  
>> +/**
>> + * thread_group_exited - check that a thread group has exited
>> + * @pid: tgid of thread group to be checked.
>> + *
>> + * Test if thread group is has exited (all threads are zombies, dead
>> + * or completely gone).
>> + *
>> + * Return: true if the thread group has exited. false otherwise.
>> + */
>> +bool thread_group_exited(struct pid *pid)
>> +{
>> +	struct task_struct *task;
>> +	bool exited;
>> +
>> +	rcu_read_lock();
>> +	task = pid_task(pid, PIDTYPE_PID);
>> +	exited = !task ||
>> +		(READ_ONCE(task->exit_state) && thread_group_empty(task));
>> +	rcu_read_unlock();
>> +
>> +	return exited;
>> +}
>
> I'm not sure why you think READ_ONCE was missing.
> It's different in wait_consider_task() where READ_ONCE is needed because
> of multiple checks. Here it's done once.

In practice it probably has no effect on the generated code.  But
READ_ONCE is about telling the compiler not to be clever.  Don't use
tearing loads or stores etc.  When all of the other readers are using
READ_ONCE I just get nervous if we have a case that doesn't.

> The rest all looks good to me. Tested with and without bpf_preload patches.
> Feel free to create a frozen branch with this set.

Can I have your Tested-by and Acked-by?

> btw I'll be offline starting tomorrow for a week.
> Will catch up with threads afterwards.

Eric
Alexei Starovoitov July 4, 2020, 12:03 a.m. UTC | #3
On Fri, Jul 03, 2020 at 04:37:47PM -0500, Eric W. Biederman wrote:
> 
> > The rest all looks good to me. Tested with and without bpf_preload patches.
> > Feel free to create a frozen branch with this set.
> 
> Can I have your Tested-by and Acked-by?

For the set:
Acked-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Alexei Starovoitov <ast@kernel.org>
Christian Brauner July 4, 2020, 3:50 p.m. UTC | #4
On Fri, Jul 03, 2020 at 04:37:47PM -0500, Eric W. Biederman wrote:
> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> 
> > On Thu, Jul 02, 2020 at 11:41:37AM -0500, Eric W. Biederman wrote:
> >> Create an independent helper thread_group_exited report return true
> >> when all threads have passed exit_notify in do_exit.  AKA all of the
> >> threads are at least zombies and might be dead or completely gone.
> >> 
> >> Create this helper by taking the logic out of pidfd_poll where
> >> it is already tested, and adding a missing READ_ONCE on
> >> the read of task->exit_state.
> >> 
> >> I will be changing the user mode driver code to use this same logic
> >> to know when a user mode driver needs to be restarted.
> >> 
> >> Place the new helper thread_group_exited in kernel/exit.c and
> >> EXPORT it so it can be used by modules.
> >> 
> >> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> >> ---
> >>  include/linux/sched/signal.h |  2 ++
> >>  kernel/exit.c                | 24 ++++++++++++++++++++++++
> >>  kernel/fork.c                |  6 +-----
> >>  3 files changed, 27 insertions(+), 5 deletions(-)
> >> 
> >> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> >> index 0ee5e696c5d8..1bad18a1d8ba 100644
> >> --- a/include/linux/sched/signal.h
> >> +++ b/include/linux/sched/signal.h
> >> @@ -674,6 +674,8 @@ static inline int thread_group_empty(struct task_struct *p)
> >>  #define delay_group_leader(p) \
> >>  		(thread_group_leader(p) && !thread_group_empty(p))
> >>  
> >> +extern bool thread_group_exited(struct pid *pid);
> >> +
> >>  extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
> >>  							unsigned long *flags);
> >>  
> >> diff --git a/kernel/exit.c b/kernel/exit.c
> >> index d3294b611df1..a7f112feb0f6 100644
> >> --- a/kernel/exit.c
> >> +++ b/kernel/exit.c
> >> @@ -1713,6 +1713,30 @@ COMPAT_SYSCALL_DEFINE5(waitid,
> >>  }
> >>  #endif
> >>  
> >> +/**
> >> + * thread_group_exited - check that a thread group has exited
> >> + * @pid: tgid of thread group to be checked.
> >> + *
> >> + * Test if thread group is has exited (all threads are zombies, dead
> >> + * or completely gone).
> >> + *
> >> + * Return: true if the thread group has exited. false otherwise.
> >> + */
> >> +bool thread_group_exited(struct pid *pid)
> >> +{
> >> +	struct task_struct *task;
> >> +	bool exited;
> >> +
> >> +	rcu_read_lock();
> >> +	task = pid_task(pid, PIDTYPE_PID);
> >> +	exited = !task ||
> >> +		(READ_ONCE(task->exit_state) && thread_group_empty(task));
> >> +	rcu_read_unlock();
> >> +
> >> +	return exited;
> >> +}
> >
> > I'm not sure why you think READ_ONCE was missing.
> > It's different in wait_consider_task() where READ_ONCE is needed because
> > of multiple checks. Here it's done once.
> 
> In practice it probably has no effect on the generated code.  But
> READ_ONCE is about telling the compiler not to be clever.  Don't use
> tearing loads or stores etc.  When all of the other readers are using
> READ_ONCE I just get nervous if we have a case that doesn't.

That's not true. The only place where READ_ONCE(->exit_state) is used is
in wait_consider_task() and nowhere else. We had that discussion a while
ago where I or someone proposed to simply place a READ_ONCE() around all
accesses to exit_state for the sake of kcsan and we agreed that it's
unnecessary and not to do this.
But it obviously doesn't hurt to have it.

Christian
Christian Brauner July 4, 2020, 4 p.m. UTC | #5
On Thu, Jul 02, 2020 at 11:41:37AM -0500, Eric W. Biederman wrote:
> Create an independent helper thread_group_exited report return true

s/report return/which reports/

> when all threads have passed exit_notify in do_exit.  AKA all of the
> threads are at least zombies and might be dead or completely gone.
> 
> Create this helper by taking the logic out of pidfd_poll where
> it is already tested, and adding a missing READ_ONCE on
> the read of task->exit_state.

I would prefer to have this comment dropped as this read_once() is not
missing as you can see from the comments elsewhere in this thread.

> 
> I will be changing the user mode driver code to use this same logic
> to know when a user mode driver needs to be restarted.
> 
> Place the new helper thread_group_exited in kernel/exit.c and
> EXPORT it so it can be used by modules.
> 
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> ---

Minus the typos above and below, this looks good and passes the pidfd
and process test-suite.
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>

Thanks!
Christian

>  include/linux/sched/signal.h |  2 ++
>  kernel/exit.c                | 24 ++++++++++++++++++++++++
>  kernel/fork.c                |  6 +-----
>  3 files changed, 27 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> index 0ee5e696c5d8..1bad18a1d8ba 100644
> --- a/include/linux/sched/signal.h
> +++ b/include/linux/sched/signal.h
> @@ -674,6 +674,8 @@ static inline int thread_group_empty(struct task_struct *p)
>  #define delay_group_leader(p) \
>  		(thread_group_leader(p) && !thread_group_empty(p))
>  
> +extern bool thread_group_exited(struct pid *pid);
> +
>  extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
>  							unsigned long *flags);
>  
> diff --git a/kernel/exit.c b/kernel/exit.c
> index d3294b611df1..a7f112feb0f6 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -1713,6 +1713,30 @@ COMPAT_SYSCALL_DEFINE5(waitid,
>  }
>  #endif
>  
> +/**
> + * thread_group_exited - check that a thread group has exited
> + * @pid: tgid of thread group to be checked.
> + *
> + * Test if thread group is has exited (all threads are zombies, dead

s/is has exited/has exited/

> + * or completely gone).
> + *
> + * Return: true if the thread group has exited. false otherwise.
> + */
> +bool thread_group_exited(struct pid *pid)
> +{
> +	struct task_struct *task;
> +	bool exited;
> +
> +	rcu_read_lock();
> +	task = pid_task(pid, PIDTYPE_PID);
> +	exited = !task ||
> +		(READ_ONCE(task->exit_state) && thread_group_empty(task));
> +	rcu_read_unlock();
> +
> +	return exited;
> +}
> +EXPORT_SYMBOL(thread_group_exited);
> +
>  __weak void abort(void)
>  {
>  	BUG();
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 142b23645d82..bf215af7a904 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1787,22 +1787,18 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
>   */
>  static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
>  {
> -	struct task_struct *task;
>  	struct pid *pid = file->private_data;
>  	__poll_t poll_flags = 0;
>  
>  	poll_wait(file, &pid->wait_pidfd, pts);
>  
> -	rcu_read_lock();
> -	task = pid_task(pid, PIDTYPE_PID);
>  	/*
>  	 * Inform pollers only when the whole thread group exits.
>  	 * If the thread group leader exits before all other threads in the
>  	 * group, then poll(2) should block, similar to the wait(2) family.
>  	 */
> -	if (!task || (task->exit_state && thread_group_empty(task)))
> +	if (thread_group_exited(pid))
>  		poll_flags = EPOLLIN | EPOLLRDNORM;
> -	rcu_read_unlock();
>  
>  	return poll_flags;
>  }
> -- 
> 2.25.0
>
Eric W. Biederman July 7, 2020, 5:09 p.m. UTC | #6
Christian Brauner <christian.brauner@ubuntu.com> writes:

> On Fri, Jul 03, 2020 at 04:37:47PM -0500, Eric W. Biederman wrote:
>> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
>> 
>> > On Thu, Jul 02, 2020 at 11:41:37AM -0500, Eric W. Biederman wrote:
>> >> Create an independent helper thread_group_exited report return true
>> >> when all threads have passed exit_notify in do_exit.  AKA all of the
>> >> threads are at least zombies and might be dead or completely gone.
>> >> 
>> >> Create this helper by taking the logic out of pidfd_poll where
>> >> it is already tested, and adding a missing READ_ONCE on
>> >> the read of task->exit_state.
>> >> 
>> >> I will be changing the user mode driver code to use this same logic
>> >> to know when a user mode driver needs to be restarted.
>> >> 
>> >> Place the new helper thread_group_exited in kernel/exit.c and
>> >> EXPORT it so it can be used by modules.
>> >> 
>> >> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
>> >> ---
>> >>  include/linux/sched/signal.h |  2 ++
>> >>  kernel/exit.c                | 24 ++++++++++++++++++++++++
>> >>  kernel/fork.c                |  6 +-----
>> >>  3 files changed, 27 insertions(+), 5 deletions(-)
>> >> 
>> >> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
>> >> index 0ee5e696c5d8..1bad18a1d8ba 100644
>> >> --- a/include/linux/sched/signal.h
>> >> +++ b/include/linux/sched/signal.h
>> >> @@ -674,6 +674,8 @@ static inline int thread_group_empty(struct task_struct *p)
>> >>  #define delay_group_leader(p) \
>> >>  		(thread_group_leader(p) && !thread_group_empty(p))
>> >>  
>> >> +extern bool thread_group_exited(struct pid *pid);
>> >> +
>> >>  extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
>> >>  							unsigned long *flags);
>> >>  
>> >> diff --git a/kernel/exit.c b/kernel/exit.c
>> >> index d3294b611df1..a7f112feb0f6 100644
>> >> --- a/kernel/exit.c
>> >> +++ b/kernel/exit.c
>> >> @@ -1713,6 +1713,30 @@ COMPAT_SYSCALL_DEFINE5(waitid,
>> >>  }
>> >>  #endif
>> >>  
>> >> +/**
>> >> + * thread_group_exited - check that a thread group has exited
>> >> + * @pid: tgid of thread group to be checked.
>> >> + *
>> >> + * Test if thread group is has exited (all threads are zombies, dead
>> >> + * or completely gone).
>> >> + *
>> >> + * Return: true if the thread group has exited. false otherwise.
>> >> + */
>> >> +bool thread_group_exited(struct pid *pid)
>> >> +{
>> >> +	struct task_struct *task;
>> >> +	bool exited;
>> >> +
>> >> +	rcu_read_lock();
>> >> +	task = pid_task(pid, PIDTYPE_PID);
>> >> +	exited = !task ||
>> >> +		(READ_ONCE(task->exit_state) && thread_group_empty(task));
>> >> +	rcu_read_unlock();
>> >> +
>> >> +	return exited;
>> >> +}
>> >
>> > I'm not sure why you think READ_ONCE was missing.
>> > It's different in wait_consider_task() where READ_ONCE is needed because
>> > of multiple checks. Here it's done once.
>> 
>> In practice it probably has no effect on the generated code.  But
>> READ_ONCE is about telling the compiler not to be clever.  Don't use
>> tearing loads or stores etc.  When all of the other readers are using
>> READ_ONCE I just get nervous if we have a case that doesn't.
>
> That's not true. The only place where READ_ONCE(->exit_state) is used is
> in wait_consider_task() and nowhere else. We had that discussion a while
> ago where I or someone proposed to simply place a READ_ONCE() around all
> accesses to exit_state for the sake of kcsan and we agreed that it's
> unnecessary and not to do this.
> But it obviously doesn't hurt to have it.

There is a larger discussion to be had around the proper handling of
exit_state.

In this particular case because we are accessing exit_state with
only rcu_read_lock protection, because the outcome of the read
is about correctness, and because the compiler has nothing else
telling it not to re-read exit_state, I believe we actually need
the READ_ONCE.

At the same time it would take a pretty special compiler to want to
reaccess that field in thread_group_exited.

I have looked through and I don't find any of the other access of
exit_state where the result is about correctness (so that we care)
and we don't hold tasklist_lock.

But I have removed the necessary wording from the commit comment.

There is a much larger discussion to be had about what to do with
exit_state, because I think I found about half the accesses were
slightly buggy in one form or another.

Eric
Daniel Borkmann July 8, 2020, 12:05 a.m. UTC | #7
On 7/7/20 7:09 PM, Eric W. Biederman wrote:
> Christian Brauner <christian.brauner@ubuntu.com> writes:
>> On Fri, Jul 03, 2020 at 04:37:47PM -0500, Eric W. Biederman wrote:
>>> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
>>>
>>>> On Thu, Jul 02, 2020 at 11:41:37AM -0500, Eric W. Biederman wrote:
>>>>> Create an independent helper thread_group_exited report return true
>>>>> when all threads have passed exit_notify in do_exit.  AKA all of the
>>>>> threads are at least zombies and might be dead or completely gone.
>>>>>
>>>>> Create this helper by taking the logic out of pidfd_poll where
>>>>> it is already tested, and adding a missing READ_ONCE on
>>>>> the read of task->exit_state.
>>>>>
>>>>> I will be changing the user mode driver code to use this same logic
>>>>> to know when a user mode driver needs to be restarted.
>>>>>
>>>>> Place the new helper thread_group_exited in kernel/exit.c and
>>>>> EXPORT it so it can be used by modules.
>>>>>
>>>>> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
>>>>> ---
>>>>>   include/linux/sched/signal.h |  2 ++
>>>>>   kernel/exit.c                | 24 ++++++++++++++++++++++++
>>>>>   kernel/fork.c                |  6 +-----
>>>>>   3 files changed, 27 insertions(+), 5 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
>>>>> index 0ee5e696c5d8..1bad18a1d8ba 100644
>>>>> --- a/include/linux/sched/signal.h
>>>>> +++ b/include/linux/sched/signal.h
>>>>> @@ -674,6 +674,8 @@ static inline int thread_group_empty(struct task_struct *p)
>>>>>   #define delay_group_leader(p) \
>>>>>   		(thread_group_leader(p) && !thread_group_empty(p))
>>>>>   
>>>>> +extern bool thread_group_exited(struct pid *pid);
>>>>> +
>>>>>   extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
>>>>>   							unsigned long *flags);
>>>>>   
>>>>> diff --git a/kernel/exit.c b/kernel/exit.c
>>>>> index d3294b611df1..a7f112feb0f6 100644
>>>>> --- a/kernel/exit.c
>>>>> +++ b/kernel/exit.c
>>>>> @@ -1713,6 +1713,30 @@ COMPAT_SYSCALL_DEFINE5(waitid,
>>>>>   }
>>>>>   #endif
>>>>>   
>>>>> +/**
>>>>> + * thread_group_exited - check that a thread group has exited
>>>>> + * @pid: tgid of thread group to be checked.
>>>>> + *
>>>>> + * Test if thread group is has exited (all threads are zombies, dead
>>>>> + * or completely gone).
>>>>> + *
>>>>> + * Return: true if the thread group has exited. false otherwise.
>>>>> + */
>>>>> +bool thread_group_exited(struct pid *pid)
>>>>> +{
>>>>> +	struct task_struct *task;
>>>>> +	bool exited;
>>>>> +
>>>>> +	rcu_read_lock();
>>>>> +	task = pid_task(pid, PIDTYPE_PID);
>>>>> +	exited = !task ||
>>>>> +		(READ_ONCE(task->exit_state) && thread_group_empty(task));
>>>>> +	rcu_read_unlock();
>>>>> +
>>>>> +	return exited;
>>>>> +}
>>>>
>>>> I'm not sure why you think READ_ONCE was missing.
>>>> It's different in wait_consider_task() where READ_ONCE is needed because
>>>> of multiple checks. Here it's done once.
>>>
>>> In practice it probably has no effect on the generated code.  But
>>> READ_ONCE is about telling the compiler not to be clever.  Don't use
>>> tearing loads or stores etc.  When all of the other readers are using
>>> READ_ONCE I just get nervous if we have a case that doesn't.
>>
>> That's not true. The only place where READ_ONCE(->exit_state) is used is
>> in wait_consider_task() and nowhere else. We had that discussion a while
>> ago where I or someone proposed to simply place a READ_ONCE() around all
>> accesses to exit_state for the sake of kcsan and we agreed that it's
>> unnecessary and not to do this.
>> But it obviously doesn't hurt to have it.
> 
> There is a larger discussion to be had around the proper handling of
> exit_state.
> 
> In this particular case because we are accessing exit_state with
> only rcu_read_lock protection, because the outcome of the read
> is about correctness, and because the compiler has nothing else
> telling it not to re-read exit_state, I believe we actually need
> the READ_ONCE.
> 
> At the same time it would take a pretty special compiler to want to
> reaccess that field in thread_group_exited.
> 
> I have looked through and I don't find any of the other access of
> exit_state where the result is about correctness (so that we care)
> and we don't hold tasklist_lock.
> 
> But I have removed the necessary wording from the commit comment.

Hey Eric, are you planning to push the final version into a topic branch
so it can be pulled into bpf-next as discussed earlier?

Thanks,
Daniel
Eric W. Biederman July 8, 2020, 3:50 a.m. UTC | #8
Daniel Borkmann <daniel@iogearbox.net> writes:

> Hey Eric, are you planning to push the final version into a topic branch
> so it can be pulled into bpf-next as discussed earlier?

Yes.  I just about have it ready.  I am taking one last pass through the
review comments to make certain I have not missed anything before I do.

I am hoping I can get it out tonight. Fingers crossed.

Eric
diff mbox series

Patch

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 0ee5e696c5d8..1bad18a1d8ba 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -674,6 +674,8 @@  static inline int thread_group_empty(struct task_struct *p)
 #define delay_group_leader(p) \
 		(thread_group_leader(p) && !thread_group_empty(p))
 
+extern bool thread_group_exited(struct pid *pid);
+
 extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
 							unsigned long *flags);
 
diff --git a/kernel/exit.c b/kernel/exit.c
index d3294b611df1..a7f112feb0f6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1713,6 +1713,30 @@  COMPAT_SYSCALL_DEFINE5(waitid,
 }
 #endif
 
+/**
+ * thread_group_exited - check that a thread group has exited
+ * @pid: tgid of thread group to be checked.
+ *
+ * Test if thread group is has exited (all threads are zombies, dead
+ * or completely gone).
+ *
+ * Return: true if the thread group has exited. false otherwise.
+ */
+bool thread_group_exited(struct pid *pid)
+{
+	struct task_struct *task;
+	bool exited;
+
+	rcu_read_lock();
+	task = pid_task(pid, PIDTYPE_PID);
+	exited = !task ||
+		(READ_ONCE(task->exit_state) && thread_group_empty(task));
+	rcu_read_unlock();
+
+	return exited;
+}
+EXPORT_SYMBOL(thread_group_exited);
+
 __weak void abort(void)
 {
 	BUG();
diff --git a/kernel/fork.c b/kernel/fork.c
index 142b23645d82..bf215af7a904 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1787,22 +1787,18 @@  static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
  */
 static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
 {
-	struct task_struct *task;
 	struct pid *pid = file->private_data;
 	__poll_t poll_flags = 0;
 
 	poll_wait(file, &pid->wait_pidfd, pts);
 
-	rcu_read_lock();
-	task = pid_task(pid, PIDTYPE_PID);
 	/*
 	 * Inform pollers only when the whole thread group exits.
 	 * If the thread group leader exits before all other threads in the
 	 * group, then poll(2) should block, similar to the wait(2) family.
 	 */
-	if (!task || (task->exit_state && thread_group_empty(task)))
+	if (thread_group_exited(pid))
 		poll_flags = EPOLLIN | EPOLLRDNORM;
-	rcu_read_unlock();
 
 	return poll_flags;
 }