diff mbox series

[2/2] exec: Add a exec_update_mutex to replace cred_guard_mutex

Message ID 87imjicxjw.fsf_-_@x220.int.ebiederm.org (mailing list archive)
State New, archived
Headers show
Series Infrastructure to allow fixing exec deadlocks | expand

Commit Message

Eric W. Biederman March 5, 2020, 9:16 p.m. UTC
The cred_guard_mutex is problematic.  The cred_guard_mutex is held
over the userspace accesses as the arguments from userspace are read.
The cred_guard_mutex is held of PTRACE_EVENT_EXIT as the the other
threads are killed.  The cred_guard_mutex is held over
"put_user(0, tsk->clear_child_tid)" in exit_mm().

Any of those can result in deadlock, as the cred_guard_mutex is held
over a possible indefinite userspace waits for userspace.

Add exec_update_mutex that is only held over exec updating process
with the new contents of exec, so that code that needs not to be
confused by exec changing the mm and the cred in ways that can not
happen during ordinary execution of a process can take.

The plan is to switch the users of cred_guard_mutex to
exed_udpate_mutex one by one.  This lets us move forward while still
being careful and not introducing any regressions.

Link: https://lore.kernel.org/lkml/20160921152946.GA24210@dhcp22.suse.cz/
Link: https://lore.kernel.org/lkml/AM6PR03MB5170B06F3A2B75EFB98D071AE4E60@AM6PR03MB5170.eurprd03.prod.outlook.com/
Link: https://lore.kernel.org/linux-fsdevel/20161102181806.GB1112@redhat.com/
Link: https://lore.kernel.org/lkml/20160923095031.GA14923@redhat.com/
Link: https://lore.kernel.org/lkml/20170213141452.GA30203@redhat.com/
Ref: 45c1a159b85b ("Add PTRACE_O_TRACEVFORKDONE and PTRACE_O_TRACEEXIT facilities.")
Ref: 456f17cd1a28 ("[PATCH] user-vm-unlock-2.5.31-A2")
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/exec.c                    | 4 ++++
 include/linux/sched/signal.h | 9 ++++++++-
 kernel/fork.c                | 1 +
 3 files changed, 13 insertions(+), 1 deletion(-)

Comments

Bernd Edlinger March 5, 2020, 9:51 p.m. UTC | #1
On 3/5/20 10:16 PM, Eric W. Biederman wrote:
> 
> The cred_guard_mutex is problematic.  The cred_guard_mutex is held
> over the userspace accesses as the arguments from userspace are read.
> The cred_guard_mutex is held of PTRACE_EVENT_EXIT as the the other
> threads are killed.  The cred_guard_mutex is held over
> "put_user(0, tsk->clear_child_tid)" in exit_mm().
> 
> Any of those can result in deadlock, as the cred_guard_mutex is held
> over a possible indefinite userspace waits for userspace.
> 
> Add exec_update_mutex that is only held over exec updating process
> with the new contents of exec, so that code that needs not to be
> confused by exec changing the mm and the cred in ways that can not
> happen during ordinary execution of a process can take.
> 
> The plan is to switch the users of cred_guard_mutex to
> exed_udpate_mutex one by one.  This lets us move forward while still
> being careful and not introducing any regressions.
> 
> Link: https://lore.kernel.org/lkml/20160921152946.GA24210@dhcp22.suse.cz/
> Link: https://lore.kernel.org/lkml/AM6PR03MB5170B06F3A2B75EFB98D071AE4E60@AM6PR03MB5170.eurprd03.prod.outlook.com/
> Link: https://lore.kernel.org/linux-fsdevel/20161102181806.GB1112@redhat.com/
> Link: https://lore.kernel.org/lkml/20160923095031.GA14923@redhat.com/
> Link: https://lore.kernel.org/lkml/20170213141452.GA30203@redhat.com/
> Ref: 45c1a159b85b ("Add PTRACE_O_TRACEVFORKDONE and PTRACE_O_TRACEEXIT facilities.")
> Ref: 456f17cd1a28 ("[PATCH] user-vm-unlock-2.5.31-A2")
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> ---
>  fs/exec.c                    | 4 ++++
>  include/linux/sched/signal.h | 9 ++++++++-
>  kernel/fork.c                | 1 +
>  3 files changed, 13 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/exec.c b/fs/exec.c
> index c243f9660d46..ad7b518f906d 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1182,6 +1182,7 @@ static int de_thread(struct linux_binprm *bprm, struct task_struct *tsk)
>  		release_task(leader);
>  	}
>  
> +	mutex_lock(&current->signal->exec_update_mutex);
>  	bprm->unrecoverable = true;
>  	sig->group_exit_task = NULL;
>  	sig->notify_count = 0;
> @@ -1425,6 +1426,8 @@ static void free_bprm(struct linux_binprm *bprm)
>  {
>  	free_arg_pages(bprm);
>  	if (bprm->cred) {
> +		if (bprm->unrecoverable)
> +			mutex_unlock(&current->signal->exec_update_mutex);
>  		mutex_unlock(&current->signal->cred_guard_mutex);
>  		abort_creds(bprm->cred);
>  	}
> @@ -1474,6 +1477,7 @@ void install_exec_creds(struct linux_binprm *bprm)
>  	 * credentials; any time after this it may be unlocked.
>  	 */
>  	security_bprm_committed_creds(bprm);
> +	mutex_unlock(&current->signal->exec_update_mutex);
>  	mutex_unlock(&current->signal->cred_guard_mutex);
>  }
>  EXPORT_SYMBOL(install_exec_creds);
> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> index 88050259c466..a29df79540ce 100644
> --- a/include/linux/sched/signal.h
> +++ b/include/linux/sched/signal.h
> @@ -224,7 +224,14 @@ struct signal_struct {
>  
>  	struct mutex cred_guard_mutex;	/* guard against foreign influences on
>  					 * credential calculations
> -					 * (notably. ptrace) */
> +					 * (notably. ptrace)
> +					 * Deprecated do not use in new code.
> +					 * Use exec_update_mutex instead.
> +					 */
> +	struct mutex exec_update_mutex;	/* Held while task_struct is being
> +					 * updated during exec, and may have
> +					 * inconsistent permissions.
> +					 */
>  } __randomize_layout;
>  
>  /*
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 60a1295f4384..12896a6ecee6 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1594,6 +1594,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
>  	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
>  
>  	mutex_init(&sig->cred_guard_mutex);
> +	mutex_init(&sig->exec_update_mutex);
>  
>  	return 0;
>  }
> 
Don't you need to add something like this to init/init_task.c ?

.exec_update_mutex = __MUTEX_INITIALIZER(init_signals.exec_update_mutex),


Bernd.
Eric W. Biederman March 6, 2020, 5:17 a.m. UTC | #2
Bernd Edlinger <bernd.edlinger@hotmail.de> writes:

> On 3/5/20 10:16 PM, Eric W. Biederman wrote:
>> 
>> The cred_guard_mutex is problematic.  The cred_guard_mutex is held
>> over the userspace accesses as the arguments from userspace are read.
>> The cred_guard_mutex is held of PTRACE_EVENT_EXIT as the the other
>> threads are killed.  The cred_guard_mutex is held over
>> "put_user(0, tsk->clear_child_tid)" in exit_mm().
>> 
>> Any of those can result in deadlock, as the cred_guard_mutex is held
>> over a possible indefinite userspace waits for userspace.
>> 
>> Add exec_update_mutex that is only held over exec updating process
>> with the new contents of exec, so that code that needs not to be
>> confused by exec changing the mm and the cred in ways that can not
>> happen during ordinary execution of a process can take.
>> 
>> The plan is to switch the users of cred_guard_mutex to
>> exed_udpate_mutex one by one.  This lets us move forward while still
>> being careful and not introducing any regressions.
>> 
>> Link: https://lore.kernel.org/lkml/20160921152946.GA24210@dhcp22.suse.cz/
>> Link: https://lore.kernel.org/lkml/AM6PR03MB5170B06F3A2B75EFB98D071AE4E60@AM6PR03MB5170.eurprd03.prod.outlook.com/
>> Link: https://lore.kernel.org/linux-fsdevel/20161102181806.GB1112@redhat.com/
>> Link: https://lore.kernel.org/lkml/20160923095031.GA14923@redhat.com/
>> Link: https://lore.kernel.org/lkml/20170213141452.GA30203@redhat.com/
>> Ref: 45c1a159b85b ("Add PTRACE_O_TRACEVFORKDONE and PTRACE_O_TRACEEXIT facilities.")
>> Ref: 456f17cd1a28 ("[PATCH] user-vm-unlock-2.5.31-A2")
>> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
>> ---
>>  fs/exec.c                    | 4 ++++
>>  include/linux/sched/signal.h | 9 ++++++++-
>>  kernel/fork.c                | 1 +
>>  3 files changed, 13 insertions(+), 1 deletion(-)
>> 
>> diff --git a/fs/exec.c b/fs/exec.c
>> index c243f9660d46..ad7b518f906d 100644
>> --- a/fs/exec.c
>> +++ b/fs/exec.c
>> @@ -1182,6 +1182,7 @@ static int de_thread(struct linux_binprm *bprm, struct task_struct *tsk)
>>  		release_task(leader);
>>  	}
>>  
>> +	mutex_lock(&current->signal->exec_update_mutex);
>>  	bprm->unrecoverable = true;
>>  	sig->group_exit_task = NULL;
>>  	sig->notify_count = 0;
>> @@ -1425,6 +1426,8 @@ static void free_bprm(struct linux_binprm *bprm)
>>  {
>>  	free_arg_pages(bprm);
>>  	if (bprm->cred) {
>> +		if (bprm->unrecoverable)
>> +			mutex_unlock(&current->signal->exec_update_mutex);
>>  		mutex_unlock(&current->signal->cred_guard_mutex);
>>  		abort_creds(bprm->cred);
>>  	}
>> @@ -1474,6 +1477,7 @@ void install_exec_creds(struct linux_binprm *bprm)
>>  	 * credentials; any time after this it may be unlocked.
>>  	 */
>>  	security_bprm_committed_creds(bprm);
>> +	mutex_unlock(&current->signal->exec_update_mutex);
>>  	mutex_unlock(&current->signal->cred_guard_mutex);
>>  }
>>  EXPORT_SYMBOL(install_exec_creds);
>> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
>> index 88050259c466..a29df79540ce 100644
>> --- a/include/linux/sched/signal.h
>> +++ b/include/linux/sched/signal.h
>> @@ -224,7 +224,14 @@ struct signal_struct {
>>  
>>  	struct mutex cred_guard_mutex;	/* guard against foreign influences on
>>  					 * credential calculations
>> -					 * (notably. ptrace) */
>> +					 * (notably. ptrace)
>> +					 * Deprecated do not use in new code.
>> +					 * Use exec_update_mutex instead.
>> +					 */
>> +	struct mutex exec_update_mutex;	/* Held while task_struct is being
>> +					 * updated during exec, and may have
>> +					 * inconsistent permissions.
>> +					 */
>>  } __randomize_layout;
>>  
>>  /*
>> diff --git a/kernel/fork.c b/kernel/fork.c
>> index 60a1295f4384..12896a6ecee6 100644
>> --- a/kernel/fork.c
>> +++ b/kernel/fork.c
>> @@ -1594,6 +1594,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
>>  	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
>>  
>>  	mutex_init(&sig->cred_guard_mutex);
>> +	mutex_init(&sig->exec_update_mutex);
>>  
>>  	return 0;
>>  }
>> 
> Don't you need to add something like this to init/init_task.c ?
>
> .exec_update_mutex = __MUTEX_INITIALIZER(init_signals.exec_update_mutex),

Yes.  I overlooked that.  Thank you.

Eric
Bernd Edlinger March 6, 2020, 11:46 a.m. UTC | #3
Am 06.03.20 um 06:17 schrieb Eric W. Biederman:
> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
> 
>> On 3/5/20 10:16 PM, Eric W. Biederman wrote:
>>>
>>> The cred_guard_mutex is problematic.  The cred_guard_mutex is held
>>> over the userspace accesses as the arguments from userspace are read.
>>> The cred_guard_mutex is held of PTRACE_EVENT_EXIT as the the other
>>> threads are killed.  The cred_guard_mutex is held over
>>> "put_user(0, tsk->clear_child_tid)" in exit_mm().
>>>
>>> Any of those can result in deadlock, as the cred_guard_mutex is held
>>> over a possible indefinite userspace waits for userspace.
>>>
>>> Add exec_update_mutex that is only held over exec updating process
>>> with the new contents of exec, so that code that needs not to be
>>> confused by exec changing the mm and the cred in ways that can not
>>> happen during ordinary execution of a process can take.
>>>
>>> The plan is to switch the users of cred_guard_mutex to
>>> exed_udpate_mutex one by one.  This lets us move forward while still
>>> being careful and not introducing any regressions.
>>>
>>> Link: https://lore.kernel.org/lkml/20160921152946.GA24210@dhcp22.suse.cz/
>>> Link: https://lore.kernel.org/lkml/AM6PR03MB5170B06F3A2B75EFB98D071AE4E60@AM6PR03MB5170.eurprd03.prod.outlook.com/
>>> Link: https://lore.kernel.org/linux-fsdevel/20161102181806.GB1112@redhat.com/
>>> Link: https://lore.kernel.org/lkml/20160923095031.GA14923@redhat.com/
>>> Link: https://lore.kernel.org/lkml/20170213141452.GA30203@redhat.com/
>>> Ref: 45c1a159b85b ("Add PTRACE_O_TRACEVFORKDONE and PTRACE_O_TRACEEXIT facilities.")
>>> Ref: 456f17cd1a28 ("[PATCH] user-vm-unlock-2.5.31-A2")
>>> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
>>> ---
>>>  fs/exec.c                    | 4 ++++
>>>  include/linux/sched/signal.h | 9 ++++++++-
>>>  kernel/fork.c                | 1 +
>>>  3 files changed, 13 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/fs/exec.c b/fs/exec.c
>>> index c243f9660d46..ad7b518f906d 100644
>>> --- a/fs/exec.c
>>> +++ b/fs/exec.c
>>> @@ -1182,6 +1182,7 @@ static int de_thread(struct linux_binprm *bprm, struct task_struct *tsk)
>>>  		release_task(leader);
>>>  	}
>>>  
>>> +	mutex_lock(&current->signal->exec_update_mutex);

And by the way, could you make this mutex_lock_killable?


Bernd.
Bernd Edlinger March 6, 2020, 7:16 p.m. UTC | #4
On 3/6/20 6:17 AM, Eric W. Biederman wrote:
> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
> 
>> On 3/5/20 10:16 PM, Eric W. Biederman wrote:
>>>
>>> The cred_guard_mutex is problematic.  The cred_guard_mutex is held
>>> over the userspace accesses as the arguments from userspace are read.
>>> The cred_guard_mutex is held of PTRACE_EVENT_EXIT as the the other
>>> threads are killed.  The cred_guard_mutex is held over
>>> "put_user(0, tsk->clear_child_tid)" in exit_mm().
>>>

I am all for this patch, and the direction it is heading, Eric.

I just wanted to add a note that I think it is
possible that exec_mm_release can also invoke put_user(0, tsk->clear_child_tid),
under the new exec_update_mutex, since vm_access increments the
mm->mm_users, under the cred_update_mutex, but releases the mutex,
and the caller can hold the reference for a while and then exec_mmap is not
releasing the last reference.


Bernd.
Eric W. Biederman March 6, 2020, 9:18 p.m. UTC | #5
Bernd Edlinger <bernd.edlinger@hotmail.de> writes:

> Am 06.03.20 um 06:17 schrieb Eric W. Biederman:
>> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
>> 
>>> On 3/5/20 10:16 PM, Eric W. Biederman wrote:
>>>>
>>>> The cred_guard_mutex is problematic.  The cred_guard_mutex is held
>>>> over the userspace accesses as the arguments from userspace are read.
>>>> The cred_guard_mutex is held of PTRACE_EVENT_EXIT as the the other
>>>> threads are killed.  The cred_guard_mutex is held over
>>>> "put_user(0, tsk->clear_child_tid)" in exit_mm().
>>>>
>>>> Any of those can result in deadlock, as the cred_guard_mutex is held
>>>> over a possible indefinite userspace waits for userspace.
>>>>
>>>> Add exec_update_mutex that is only held over exec updating process
>>>> with the new contents of exec, so that code that needs not to be
>>>> confused by exec changing the mm and the cred in ways that can not
>>>> happen during ordinary execution of a process can take.
>>>>
>>>> The plan is to switch the users of cred_guard_mutex to
>>>> exed_udpate_mutex one by one.  This lets us move forward while still
>>>> being careful and not introducing any regressions.
>>>>
>>>> Link: https://lore.kernel.org/lkml/20160921152946.GA24210@dhcp22.suse.cz/
>>>> Link: https://lore.kernel.org/lkml/AM6PR03MB5170B06F3A2B75EFB98D071AE4E60@AM6PR03MB5170.eurprd03.prod.outlook.com/
>>>> Link: https://lore.kernel.org/linux-fsdevel/20161102181806.GB1112@redhat.com/
>>>> Link: https://lore.kernel.org/lkml/20160923095031.GA14923@redhat.com/
>>>> Link: https://lore.kernel.org/lkml/20170213141452.GA30203@redhat.com/
>>>> Ref: 45c1a159b85b ("Add PTRACE_O_TRACEVFORKDONE and PTRACE_O_TRACEEXIT facilities.")
>>>> Ref: 456f17cd1a28 ("[PATCH] user-vm-unlock-2.5.31-A2")
>>>> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
>>>> ---
>>>>  fs/exec.c                    | 4 ++++
>>>>  include/linux/sched/signal.h | 9 ++++++++-
>>>>  kernel/fork.c                | 1 +
>>>>  3 files changed, 13 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/fs/exec.c b/fs/exec.c
>>>> index c243f9660d46..ad7b518f906d 100644
>>>> --- a/fs/exec.c
>>>> +++ b/fs/exec.c
>>>> @@ -1182,6 +1182,7 @@ static int de_thread(struct linux_binprm *bprm, struct task_struct *tsk)
>>>>  		release_task(leader);
>>>>  	}
>>>>  
>>>> +	mutex_lock(&current->signal->exec_update_mutex);
>
> And by the way, could you make this mutex_lock_killable?

For some reason when I first read this suggestion I thought making this
mutex_lock_killable would cause me to rework the logic of when I
set unrecoverable and when I unlock the mutex.  I blame a tired brain.
If a process has received a fatal signal none of that matters.

So yes I will do that just to make things robust in case we miss
something that would still make it possible to deadlock in with the new
mutex.

I am a little worried that the new mutex might still cover a little too
much.  But past a certain point I we are not being able to make this
code perfect in the first change.  The best we can do is to be careful
and avoid regressions.  Whatever slips through we can fix when we spot
the problem.

Eric
Eric W. Biederman March 6, 2020, 9:58 p.m. UTC | #6
Bernd Edlinger <bernd.edlinger@hotmail.de> writes:

> On 3/6/20 6:17 AM, Eric W. Biederman wrote:
>> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
>> 
>>> On 3/5/20 10:16 PM, Eric W. Biederman wrote:
>>>>
>>>> The cred_guard_mutex is problematic.  The cred_guard_mutex is held
>>>> over the userspace accesses as the arguments from userspace are read.
>>>> The cred_guard_mutex is held of PTRACE_EVENT_EXIT as the the other
>>>> threads are killed.  The cred_guard_mutex is held over
>>>> "put_user(0, tsk->clear_child_tid)" in exit_mm().
>>>>
>
> I am all for this patch, and the direction it is heading, Eric.
>
> I just wanted to add a note that I think it is
> possible that exec_mm_release can also invoke put_user(0, tsk->clear_child_tid),
> under the new exec_update_mutex, since vm_access increments the
> mm->mm_users, under the cred_update_mutex, but releases the mutex,
> and the caller can hold the reference for a while and then exec_mmap is not
> releasing the last reference.

Good catch.  I really appreciate your close look at the details.

I am wondering if process_vm_readv and process_vm_writev could be
safely changed to use mmgrab and mmdrop, instead of mmget and mmput.

That would resolve the potential issue you have pointed out.  I just
haven't figured out if it is safe.  The mm code has been seriously
refactored since I knew how it all worked.

Eric
Eric W. Biederman March 6, 2020, 10:29 p.m. UTC | #7
ebiederm@xmission.com (Eric W. Biederman) writes:

> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
>
>> On 3/6/20 6:17 AM, Eric W. Biederman wrote:
>>> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
>>> 
>>>> On 3/5/20 10:16 PM, Eric W. Biederman wrote:
>>>>>
>>>>> The cred_guard_mutex is problematic.  The cred_guard_mutex is held
>>>>> over the userspace accesses as the arguments from userspace are read.
>>>>> The cred_guard_mutex is held of PTRACE_EVENT_EXIT as the the other
>>>>> threads are killed.  The cred_guard_mutex is held over
>>>>> "put_user(0, tsk->clear_child_tid)" in exit_mm().
>>>>>
>>
>> I am all for this patch, and the direction it is heading, Eric.
>>
>> I just wanted to add a note that I think it is
>> possible that exec_mm_release can also invoke put_user(0, tsk->clear_child_tid),
>> under the new exec_update_mutex, since vm_access increments the
>> mm->mm_users, under the cred_update_mutex, but releases the mutex,
>> and the caller can hold the reference for a while and then exec_mmap is not
>> releasing the last reference.
>
> Good catch.  I really appreciate your close look at the details.
>
> I am wondering if process_vm_readv and process_vm_writev could be
> safely changed to use mmgrab and mmdrop, instead of mmget and mmput.
>
> That would resolve the potential issue you have pointed out.  I just
> haven't figured out if it is safe.  The mm code has been seriously
> refactored since I knew how it all worked.

Nope, mmget can not be replaced by mmgrab.

It might be possible to do something creative like store a cred in place
of the userns on the mm and use that for mm_access permission checks.
Still we are talking a pretty narrow window, and a case that no one has
figured out how to trigger yet.  So I will leave that corner case as
something for future improvements.

Eric
Eric W. Biederman March 7, 2020, 1:03 a.m. UTC | #8
ebiederm@xmission.com (Eric W. Biederman) writes:

> ebiederm@xmission.com (Eric W. Biederman) writes:
>
>> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
>>
>>> On 3/6/20 6:17 AM, Eric W. Biederman wrote:
>>>> Bernd Edlinger <bernd.edlinger@hotmail.de> writes:
>>>> 
>>>>> On 3/5/20 10:16 PM, Eric W. Biederman wrote:
>>>>>>
>>>>>> The cred_guard_mutex is problematic.  The cred_guard_mutex is held
>>>>>> over the userspace accesses as the arguments from userspace are read.
>>>>>> The cred_guard_mutex is held of PTRACE_EVENT_EXIT as the the other
>>>>>> threads are killed.  The cred_guard_mutex is held over
>>>>>> "put_user(0, tsk->clear_child_tid)" in exit_mm().
>>>>>>
>>>
>>> I am all for this patch, and the direction it is heading, Eric.
>>>
>>> I just wanted to add a note that I think it is
>>> possible that exec_mm_release can also invoke put_user(0, tsk->clear_child_tid),
>>> under the new exec_update_mutex, since vm_access increments the
>>> mm->mm_users, under the cred_update_mutex, but releases the mutex,
>>> and the caller can hold the reference for a while and then exec_mmap is not
>>> releasing the last reference.
>>
>> Good catch.  I really appreciate your close look at the details.
>>
>> I am wondering if process_vm_readv and process_vm_writev could be
>> safely changed to use mmgrab and mmdrop, instead of mmget and mmput.
>>
>> That would resolve the potential issue you have pointed out.  I just
>> haven't figured out if it is safe.  The mm code has been seriously
>> refactored since I knew how it all worked.
>
> Nope, mmget can not be replaced by mmgrab.
>
> It might be possible to do something creative like store a cred in place
> of the userns on the mm and use that for mm_access permission checks.
> Still we are talking a pretty narrow window, and a case that no one has
> figured out how to trigger yet.  So I will leave that corner case as
> something for future improvements.

My brain is restless and keep looking at it.

The worst case is processes created with CLONE_VM|CLONE_CHILD_CLEARTID
but not CLONE_THREAD.  For those that put_user will occur ever time
in exec_mmap.

The only solution that I can see is to move taking the new mutex after
exec_mm_release.  Which may be feasible given how close exec_mmap
follows de_thread.

I am going to sleep on that and perhaps I will be able to see how to
move taking the mutex lower.

It would be very nice not to have a known issue going into this set of
changes.

Eric
diff mbox series

Patch

diff --git a/fs/exec.c b/fs/exec.c
index c243f9660d46..ad7b518f906d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1182,6 +1182,7 @@  static int de_thread(struct linux_binprm *bprm, struct task_struct *tsk)
 		release_task(leader);
 	}
 
+	mutex_lock(&current->signal->exec_update_mutex);
 	bprm->unrecoverable = true;
 	sig->group_exit_task = NULL;
 	sig->notify_count = 0;
@@ -1425,6 +1426,8 @@  static void free_bprm(struct linux_binprm *bprm)
 {
 	free_arg_pages(bprm);
 	if (bprm->cred) {
+		if (bprm->unrecoverable)
+			mutex_unlock(&current->signal->exec_update_mutex);
 		mutex_unlock(&current->signal->cred_guard_mutex);
 		abort_creds(bprm->cred);
 	}
@@ -1474,6 +1477,7 @@  void install_exec_creds(struct linux_binprm *bprm)
 	 * credentials; any time after this it may be unlocked.
 	 */
 	security_bprm_committed_creds(bprm);
+	mutex_unlock(&current->signal->exec_update_mutex);
 	mutex_unlock(&current->signal->cred_guard_mutex);
 }
 EXPORT_SYMBOL(install_exec_creds);
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 88050259c466..a29df79540ce 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -224,7 +224,14 @@  struct signal_struct {
 
 	struct mutex cred_guard_mutex;	/* guard against foreign influences on
 					 * credential calculations
-					 * (notably. ptrace) */
+					 * (notably. ptrace)
+					 * Deprecated do not use in new code.
+					 * Use exec_update_mutex instead.
+					 */
+	struct mutex exec_update_mutex;	/* Held while task_struct is being
+					 * updated during exec, and may have
+					 * inconsistent permissions.
+					 */
 } __randomize_layout;
 
 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index 60a1295f4384..12896a6ecee6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1594,6 +1594,7 @@  static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
 
 	mutex_init(&sig->cred_guard_mutex);
+	mutex_init(&sig->exec_update_mutex);
 
 	return 0;
 }