diff mbox series

[v3,5/5] exec: Add a exec_update_mutex to replace cred_guard_mutex

Message ID AM6PR03MB5170353DF3575FF7742BB155E4FB0@AM6PR03MB5170.eurprd03.prod.outlook.com (mailing list archive)
State New, archived
Headers show
Series [1/4] exec: Fix a deadlock in ptrace | expand

Commit Message

Bernd Edlinger March 14, 2020, 9:11 a.m. UTC
The cred_guard_mutex is problematic.  The cred_guard_mutex is held
over the userspace accesses as the arguments from userspace are read.
The cred_guard_mutex is held of PTRACE_EVENT_EXIT as the the other
threads are killed.  The cred_guard_mutex is held over
"put_user(0, tsk->clear_child_tid)" in exit_mm().

Any of those can result in deadlock, as the cred_guard_mutex is held
over a possible indefinite userspace waits for userspace.

Add exec_update_mutex that is only held over exec updating process
with the new contents of exec, so that code that needs not to be
confused by exec changing the mm and the cred in ways that can not
happen during ordinary execution of a process.

The plan is to switch the users of cred_guard_mutex to
exec_udpate_mutex one by one.  This lets us move forward while still
being careful and not introducing any regressions.

Link: https://lore.kernel.org/lkml/20160921152946.GA24210@dhcp22.suse.cz/
Link: https://lore.kernel.org/lkml/AM6PR03MB5170B06F3A2B75EFB98D071AE4E60@AM6PR03MB5170.eurprd03.prod.outlook.com/
Link: https://lore.kernel.org/linux-fsdevel/20161102181806.GB1112@redhat.com/
Link: https://lore.kernel.org/lkml/20160923095031.GA14923@redhat.com/
Link: https://lore.kernel.org/lkml/20170213141452.GA30203@redhat.com/
Ref: 45c1a159b85b ("Add PTRACE_O_TRACEVFORKDONE and PTRACE_O_TRACEEXIT facilities.")
Ref: 456f17cd1a28 ("[PATCH] user-vm-unlock-2.5.31-A2")
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
---
 fs/exec.c                    | 17 ++++++++++++++---
 include/linux/binfmts.h      |  8 +++++++-
 include/linux/sched/signal.h |  9 ++++++++-
 init/init_task.c             |  1 +
 kernel/fork.c                |  1 +
 5 files changed, 31 insertions(+), 5 deletions(-)

v3: this update fixes lock-order and adds an explicit data member in linux_binprm

Comments

Kirill Tkhai March 17, 2020, 8:56 a.m. UTC | #1
On 14.03.2020 12:11, Bernd Edlinger wrote:
> The cred_guard_mutex is problematic.  The cred_guard_mutex is held
> over the userspace accesses as the arguments from userspace are read.
> The cred_guard_mutex is held of PTRACE_EVENT_EXIT as the the other
> threads are killed.  The cred_guard_mutex is held over
> "put_user(0, tsk->clear_child_tid)" in exit_mm().
> 
> Any of those can result in deadlock, as the cred_guard_mutex is held
> over a possible indefinite userspace waits for userspace.
> 
> Add exec_update_mutex that is only held over exec updating process
> with the new contents of exec, so that code that needs not to be
> confused by exec changing the mm and the cred in ways that can not
> happen during ordinary execution of a process.
> 
> The plan is to switch the users of cred_guard_mutex to
> exec_udpate_mutex one by one.  This lets us move forward while still
> being careful and not introducing any regressions.
> 
> Link: https://lore.kernel.org/lkml/20160921152946.GA24210@dhcp22.suse.cz/
> Link: https://lore.kernel.org/lkml/AM6PR03MB5170B06F3A2B75EFB98D071AE4E60@AM6PR03MB5170.eurprd03.prod.outlook.com/
> Link: https://lore.kernel.org/linux-fsdevel/20161102181806.GB1112@redhat.com/
> Link: https://lore.kernel.org/lkml/20160923095031.GA14923@redhat.com/
> Link: https://lore.kernel.org/lkml/20170213141452.GA30203@redhat.com/
> Ref: 45c1a159b85b ("Add PTRACE_O_TRACEVFORKDONE and PTRACE_O_TRACEEXIT facilities.")
> Ref: 456f17cd1a28 ("[PATCH] user-vm-unlock-2.5.31-A2")
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
> ---
>  fs/exec.c                    | 17 ++++++++++++++---
>  include/linux/binfmts.h      |  8 +++++++-
>  include/linux/sched/signal.h |  9 ++++++++-
>  init/init_task.c             |  1 +
>  kernel/fork.c                |  1 +
>  5 files changed, 31 insertions(+), 5 deletions(-)
> 
> v3: this update fixes lock-order and adds an explicit data member in linux_binprm
> 
> diff --git a/fs/exec.c b/fs/exec.c
> index d820a72..11974a1 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1014,12 +1014,17 @@ static int exec_mmap(struct mm_struct *mm)
>  {
>  	struct task_struct *tsk;
>  	struct mm_struct *old_mm, *active_mm;
> +	int ret;
>  
>  	/* Notify parent that we're no longer interested in the old VM */
>  	tsk = current;
>  	old_mm = current->mm;
>  	exec_mm_release(tsk, old_mm);
>  
> +	ret = mutex_lock_killable(&tsk->signal->exec_update_mutex);
> +	if (ret)
> +		return ret;
> +
>  	if (old_mm) {
>  		sync_mm_rss(old_mm);
>  		/*
> @@ -1031,9 +1036,11 @@ static int exec_mmap(struct mm_struct *mm)
>  		down_read(&old_mm->mmap_sem);
>  		if (unlikely(old_mm->core_state)) {
>  			up_read(&old_mm->mmap_sem);
> +			mutex_unlock(&tsk->signal->exec_update_mutex);
>  			return -EINTR;
>  		}
>  	}
> +
>  	task_lock(tsk);
>  	active_mm = tsk->active_mm;
>  	membarrier_exec_mmap(mm);
> @@ -1288,11 +1295,12 @@ int flush_old_exec(struct linux_binprm * bprm)
>  		goto out;
>  
>  	/*
> -	 * After clearing bprm->mm (to mark that current is using the
> -	 * prepared mm now), we have nothing left of the original
> +	 * After setting bprm->called_exec_mmap (to mark that current is
> +	 * using the prepared mm now), we have nothing left of the original
>  	 * process. If anything from here on returns an error, the check
>  	 * in search_binary_handler() will SEGV current.
>  	 */
> +	bprm->called_exec_mmap = 1;

The two below is non-breaking pair:

exec_mmap(bprm->mm);
bprm->called_exec_mmap = 1;

Why not move this into exec_mmap(), so nobody definitely inserts something
between them?

>  	bprm->mm = NULL;
>  
>  #ifdef CONFIG_POSIX_TIMERS
> @@ -1438,6 +1446,8 @@ static void free_bprm(struct linux_binprm *bprm)
>  {
>  	free_arg_pages(bprm);
>  	if (bprm->cred) {
> +		if (bprm->called_exec_mmap)
> +			mutex_unlock(&current->signal->exec_update_mutex);
>  		mutex_unlock(&current->signal->cred_guard_mutex);
>  		abort_creds(bprm->cred);
>  	}
> @@ -1487,6 +1497,7 @@ void install_exec_creds(struct linux_binprm *bprm)
>  	 * credentials; any time after this it may be unlocked.
>  	 */
>  	security_bprm_committed_creds(bprm);
> +	mutex_unlock(&current->signal->exec_update_mutex);
>  	mutex_unlock(&current->signal->cred_guard_mutex);
>  }
>  EXPORT_SYMBOL(install_exec_creds);
> @@ -1678,7 +1689,7 @@ int search_binary_handler(struct linux_binprm *bprm)
>  
>  		read_lock(&binfmt_lock);
>  		put_binfmt(fmt);
> -		if (retval < 0 && !bprm->mm) {
> +		if (retval < 0 && bprm->called_exec_mmap) {
>  			/* we got to flush_old_exec() and failed after it */
>  			read_unlock(&binfmt_lock);
>  			force_sigsegv(SIGSEGV);
> diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
> index b40fc63..a345d9f 100644
> --- a/include/linux/binfmts.h
> +++ b/include/linux/binfmts.h
> @@ -44,7 +44,13 @@ struct linux_binprm {
>  		 * exec has happened. Used to sanitize execution environment
>  		 * and to set AT_SECURE auxv for glibc.
>  		 */
> -		secureexec:1;
> +		secureexec:1,
> +		/*
> +		 * Set by flush_old_exec, when exec_mmap has been called.
> +		 * This is past the point of no return, when the
> +		 * exec_update_mutex has been taken.
> +		 */
> +		called_exec_mmap:1;
>  #ifdef __alpha__
>  	unsigned int taso:1;
>  #endif
> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> index 8805025..a29df79 100644
> --- a/include/linux/sched/signal.h
> +++ b/include/linux/sched/signal.h
> @@ -224,7 +224,14 @@ struct signal_struct {
>  
>  	struct mutex cred_guard_mutex;	/* guard against foreign influences on
>  					 * credential calculations
> -					 * (notably. ptrace) */
> +					 * (notably. ptrace)
> +					 * Deprecated do not use in new code.
> +					 * Use exec_update_mutex instead.
> +					 */
> +	struct mutex exec_update_mutex;	/* Held while task_struct is being
> +					 * updated during exec, and may have
> +					 * inconsistent permissions.
> +					 */
>  } __randomize_layout;
>  
>  /*
> diff --git a/init/init_task.c b/init/init_task.c
> index 9e5cbe5..bd403ed 100644
> --- a/init/init_task.c
> +++ b/init/init_task.c
> @@ -26,6 +26,7 @@
>  	.multiprocess	= HLIST_HEAD_INIT,
>  	.rlim		= INIT_RLIMITS,
>  	.cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex),
> +	.exec_update_mutex = __MUTEX_INITIALIZER(init_signals.exec_update_mutex),
>  #ifdef CONFIG_POSIX_TIMERS
>  	.posix_timers = LIST_HEAD_INIT(init_signals.posix_timers),
>  	.cputimer	= {
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 8642530..036b692 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1594,6 +1594,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
>  	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
>  
>  	mutex_init(&sig->cred_guard_mutex);
> +	mutex_init(&sig->exec_update_mutex);
>  
>  	return 0;
>  }
>
Bernd Edlinger March 17, 2020, 9:53 p.m. UTC | #2
On 3/17/20 9:56 AM, Kirill Tkhai wrote:
> On 14.03.2020 12:11, Bernd Edlinger wrote:
>> The cred_guard_mutex is problematic.  The cred_guard_mutex is held
>> over the userspace accesses as the arguments from userspace are read.
>> The cred_guard_mutex is held of PTRACE_EVENT_EXIT as the the other
>> threads are killed.  The cred_guard_mutex is held over
>> "put_user(0, tsk->clear_child_tid)" in exit_mm().
>>
>> Any of those can result in deadlock, as the cred_guard_mutex is held
>> over a possible indefinite userspace waits for userspace.
>>
>> Add exec_update_mutex that is only held over exec updating process
>> with the new contents of exec, so that code that needs not to be
>> confused by exec changing the mm and the cred in ways that can not
>> happen during ordinary execution of a process.
>>
>> The plan is to switch the users of cred_guard_mutex to
>> exec_udpate_mutex one by one.  This lets us move forward while still
>> being careful and not introducing any regressions.
>>
>> Link: https://lore.kernel.org/lkml/20160921152946.GA24210@dhcp22.suse.cz/
>> Link: https://lore.kernel.org/lkml/AM6PR03MB5170B06F3A2B75EFB98D071AE4E60@AM6PR03MB5170.eurprd03.prod.outlook.com/
>> Link: https://lore.kernel.org/linux-fsdevel/20161102181806.GB1112@redhat.com/
>> Link: https://lore.kernel.org/lkml/20160923095031.GA14923@redhat.com/
>> Link: https://lore.kernel.org/lkml/20170213141452.GA30203@redhat.com/
>> Ref: 45c1a159b85b ("Add PTRACE_O_TRACEVFORKDONE and PTRACE_O_TRACEEXIT facilities.")
>> Ref: 456f17cd1a28 ("[PATCH] user-vm-unlock-2.5.31-A2")
>> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
>> Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
>> ---
>>  fs/exec.c                    | 17 ++++++++++++++---
>>  include/linux/binfmts.h      |  8 +++++++-
>>  include/linux/sched/signal.h |  9 ++++++++-
>>  init/init_task.c             |  1 +
>>  kernel/fork.c                |  1 +
>>  5 files changed, 31 insertions(+), 5 deletions(-)
>>
>> v3: this update fixes lock-order and adds an explicit data member in linux_binprm
>>
>> diff --git a/fs/exec.c b/fs/exec.c
>> index d820a72..11974a1 100644
>> --- a/fs/exec.c
>> +++ b/fs/exec.c
>> @@ -1014,12 +1014,17 @@ static int exec_mmap(struct mm_struct *mm)
>>  {
>>  	struct task_struct *tsk;
>>  	struct mm_struct *old_mm, *active_mm;
>> +	int ret;
>>  
>>  	/* Notify parent that we're no longer interested in the old VM */
>>  	tsk = current;
>>  	old_mm = current->mm;
>>  	exec_mm_release(tsk, old_mm);
>>  
>> +	ret = mutex_lock_killable(&tsk->signal->exec_update_mutex);
>> +	if (ret)
>> +		return ret;
>> +
>>  	if (old_mm) {
>>  		sync_mm_rss(old_mm);
>>  		/*
>> @@ -1031,9 +1036,11 @@ static int exec_mmap(struct mm_struct *mm)
>>  		down_read(&old_mm->mmap_sem);
>>  		if (unlikely(old_mm->core_state)) {
>>  			up_read(&old_mm->mmap_sem);
>> +			mutex_unlock(&tsk->signal->exec_update_mutex);
>>  			return -EINTR;
>>  		}
>>  	}
>> +
>>  	task_lock(tsk);
>>  	active_mm = tsk->active_mm;
>>  	membarrier_exec_mmap(mm);
>> @@ -1288,11 +1295,12 @@ int flush_old_exec(struct linux_binprm * bprm)
>>  		goto out;
>>  
>>  	/*
>> -	 * After clearing bprm->mm (to mark that current is using the
>> -	 * prepared mm now), we have nothing left of the original
>> +	 * After setting bprm->called_exec_mmap (to mark that current is
>> +	 * using the prepared mm now), we have nothing left of the original
>>  	 * process. If anything from here on returns an error, the check
>>  	 * in search_binary_handler() will SEGV current.
>>  	 */
>> +	bprm->called_exec_mmap = 1;
> 
> The two below is non-breaking pair:
> 
> exec_mmap(bprm->mm);
> bprm->called_exec_mmap = 1;
> 
> Why not move this into exec_mmap(), so nobody definitely inserts something
> between them?
> 

Hmm, could be done, but then I would probably need a different name than
"called_exec_mmap".

How about adding a nice function comment to exec_mmap that calls out the
changed behaviour that the exec_update_mutex is taken unless the function
fails?


Bernd.


>>  	bprm->mm = NULL;
>>  
>>  #ifdef CONFIG_POSIX_TIMERS
>> @@ -1438,6 +1446,8 @@ static void free_bprm(struct linux_binprm *bprm)
>>  {
>>  	free_arg_pages(bprm);
>>  	if (bprm->cred) {
>> +		if (bprm->called_exec_mmap)
>> +			mutex_unlock(&current->signal->exec_update_mutex);
>>  		mutex_unlock(&current->signal->cred_guard_mutex);
>>  		abort_creds(bprm->cred);
>>  	}
>> @@ -1487,6 +1497,7 @@ void install_exec_creds(struct linux_binprm *bprm)
>>  	 * credentials; any time after this it may be unlocked.
>>  	 */
>>  	security_bprm_committed_creds(bprm);
>> +	mutex_unlock(&current->signal->exec_update_mutex);
>>  	mutex_unlock(&current->signal->cred_guard_mutex);
>>  }
>>  EXPORT_SYMBOL(install_exec_creds);
>> @@ -1678,7 +1689,7 @@ int search_binary_handler(struct linux_binprm *bprm)
>>  
>>  		read_lock(&binfmt_lock);
>>  		put_binfmt(fmt);
>> -		if (retval < 0 && !bprm->mm) {
>> +		if (retval < 0 && bprm->called_exec_mmap) {
>>  			/* we got to flush_old_exec() and failed after it */
>>  			read_unlock(&binfmt_lock);
>>  			force_sigsegv(SIGSEGV);
>> diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
>> index b40fc63..a345d9f 100644
>> --- a/include/linux/binfmts.h
>> +++ b/include/linux/binfmts.h
>> @@ -44,7 +44,13 @@ struct linux_binprm {
>>  		 * exec has happened. Used to sanitize execution environment
>>  		 * and to set AT_SECURE auxv for glibc.
>>  		 */
>> -		secureexec:1;
>> +		secureexec:1,
>> +		/*
>> +		 * Set by flush_old_exec, when exec_mmap has been called.
>> +		 * This is past the point of no return, when the
>> +		 * exec_update_mutex has been taken.
>> +		 */
>> +		called_exec_mmap:1;
>>  #ifdef __alpha__
>>  	unsigned int taso:1;
>>  #endif
>> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
>> index 8805025..a29df79 100644
>> --- a/include/linux/sched/signal.h
>> +++ b/include/linux/sched/signal.h
>> @@ -224,7 +224,14 @@ struct signal_struct {
>>  
>>  	struct mutex cred_guard_mutex;	/* guard against foreign influences on
>>  					 * credential calculations
>> -					 * (notably. ptrace) */
>> +					 * (notably. ptrace)
>> +					 * Deprecated do not use in new code.
>> +					 * Use exec_update_mutex instead.
>> +					 */
>> +	struct mutex exec_update_mutex;	/* Held while task_struct is being
>> +					 * updated during exec, and may have
>> +					 * inconsistent permissions.
>> +					 */
>>  } __randomize_layout;
>>  
>>  /*
>> diff --git a/init/init_task.c b/init/init_task.c
>> index 9e5cbe5..bd403ed 100644
>> --- a/init/init_task.c
>> +++ b/init/init_task.c
>> @@ -26,6 +26,7 @@
>>  	.multiprocess	= HLIST_HEAD_INIT,
>>  	.rlim		= INIT_RLIMITS,
>>  	.cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex),
>> +	.exec_update_mutex = __MUTEX_INITIALIZER(init_signals.exec_update_mutex),
>>  #ifdef CONFIG_POSIX_TIMERS
>>  	.posix_timers = LIST_HEAD_INIT(init_signals.posix_timers),
>>  	.cputimer	= {
>> diff --git a/kernel/fork.c b/kernel/fork.c
>> index 8642530..036b692 100644
>> --- a/kernel/fork.c
>> +++ b/kernel/fork.c
>> @@ -1594,6 +1594,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
>>  	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
>>  
>>  	mutex_init(&sig->cred_guard_mutex);
>> +	mutex_init(&sig->exec_update_mutex);
>>  
>>  	return 0;
>>  }
>>
>
Kirill Tkhai March 18, 2020, 12:22 p.m. UTC | #3
On 18.03.2020 00:53, Bernd Edlinger wrote:
> On 3/17/20 9:56 AM, Kirill Tkhai wrote:
>> On 14.03.2020 12:11, Bernd Edlinger wrote:
>>> The cred_guard_mutex is problematic.  The cred_guard_mutex is held
>>> over the userspace accesses as the arguments from userspace are read.
>>> The cred_guard_mutex is held of PTRACE_EVENT_EXIT as the the other
>>> threads are killed.  The cred_guard_mutex is held over
>>> "put_user(0, tsk->clear_child_tid)" in exit_mm().
>>>
>>> Any of those can result in deadlock, as the cred_guard_mutex is held
>>> over a possible indefinite userspace waits for userspace.
>>>
>>> Add exec_update_mutex that is only held over exec updating process
>>> with the new contents of exec, so that code that needs not to be
>>> confused by exec changing the mm and the cred in ways that can not
>>> happen during ordinary execution of a process.
>>>
>>> The plan is to switch the users of cred_guard_mutex to
>>> exec_udpate_mutex one by one.  This lets us move forward while still
>>> being careful and not introducing any regressions.
>>>
>>> Link: https://lore.kernel.org/lkml/20160921152946.GA24210@dhcp22.suse.cz/
>>> Link: https://lore.kernel.org/lkml/AM6PR03MB5170B06F3A2B75EFB98D071AE4E60@AM6PR03MB5170.eurprd03.prod.outlook.com/
>>> Link: https://lore.kernel.org/linux-fsdevel/20161102181806.GB1112@redhat.com/
>>> Link: https://lore.kernel.org/lkml/20160923095031.GA14923@redhat.com/
>>> Link: https://lore.kernel.org/lkml/20170213141452.GA30203@redhat.com/
>>> Ref: 45c1a159b85b ("Add PTRACE_O_TRACEVFORKDONE and PTRACE_O_TRACEEXIT facilities.")
>>> Ref: 456f17cd1a28 ("[PATCH] user-vm-unlock-2.5.31-A2")
>>> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
>>> Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
>>> ---
>>>  fs/exec.c                    | 17 ++++++++++++++---
>>>  include/linux/binfmts.h      |  8 +++++++-
>>>  include/linux/sched/signal.h |  9 ++++++++-
>>>  init/init_task.c             |  1 +
>>>  kernel/fork.c                |  1 +
>>>  5 files changed, 31 insertions(+), 5 deletions(-)
>>>
>>> v3: this update fixes lock-order and adds an explicit data member in linux_binprm
>>>
>>> diff --git a/fs/exec.c b/fs/exec.c
>>> index d820a72..11974a1 100644
>>> --- a/fs/exec.c
>>> +++ b/fs/exec.c
>>> @@ -1014,12 +1014,17 @@ static int exec_mmap(struct mm_struct *mm)
>>>  {
>>>  	struct task_struct *tsk;
>>>  	struct mm_struct *old_mm, *active_mm;
>>> +	int ret;
>>>  
>>>  	/* Notify parent that we're no longer interested in the old VM */
>>>  	tsk = current;
>>>  	old_mm = current->mm;
>>>  	exec_mm_release(tsk, old_mm);
>>>  
>>> +	ret = mutex_lock_killable(&tsk->signal->exec_update_mutex);
>>> +	if (ret)
>>> +		return ret;
>>> +
>>>  	if (old_mm) {
>>>  		sync_mm_rss(old_mm);
>>>  		/*
>>> @@ -1031,9 +1036,11 @@ static int exec_mmap(struct mm_struct *mm)
>>>  		down_read(&old_mm->mmap_sem);
>>>  		if (unlikely(old_mm->core_state)) {
>>>  			up_read(&old_mm->mmap_sem);
>>> +			mutex_unlock(&tsk->signal->exec_update_mutex);
>>>  			return -EINTR;
>>>  		}
>>>  	}
>>> +
>>>  	task_lock(tsk);
>>>  	active_mm = tsk->active_mm;
>>>  	membarrier_exec_mmap(mm);
>>> @@ -1288,11 +1295,12 @@ int flush_old_exec(struct linux_binprm * bprm)
>>>  		goto out;
>>>  
>>>  	/*
>>> -	 * After clearing bprm->mm (to mark that current is using the
>>> -	 * prepared mm now), we have nothing left of the original
>>> +	 * After setting bprm->called_exec_mmap (to mark that current is
>>> +	 * using the prepared mm now), we have nothing left of the original
>>>  	 * process. If anything from here on returns an error, the check
>>>  	 * in search_binary_handler() will SEGV current.
>>>  	 */
>>> +	bprm->called_exec_mmap = 1;
>>
>> The two below is non-breaking pair:
>>
>> exec_mmap(bprm->mm);
>> bprm->called_exec_mmap = 1;
>>
>> Why not move this into exec_mmap(), so nobody definitely inserts something
>> between them?
>>
> 
> Hmm, could be done, but then I would probably need a different name than
> "called_exec_mmap".
> 
> How about adding a nice function comment to exec_mmap that calls out the
> changed behaviour that the exec_update_mutex is taken unless the function
> fails?

Not sure, I understand correct.

Could you post this like a small patch hunk (on top of anything you want)?

> Bernd.
> 
> 
>>>  	bprm->mm = NULL;
>>>  
>>>  #ifdef CONFIG_POSIX_TIMERS
>>> @@ -1438,6 +1446,8 @@ static void free_bprm(struct linux_binprm *bprm)
>>>  {
>>>  	free_arg_pages(bprm);
>>>  	if (bprm->cred) {
>>> +		if (bprm->called_exec_mmap)
>>> +			mutex_unlock(&current->signal->exec_update_mutex);
>>>  		mutex_unlock(&current->signal->cred_guard_mutex);
>>>  		abort_creds(bprm->cred);
>>>  	}
>>> @@ -1487,6 +1497,7 @@ void install_exec_creds(struct linux_binprm *bprm)
>>>  	 * credentials; any time after this it may be unlocked.
>>>  	 */
>>>  	security_bprm_committed_creds(bprm);
>>> +	mutex_unlock(&current->signal->exec_update_mutex);
>>>  	mutex_unlock(&current->signal->cred_guard_mutex);
>>>  }
>>>  EXPORT_SYMBOL(install_exec_creds);
>>> @@ -1678,7 +1689,7 @@ int search_binary_handler(struct linux_binprm *bprm)
>>>  
>>>  		read_lock(&binfmt_lock);
>>>  		put_binfmt(fmt);
>>> -		if (retval < 0 && !bprm->mm) {
>>> +		if (retval < 0 && bprm->called_exec_mmap) {
>>>  			/* we got to flush_old_exec() and failed after it */
>>>  			read_unlock(&binfmt_lock);
>>>  			force_sigsegv(SIGSEGV);
>>> diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
>>> index b40fc63..a345d9f 100644
>>> --- a/include/linux/binfmts.h
>>> +++ b/include/linux/binfmts.h
>>> @@ -44,7 +44,13 @@ struct linux_binprm {
>>>  		 * exec has happened. Used to sanitize execution environment
>>>  		 * and to set AT_SECURE auxv for glibc.
>>>  		 */
>>> -		secureexec:1;
>>> +		secureexec:1,
>>> +		/*
>>> +		 * Set by flush_old_exec, when exec_mmap has been called.
>>> +		 * This is past the point of no return, when the
>>> +		 * exec_update_mutex has been taken.
>>> +		 */
>>> +		called_exec_mmap:1;
>>>  #ifdef __alpha__
>>>  	unsigned int taso:1;
>>>  #endif
>>> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
>>> index 8805025..a29df79 100644
>>> --- a/include/linux/sched/signal.h
>>> +++ b/include/linux/sched/signal.h
>>> @@ -224,7 +224,14 @@ struct signal_struct {
>>>  
>>>  	struct mutex cred_guard_mutex;	/* guard against foreign influences on
>>>  					 * credential calculations
>>> -					 * (notably. ptrace) */
>>> +					 * (notably. ptrace)
>>> +					 * Deprecated do not use in new code.
>>> +					 * Use exec_update_mutex instead.
>>> +					 */
>>> +	struct mutex exec_update_mutex;	/* Held while task_struct is being
>>> +					 * updated during exec, and may have
>>> +					 * inconsistent permissions.
>>> +					 */
>>>  } __randomize_layout;
>>>  
>>>  /*
>>> diff --git a/init/init_task.c b/init/init_task.c
>>> index 9e5cbe5..bd403ed 100644
>>> --- a/init/init_task.c
>>> +++ b/init/init_task.c
>>> @@ -26,6 +26,7 @@
>>>  	.multiprocess	= HLIST_HEAD_INIT,
>>>  	.rlim		= INIT_RLIMITS,
>>>  	.cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex),
>>> +	.exec_update_mutex = __MUTEX_INITIALIZER(init_signals.exec_update_mutex),
>>>  #ifdef CONFIG_POSIX_TIMERS
>>>  	.posix_timers = LIST_HEAD_INIT(init_signals.posix_timers),
>>>  	.cputimer	= {
>>> diff --git a/kernel/fork.c b/kernel/fork.c
>>> index 8642530..036b692 100644
>>> --- a/kernel/fork.c
>>> +++ b/kernel/fork.c
>>> @@ -1594,6 +1594,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
>>>  	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
>>>  
>>>  	mutex_init(&sig->cred_guard_mutex);
>>> +	mutex_init(&sig->exec_update_mutex);
>>>  
>>>  	return 0;
>>>  }
>>>
>>
Bernd Edlinger March 18, 2020, 8:06 p.m. UTC | #4
On 3/18/20 1:22 PM, Kirill Tkhai wrote:
> On 18.03.2020 00:53, Bernd Edlinger wrote:
>> On 3/17/20 9:56 AM, Kirill Tkhai wrote:
>>> On 14.03.2020 12:11, Bernd Edlinger wrote:
>>>> The cred_guard_mutex is problematic.  The cred_guard_mutex is held
>>>> over the userspace accesses as the arguments from userspace are read.
>>>> The cred_guard_mutex is held of PTRACE_EVENT_EXIT as the the other
>>>> threads are killed.  The cred_guard_mutex is held over
>>>> "put_user(0, tsk->clear_child_tid)" in exit_mm().
>>>>
>>>> Any of those can result in deadlock, as the cred_guard_mutex is held
>>>> over a possible indefinite userspace waits for userspace.
>>>>
>>>> Add exec_update_mutex that is only held over exec updating process
>>>> with the new contents of exec, so that code that needs not to be
>>>> confused by exec changing the mm and the cred in ways that can not
>>>> happen during ordinary execution of a process.
>>>>
>>>> The plan is to switch the users of cred_guard_mutex to
>>>> exec_udpate_mutex one by one.  This lets us move forward while still
>>>> being careful and not introducing any regressions.
>>>>
>>>> Link: https://lore.kernel.org/lkml/20160921152946.GA24210@dhcp22.suse.cz/
>>>> Link: https://lore.kernel.org/lkml/AM6PR03MB5170B06F3A2B75EFB98D071AE4E60@AM6PR03MB5170.eurprd03.prod.outlook.com/
>>>> Link: https://lore.kernel.org/linux-fsdevel/20161102181806.GB1112@redhat.com/
>>>> Link: https://lore.kernel.org/lkml/20160923095031.GA14923@redhat.com/
>>>> Link: https://lore.kernel.org/lkml/20170213141452.GA30203@redhat.com/
>>>> Ref: 45c1a159b85b ("Add PTRACE_O_TRACEVFORKDONE and PTRACE_O_TRACEEXIT facilities.")
>>>> Ref: 456f17cd1a28 ("[PATCH] user-vm-unlock-2.5.31-A2")
>>>> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
>>>> Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
>>>> ---
>>>>  fs/exec.c                    | 17 ++++++++++++++---
>>>>  include/linux/binfmts.h      |  8 +++++++-
>>>>  include/linux/sched/signal.h |  9 ++++++++-
>>>>  init/init_task.c             |  1 +
>>>>  kernel/fork.c                |  1 +
>>>>  5 files changed, 31 insertions(+), 5 deletions(-)
>>>>
>>>> v3: this update fixes lock-order and adds an explicit data member in linux_binprm
>>>>
>>>> diff --git a/fs/exec.c b/fs/exec.c
>>>> index d820a72..11974a1 100644
>>>> --- a/fs/exec.c
>>>> +++ b/fs/exec.c
>>>> @@ -1014,12 +1014,17 @@ static int exec_mmap(struct mm_struct *mm)
>>>>  {
>>>>  	struct task_struct *tsk;
>>>>  	struct mm_struct *old_mm, *active_mm;
>>>> +	int ret;
>>>>  
>>>>  	/* Notify parent that we're no longer interested in the old VM */
>>>>  	tsk = current;
>>>>  	old_mm = current->mm;
>>>>  	exec_mm_release(tsk, old_mm);
>>>>  
>>>> +	ret = mutex_lock_killable(&tsk->signal->exec_update_mutex);
>>>> +	if (ret)
>>>> +		return ret;
>>>> +
>>>>  	if (old_mm) {
>>>>  		sync_mm_rss(old_mm);
>>>>  		/*
>>>> @@ -1031,9 +1036,11 @@ static int exec_mmap(struct mm_struct *mm)
>>>>  		down_read(&old_mm->mmap_sem);
>>>>  		if (unlikely(old_mm->core_state)) {
>>>>  			up_read(&old_mm->mmap_sem);
>>>> +			mutex_unlock(&tsk->signal->exec_update_mutex);
>>>>  			return -EINTR;
>>>>  		}
>>>>  	}
>>>> +
>>>>  	task_lock(tsk);
>>>>  	active_mm = tsk->active_mm;
>>>>  	membarrier_exec_mmap(mm);
>>>> @@ -1288,11 +1295,12 @@ int flush_old_exec(struct linux_binprm * bprm)
>>>>  		goto out;
>>>>  
>>>>  	/*
>>>> -	 * After clearing bprm->mm (to mark that current is using the
>>>> -	 * prepared mm now), we have nothing left of the original
>>>> +	 * After setting bprm->called_exec_mmap (to mark that current is
>>>> +	 * using the prepared mm now), we have nothing left of the original
>>>>  	 * process. If anything from here on returns an error, the check
>>>>  	 * in search_binary_handler() will SEGV current.
>>>>  	 */
>>>> +	bprm->called_exec_mmap = 1;
>>>
>>> The two below is non-breaking pair:
>>>
>>> exec_mmap(bprm->mm);
>>> bprm->called_exec_mmap = 1;
>>>
>>> Why not move this into exec_mmap(), so nobody definitely inserts something
>>> between them?
>>>
>>
>> Hmm, could be done, but then I would probably need a different name than
>> "called_exec_mmap".
>>
>> How about adding a nice function comment to exec_mmap that calls out the
>> changed behaviour that the exec_update_mutex is taken unless the function
>> fails?
> 
> Not sure, I understand correct.
> 
> Could you post this like a small patch hunk (on top of anything you want)?
> 

I was thinking of something like that:

--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1010,6 +1010,11 @@ ssize_t read_code(struct file *file, unsigned long addr, 
 }
 EXPORT_SYMBOL(read_code);
 
+/*
+ * Maps the mm_struct mm into the current task struct.
+ * On success, this function returns with the mutex
+ * exec_update_mutex locked.
+ */
 static int exec_mmap(struct mm_struct *mm)
 {
        struct task_struct *tsk;


>> Bernd.
>>
>>
>>>>  	bprm->mm = NULL;
>>>>  
>>>>  #ifdef CONFIG_POSIX_TIMERS
>>>> @@ -1438,6 +1446,8 @@ static void free_bprm(struct linux_binprm *bprm)
>>>>  {
>>>>  	free_arg_pages(bprm);
>>>>  	if (bprm->cred) {
>>>> +		if (bprm->called_exec_mmap)
>>>> +			mutex_unlock(&current->signal->exec_update_mutex);
>>>>  		mutex_unlock(&current->signal->cred_guard_mutex);
>>>>  		abort_creds(bprm->cred);
>>>>  	}
>>>> @@ -1487,6 +1497,7 @@ void install_exec_creds(struct linux_binprm *bprm)
>>>>  	 * credentials; any time after this it may be unlocked.
>>>>  	 */
>>>>  	security_bprm_committed_creds(bprm);
>>>> +	mutex_unlock(&current->signal->exec_update_mutex);
>>>>  	mutex_unlock(&current->signal->cred_guard_mutex);
>>>>  }
>>>>  EXPORT_SYMBOL(install_exec_creds);
>>>> @@ -1678,7 +1689,7 @@ int search_binary_handler(struct linux_binprm *bprm)
>>>>  
>>>>  		read_lock(&binfmt_lock);
>>>>  		put_binfmt(fmt);
>>>> -		if (retval < 0 && !bprm->mm) {
>>>> +		if (retval < 0 && bprm->called_exec_mmap) {
>>>>  			/* we got to flush_old_exec() and failed after it */
>>>>  			read_unlock(&binfmt_lock);
>>>>  			force_sigsegv(SIGSEGV);
>>>> diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
>>>> index b40fc63..a345d9f 100644
>>>> --- a/include/linux/binfmts.h
>>>> +++ b/include/linux/binfmts.h
>>>> @@ -44,7 +44,13 @@ struct linux_binprm {
>>>>  		 * exec has happened. Used to sanitize execution environment
>>>>  		 * and to set AT_SECURE auxv for glibc.
>>>>  		 */
>>>> -		secureexec:1;
>>>> +		secureexec:1,
>>>> +		/*
>>>> +		 * Set by flush_old_exec, when exec_mmap has been called.
>>>> +		 * This is past the point of no return, when the
>>>> +		 * exec_update_mutex has been taken.
>>>> +		 */
>>>> +		called_exec_mmap:1;
>>>>  #ifdef __alpha__
>>>>  	unsigned int taso:1;
>>>>  #endif
>>>> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
>>>> index 8805025..a29df79 100644
>>>> --- a/include/linux/sched/signal.h
>>>> +++ b/include/linux/sched/signal.h
>>>> @@ -224,7 +224,14 @@ struct signal_struct {
>>>>  
>>>>  	struct mutex cred_guard_mutex;	/* guard against foreign influences on
>>>>  					 * credential calculations
>>>> -					 * (notably. ptrace) */
>>>> +					 * (notably. ptrace)
>>>> +					 * Deprecated do not use in new code.
>>>> +					 * Use exec_update_mutex instead.
>>>> +					 */
>>>> +	struct mutex exec_update_mutex;	/* Held while task_struct is being
>>>> +					 * updated during exec, and may have
>>>> +					 * inconsistent permissions.
>>>> +					 */
>>>>  } __randomize_layout;
>>>>  
>>>>  /*
>>>> diff --git a/init/init_task.c b/init/init_task.c
>>>> index 9e5cbe5..bd403ed 100644
>>>> --- a/init/init_task.c
>>>> +++ b/init/init_task.c
>>>> @@ -26,6 +26,7 @@
>>>>  	.multiprocess	= HLIST_HEAD_INIT,
>>>>  	.rlim		= INIT_RLIMITS,
>>>>  	.cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex),
>>>> +	.exec_update_mutex = __MUTEX_INITIALIZER(init_signals.exec_update_mutex),
>>>>  #ifdef CONFIG_POSIX_TIMERS
>>>>  	.posix_timers = LIST_HEAD_INIT(init_signals.posix_timers),
>>>>  	.cputimer	= {
>>>> diff --git a/kernel/fork.c b/kernel/fork.c
>>>> index 8642530..036b692 100644
>>>> --- a/kernel/fork.c
>>>> +++ b/kernel/fork.c
>>>> @@ -1594,6 +1594,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
>>>>  	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
>>>>  
>>>>  	mutex_init(&sig->cred_guard_mutex);
>>>> +	mutex_init(&sig->exec_update_mutex);
>>>>  
>>>>  	return 0;
>>>>  }
>>>>
>>>
>
Kirill Tkhai March 19, 2020, 7:13 a.m. UTC | #5
On 18.03.2020 23:06, Bernd Edlinger wrote:
> On 3/18/20 1:22 PM, Kirill Tkhai wrote:
>> On 18.03.2020 00:53, Bernd Edlinger wrote:
>>> On 3/17/20 9:56 AM, Kirill Tkhai wrote:
>>>> On 14.03.2020 12:11, Bernd Edlinger wrote:
>>>>> The cred_guard_mutex is problematic.  The cred_guard_mutex is held
>>>>> over the userspace accesses as the arguments from userspace are read.
>>>>> The cred_guard_mutex is held of PTRACE_EVENT_EXIT as the the other
>>>>> threads are killed.  The cred_guard_mutex is held over
>>>>> "put_user(0, tsk->clear_child_tid)" in exit_mm().
>>>>>
>>>>> Any of those can result in deadlock, as the cred_guard_mutex is held
>>>>> over a possible indefinite userspace waits for userspace.
>>>>>
>>>>> Add exec_update_mutex that is only held over exec updating process
>>>>> with the new contents of exec, so that code that needs not to be
>>>>> confused by exec changing the mm and the cred in ways that can not
>>>>> happen during ordinary execution of a process.
>>>>>
>>>>> The plan is to switch the users of cred_guard_mutex to
>>>>> exec_udpate_mutex one by one.  This lets us move forward while still
>>>>> being careful and not introducing any regressions.
>>>>>
>>>>> Link: https://lore.kernel.org/lkml/20160921152946.GA24210@dhcp22.suse.cz/
>>>>> Link: https://lore.kernel.org/lkml/AM6PR03MB5170B06F3A2B75EFB98D071AE4E60@AM6PR03MB5170.eurprd03.prod.outlook.com/
>>>>> Link: https://lore.kernel.org/linux-fsdevel/20161102181806.GB1112@redhat.com/
>>>>> Link: https://lore.kernel.org/lkml/20160923095031.GA14923@redhat.com/
>>>>> Link: https://lore.kernel.org/lkml/20170213141452.GA30203@redhat.com/
>>>>> Ref: 45c1a159b85b ("Add PTRACE_O_TRACEVFORKDONE and PTRACE_O_TRACEEXIT facilities.")
>>>>> Ref: 456f17cd1a28 ("[PATCH] user-vm-unlock-2.5.31-A2")
>>>>> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
>>>>> Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
>>>>> ---
>>>>>  fs/exec.c                    | 17 ++++++++++++++---
>>>>>  include/linux/binfmts.h      |  8 +++++++-
>>>>>  include/linux/sched/signal.h |  9 ++++++++-
>>>>>  init/init_task.c             |  1 +
>>>>>  kernel/fork.c                |  1 +
>>>>>  5 files changed, 31 insertions(+), 5 deletions(-)
>>>>>
>>>>> v3: this update fixes lock-order and adds an explicit data member in linux_binprm
>>>>>
>>>>> diff --git a/fs/exec.c b/fs/exec.c
>>>>> index d820a72..11974a1 100644
>>>>> --- a/fs/exec.c
>>>>> +++ b/fs/exec.c
>>>>> @@ -1014,12 +1014,17 @@ static int exec_mmap(struct mm_struct *mm)
>>>>>  {
>>>>>  	struct task_struct *tsk;
>>>>>  	struct mm_struct *old_mm, *active_mm;
>>>>> +	int ret;
>>>>>  
>>>>>  	/* Notify parent that we're no longer interested in the old VM */
>>>>>  	tsk = current;
>>>>>  	old_mm = current->mm;
>>>>>  	exec_mm_release(tsk, old_mm);
>>>>>  
>>>>> +	ret = mutex_lock_killable(&tsk->signal->exec_update_mutex);
>>>>> +	if (ret)
>>>>> +		return ret;
>>>>> +
>>>>>  	if (old_mm) {
>>>>>  		sync_mm_rss(old_mm);
>>>>>  		/*
>>>>> @@ -1031,9 +1036,11 @@ static int exec_mmap(struct mm_struct *mm)
>>>>>  		down_read(&old_mm->mmap_sem);
>>>>>  		if (unlikely(old_mm->core_state)) {
>>>>>  			up_read(&old_mm->mmap_sem);
>>>>> +			mutex_unlock(&tsk->signal->exec_update_mutex);
>>>>>  			return -EINTR;
>>>>>  		}
>>>>>  	}
>>>>> +
>>>>>  	task_lock(tsk);
>>>>>  	active_mm = tsk->active_mm;
>>>>>  	membarrier_exec_mmap(mm);
>>>>> @@ -1288,11 +1295,12 @@ int flush_old_exec(struct linux_binprm * bprm)
>>>>>  		goto out;
>>>>>  
>>>>>  	/*
>>>>> -	 * After clearing bprm->mm (to mark that current is using the
>>>>> -	 * prepared mm now), we have nothing left of the original
>>>>> +	 * After setting bprm->called_exec_mmap (to mark that current is
>>>>> +	 * using the prepared mm now), we have nothing left of the original
>>>>>  	 * process. If anything from here on returns an error, the check
>>>>>  	 * in search_binary_handler() will SEGV current.
>>>>>  	 */
>>>>> +	bprm->called_exec_mmap = 1;
>>>>
>>>> The two below is non-breaking pair:
>>>>
>>>> exec_mmap(bprm->mm);
>>>> bprm->called_exec_mmap = 1;
>>>>
>>>> Why not move this into exec_mmap(), so nobody definitely inserts something
>>>> between them?
>>>>
>>>
>>> Hmm, could be done, but then I would probably need a different name than
>>> "called_exec_mmap".
>>>
>>> How about adding a nice function comment to exec_mmap that calls out the
>>> changed behaviour that the exec_update_mutex is taken unless the function
>>> fails?
>>
>> Not sure, I understand correct.
>>
>> Could you post this like a small patch hunk (on top of anything you want)?
>>
> 
> I was thinking of something like that:
> 
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1010,6 +1010,11 @@ ssize_t read_code(struct file *file, unsigned long addr, 
>  }
>  EXPORT_SYMBOL(read_code);
>  
> +/*
> + * Maps the mm_struct mm into the current task struct.
> + * On success, this function returns with the mutex
> + * exec_update_mutex locked.
> + */

Looks OK for me.

>  static int exec_mmap(struct mm_struct *mm)
>  {
>         struct task_struct *tsk;
> 
> 
>>> Bernd.
>>>
>>>
>>>>>  	bprm->mm = NULL;
>>>>>  
>>>>>  #ifdef CONFIG_POSIX_TIMERS
>>>>> @@ -1438,6 +1446,8 @@ static void free_bprm(struct linux_binprm *bprm)
>>>>>  {
>>>>>  	free_arg_pages(bprm);
>>>>>  	if (bprm->cred) {
>>>>> +		if (bprm->called_exec_mmap)
>>>>> +			mutex_unlock(&current->signal->exec_update_mutex);
>>>>>  		mutex_unlock(&current->signal->cred_guard_mutex);
>>>>>  		abort_creds(bprm->cred);
>>>>>  	}
>>>>> @@ -1487,6 +1497,7 @@ void install_exec_creds(struct linux_binprm *bprm)
>>>>>  	 * credentials; any time after this it may be unlocked.
>>>>>  	 */
>>>>>  	security_bprm_committed_creds(bprm);
>>>>> +	mutex_unlock(&current->signal->exec_update_mutex);
>>>>>  	mutex_unlock(&current->signal->cred_guard_mutex);
>>>>>  }
>>>>>  EXPORT_SYMBOL(install_exec_creds);
>>>>> @@ -1678,7 +1689,7 @@ int search_binary_handler(struct linux_binprm *bprm)
>>>>>  
>>>>>  		read_lock(&binfmt_lock);
>>>>>  		put_binfmt(fmt);
>>>>> -		if (retval < 0 && !bprm->mm) {
>>>>> +		if (retval < 0 && bprm->called_exec_mmap) {
>>>>>  			/* we got to flush_old_exec() and failed after it */
>>>>>  			read_unlock(&binfmt_lock);
>>>>>  			force_sigsegv(SIGSEGV);
>>>>> diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
>>>>> index b40fc63..a345d9f 100644
>>>>> --- a/include/linux/binfmts.h
>>>>> +++ b/include/linux/binfmts.h
>>>>> @@ -44,7 +44,13 @@ struct linux_binprm {
>>>>>  		 * exec has happened. Used to sanitize execution environment
>>>>>  		 * and to set AT_SECURE auxv for glibc.
>>>>>  		 */
>>>>> -		secureexec:1;
>>>>> +		secureexec:1,
>>>>> +		/*
>>>>> +		 * Set by flush_old_exec, when exec_mmap has been called.
>>>>> +		 * This is past the point of no return, when the
>>>>> +		 * exec_update_mutex has been taken.
>>>>> +		 */
>>>>> +		called_exec_mmap:1;
>>>>>  #ifdef __alpha__
>>>>>  	unsigned int taso:1;
>>>>>  #endif
>>>>> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
>>>>> index 8805025..a29df79 100644
>>>>> --- a/include/linux/sched/signal.h
>>>>> +++ b/include/linux/sched/signal.h
>>>>> @@ -224,7 +224,14 @@ struct signal_struct {
>>>>>  
>>>>>  	struct mutex cred_guard_mutex;	/* guard against foreign influences on
>>>>>  					 * credential calculations
>>>>> -					 * (notably. ptrace) */
>>>>> +					 * (notably. ptrace)
>>>>> +					 * Deprecated do not use in new code.
>>>>> +					 * Use exec_update_mutex instead.
>>>>> +					 */
>>>>> +	struct mutex exec_update_mutex;	/* Held while task_struct is being
>>>>> +					 * updated during exec, and may have
>>>>> +					 * inconsistent permissions.
>>>>> +					 */
>>>>>  } __randomize_layout;
>>>>>  
>>>>>  /*
>>>>> diff --git a/init/init_task.c b/init/init_task.c
>>>>> index 9e5cbe5..bd403ed 100644
>>>>> --- a/init/init_task.c
>>>>> +++ b/init/init_task.c
>>>>> @@ -26,6 +26,7 @@
>>>>>  	.multiprocess	= HLIST_HEAD_INIT,
>>>>>  	.rlim		= INIT_RLIMITS,
>>>>>  	.cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex),
>>>>> +	.exec_update_mutex = __MUTEX_INITIALIZER(init_signals.exec_update_mutex),
>>>>>  #ifdef CONFIG_POSIX_TIMERS
>>>>>  	.posix_timers = LIST_HEAD_INIT(init_signals.posix_timers),
>>>>>  	.cputimer	= {
>>>>> diff --git a/kernel/fork.c b/kernel/fork.c
>>>>> index 8642530..036b692 100644
>>>>> --- a/kernel/fork.c
>>>>> +++ b/kernel/fork.c
>>>>> @@ -1594,6 +1594,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
>>>>>  	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
>>>>>  
>>>>>  	mutex_init(&sig->cred_guard_mutex);
>>>>> +	mutex_init(&sig->exec_update_mutex);
>>>>>  
>>>>>  	return 0;
>>>>>  }
>>>>>
>>>>
>>
Bernd Edlinger March 19, 2020, 7:19 a.m. UTC | #6
On 3/19/20 8:13 AM, Kirill Tkhai wrote:
> On 18.03.2020 23:06, Bernd Edlinger wrote:
>>
>> I was thinking of something like that:
>>
>> --- a/fs/exec.c
>> +++ b/fs/exec.c
>> @@ -1010,6 +1010,11 @@ ssize_t read_code(struct file *file, unsigned long addr, 
>>  }
>>  EXPORT_SYMBOL(read_code);
>>  
>> +/*
>> + * Maps the mm_struct mm into the current task struct.
>> + * On success, this function returns with the mutex
>> + * exec_update_mutex locked.
>> + */
> 
> Looks OK for me.
> 

Cool, yeah, then I will post an updated patch in a moment.


Thanks
Bernd.
diff mbox series

Patch

diff --git a/fs/exec.c b/fs/exec.c
index d820a72..11974a1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1014,12 +1014,17 @@  static int exec_mmap(struct mm_struct *mm)
 {
 	struct task_struct *tsk;
 	struct mm_struct *old_mm, *active_mm;
+	int ret;
 
 	/* Notify parent that we're no longer interested in the old VM */
 	tsk = current;
 	old_mm = current->mm;
 	exec_mm_release(tsk, old_mm);
 
+	ret = mutex_lock_killable(&tsk->signal->exec_update_mutex);
+	if (ret)
+		return ret;
+
 	if (old_mm) {
 		sync_mm_rss(old_mm);
 		/*
@@ -1031,9 +1036,11 @@  static int exec_mmap(struct mm_struct *mm)
 		down_read(&old_mm->mmap_sem);
 		if (unlikely(old_mm->core_state)) {
 			up_read(&old_mm->mmap_sem);
+			mutex_unlock(&tsk->signal->exec_update_mutex);
 			return -EINTR;
 		}
 	}
+
 	task_lock(tsk);
 	active_mm = tsk->active_mm;
 	membarrier_exec_mmap(mm);
@@ -1288,11 +1295,12 @@  int flush_old_exec(struct linux_binprm * bprm)
 		goto out;
 
 	/*
-	 * After clearing bprm->mm (to mark that current is using the
-	 * prepared mm now), we have nothing left of the original
+	 * After setting bprm->called_exec_mmap (to mark that current is
+	 * using the prepared mm now), we have nothing left of the original
 	 * process. If anything from here on returns an error, the check
 	 * in search_binary_handler() will SEGV current.
 	 */
+	bprm->called_exec_mmap = 1;
 	bprm->mm = NULL;
 
 #ifdef CONFIG_POSIX_TIMERS
@@ -1438,6 +1446,8 @@  static void free_bprm(struct linux_binprm *bprm)
 {
 	free_arg_pages(bprm);
 	if (bprm->cred) {
+		if (bprm->called_exec_mmap)
+			mutex_unlock(&current->signal->exec_update_mutex);
 		mutex_unlock(&current->signal->cred_guard_mutex);
 		abort_creds(bprm->cred);
 	}
@@ -1487,6 +1497,7 @@  void install_exec_creds(struct linux_binprm *bprm)
 	 * credentials; any time after this it may be unlocked.
 	 */
 	security_bprm_committed_creds(bprm);
+	mutex_unlock(&current->signal->exec_update_mutex);
 	mutex_unlock(&current->signal->cred_guard_mutex);
 }
 EXPORT_SYMBOL(install_exec_creds);
@@ -1678,7 +1689,7 @@  int search_binary_handler(struct linux_binprm *bprm)
 
 		read_lock(&binfmt_lock);
 		put_binfmt(fmt);
-		if (retval < 0 && !bprm->mm) {
+		if (retval < 0 && bprm->called_exec_mmap) {
 			/* we got to flush_old_exec() and failed after it */
 			read_unlock(&binfmt_lock);
 			force_sigsegv(SIGSEGV);
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index b40fc63..a345d9f 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -44,7 +44,13 @@  struct linux_binprm {
 		 * exec has happened. Used to sanitize execution environment
 		 * and to set AT_SECURE auxv for glibc.
 		 */
-		secureexec:1;
+		secureexec:1,
+		/*
+		 * Set by flush_old_exec, when exec_mmap has been called.
+		 * This is past the point of no return, when the
+		 * exec_update_mutex has been taken.
+		 */
+		called_exec_mmap:1;
 #ifdef __alpha__
 	unsigned int taso:1;
 #endif
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 8805025..a29df79 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -224,7 +224,14 @@  struct signal_struct {
 
 	struct mutex cred_guard_mutex;	/* guard against foreign influences on
 					 * credential calculations
-					 * (notably. ptrace) */
+					 * (notably. ptrace)
+					 * Deprecated do not use in new code.
+					 * Use exec_update_mutex instead.
+					 */
+	struct mutex exec_update_mutex;	/* Held while task_struct is being
+					 * updated during exec, and may have
+					 * inconsistent permissions.
+					 */
 } __randomize_layout;
 
 /*
diff --git a/init/init_task.c b/init/init_task.c
index 9e5cbe5..bd403ed 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -26,6 +26,7 @@ 
 	.multiprocess	= HLIST_HEAD_INIT,
 	.rlim		= INIT_RLIMITS,
 	.cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex),
+	.exec_update_mutex = __MUTEX_INITIALIZER(init_signals.exec_update_mutex),
 #ifdef CONFIG_POSIX_TIMERS
 	.posix_timers = LIST_HEAD_INIT(init_signals.posix_timers),
 	.cputimer	= {
diff --git a/kernel/fork.c b/kernel/fork.c
index 8642530..036b692 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1594,6 +1594,7 @@  static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
 
 	mutex_init(&sig->cred_guard_mutex);
+	mutex_init(&sig->exec_update_mutex);
 
 	return 0;
 }