diff mbox series

[v1,3/7] kernel/fork: always deny write access to current MM exe_file

Message ID 20210812084348.6521-4-david@redhat.com (mailing list archive)
State New
Headers show
Series Remove in-tree usage of MAP_DENYWRITE | expand

Commit Message

David Hildenbrand Aug. 12, 2021, 8:43 a.m. UTC
We want to remove VM_DENYWRITE only currently only used when mapping the
executable during exec. During exec, we already deny_write_access() the
executable, however, after exec completes the VMAs mapped
with VM_DENYWRITE effectively keeps write access denied via
deny_write_access().

Let's deny write access when setting the MM exe_file. With this change, we
can remove VM_DENYWRITE for mapping executables.

This represents a minor user space visible change:
sys_prctl(PR_SET_MM_EXE_FILE) can now fail if the file is already
opened writable. Also, after sys_prctl(PR_SET_MM_EXE_FILE), the file
cannot be opened writable. Note that we can already fail with -EACCES if
the file doesn't have execute permissions.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 kernel/fork.c | 39 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 5 deletions(-)

Comments

Christian Brauner Aug. 12, 2021, 10:05 a.m. UTC | #1
[+Cc Andrei]

On Thu, Aug 12, 2021 at 10:43:44AM +0200, David Hildenbrand wrote:
> We want to remove VM_DENYWRITE only currently only used when mapping the
> executable during exec. During exec, we already deny_write_access() the
> executable, however, after exec completes the VMAs mapped
> with VM_DENYWRITE effectively keeps write access denied via
> deny_write_access().
> 
> Let's deny write access when setting the MM exe_file. With this change, we
> can remove VM_DENYWRITE for mapping executables.
> 
> This represents a minor user space visible change:
> sys_prctl(PR_SET_MM_EXE_FILE) can now fail if the file is already
> opened writable. Also, after sys_prctl(PR_SET_MM_EXE_FILE), the file

Just for completeness, this also affects PR_SET_MM_MAP when exe_fd is
set.

> cannot be opened writable. Note that we can already fail with -EACCES if
> the file doesn't have execute permissions.
> 
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---

The biggest user I know and that I'm involved in is CRIU which heavily
uses PR_SET_MM_MAP (with a fallback to PR_SET_MM_EXE_FILE on older
kernels) during restore. Afair, criu opens the exe fd as an O_PATH
during dump and thus will use the same flag during restore when
opening it. So that should be fine.

However, if I understand the consequences of this change correctly, a
problem could be restoring workloads that hold a writable fd open to
their exe file at dump time which would mean that during restore that fd
would be reopened writable causing CRIU to fail when setting the exe
file for the task to be restored.

Which honestly, no idea how many such workloads exist. (I know at least
of runC and LXC need to sometimes reopen to rexec themselves (weird bug
to protect against attacking the exe file) and thus re-open
/proc/self/exe but read-only.)

>  kernel/fork.c | 39 ++++++++++++++++++++++++++++++++++-----
>  1 file changed, 34 insertions(+), 5 deletions(-)
> 
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 6bd2e52bcdfb..5d904878f19b 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -476,6 +476,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
>  {
>  	struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
>  	struct rb_node **rb_link, *rb_parent;
> +	struct file *exe_file;
>  	int retval;
>  	unsigned long charge;
>  	LIST_HEAD(uf);
> @@ -493,7 +494,10 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
>  	mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
>  
>  	/* No ordering required: file already has been exposed. */
> -	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
> +	exe_file = get_mm_exe_file(oldmm);
> +	RCU_INIT_POINTER(mm->exe_file, exe_file);
> +	if (exe_file)
> +		deny_write_access(exe_file);
>  
>  	mm->total_vm = oldmm->total_vm;
>  	mm->data_vm = oldmm->data_vm;
> @@ -638,8 +642,13 @@ static inline void mm_free_pgd(struct mm_struct *mm)
>  #else
>  static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
>  {
> +	struct file *exe_file;
> +
>  	mmap_write_lock(oldmm);
> -	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
> +	exe_file = get_mm_exe_file(oldmm);
> +	RCU_INIT_POINTER(mm->exe_file, exe_file);
> +	if (exe_file)
> +		deny_write_access(exe_file);
>  	mmap_write_unlock(oldmm);
>  	return 0;
>  }
> @@ -1163,11 +1172,19 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
>  	 */
>  	old_exe_file = rcu_dereference_raw(mm->exe_file);
>  
> -	if (new_exe_file)
> +	if (new_exe_file) {
>  		get_file(new_exe_file);
> +		/*
> +		 * exec code is required to deny_write_access() successfully,
> +		 * so this cannot fail
> +		 */
> +		deny_write_access(new_exe_file);
> +	}
>  	rcu_assign_pointer(mm->exe_file, new_exe_file);
> -	if (old_exe_file)
> +	if (old_exe_file) {
> +		allow_write_access(old_exe_file);
>  		fput(old_exe_file);
> +	}
>  }
>  
>  int atomic_set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
> @@ -1194,10 +1211,22 @@ int atomic_set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
>  	}
>  
>  	/* set the new file, lockless */
> +	ret = deny_write_access(new_exe_file);
> +	if (ret)
> +		return -EACCES;
>  	get_file(new_exe_file);
> +
>  	old_exe_file = xchg(&mm->exe_file, new_exe_file);
> -	if (old_exe_file)
> +	if (old_exe_file) {
> +		/*
> +		 * Don't race with dup_mmap() getting the file and disallowing
> +		 * write access while someone might open the file writable.
> +		 */
> +		mmap_read_lock(mm);
> +		allow_write_access(old_exe_file);
>  		fput(old_exe_file);
> +		mmap_read_unlock(mm);
> +	}
>  	return 0;
>  }
>  
> -- 
> 2.31.1
>
David Hildenbrand Aug. 12, 2021, 10:13 a.m. UTC | #2
On 12.08.21 12:05, Christian Brauner wrote:
> [+Cc Andrei]
> 
> On Thu, Aug 12, 2021 at 10:43:44AM +0200, David Hildenbrand wrote:
>> We want to remove VM_DENYWRITE only currently only used when mapping the
>> executable during exec. During exec, we already deny_write_access() the
>> executable, however, after exec completes the VMAs mapped
>> with VM_DENYWRITE effectively keeps write access denied via
>> deny_write_access().
>>
>> Let's deny write access when setting the MM exe_file. With this change, we
>> can remove VM_DENYWRITE for mapping executables.
>>
>> This represents a minor user space visible change:
>> sys_prctl(PR_SET_MM_EXE_FILE) can now fail if the file is already
>> opened writable. Also, after sys_prctl(PR_SET_MM_EXE_FILE), the file
> 
> Just for completeness, this also affects PR_SET_MM_MAP when exe_fd is
> set.

Correct.

> 
>> cannot be opened writable. Note that we can already fail with -EACCES if
>> the file doesn't have execute permissions.
>>
>> Signed-off-by: David Hildenbrand <david@redhat.com>
>> ---
> 
> The biggest user I know and that I'm involved in is CRIU which heavily
> uses PR_SET_MM_MAP (with a fallback to PR_SET_MM_EXE_FILE on older
> kernels) during restore. Afair, criu opens the exe fd as an O_PATH
> during dump and thus will use the same flag during restore when
> opening it. So that should be fine.

Yes.

> 
> However, if I understand the consequences of this change correctly, a
> problem could be restoring workloads that hold a writable fd open to
> their exe file at dump time which would mean that during restore that fd
> would be reopened writable causing CRIU to fail when setting the exe
> file for the task to be restored.

If it's their exe file, then the existing VM_DENYWRITE handling would 
have forbidden these workloads to open the fd of their exe file 
writable, right? At least before doing any 
PR_SET_MM_MAP/PR_SET_MM_EXE_FILE. But that should rule out quite a lot 
of cases we might be worried about, right?

> 
> Which honestly, no idea how many such workloads exist. (I know at least
> of runC and LXC need to sometimes reopen to rexec themselves (weird bug
> to protect against attacking the exe file) and thus re-open
> /proc/self/exe but read-only.)
> 
>>   kernel/fork.c | 39 ++++++++++++++++++++++++++++++++++-----
>>   1 file changed, 34 insertions(+), 5 deletions(-)
>>
>> diff --git a/kernel/fork.c b/kernel/fork.c
>> index 6bd2e52bcdfb..5d904878f19b 100644
>> --- a/kernel/fork.c
>> +++ b/kernel/fork.c
>> @@ -476,6 +476,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
>>   {
>>   	struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
>>   	struct rb_node **rb_link, *rb_parent;
>> +	struct file *exe_file;
>>   	int retval;
>>   	unsigned long charge;
>>   	LIST_HEAD(uf);
>> @@ -493,7 +494,10 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
>>   	mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
>>   
>>   	/* No ordering required: file already has been exposed. */
>> -	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
>> +	exe_file = get_mm_exe_file(oldmm);
>> +	RCU_INIT_POINTER(mm->exe_file, exe_file);
>> +	if (exe_file)
>> +		deny_write_access(exe_file);
>>   
>>   	mm->total_vm = oldmm->total_vm;
>>   	mm->data_vm = oldmm->data_vm;
>> @@ -638,8 +642,13 @@ static inline void mm_free_pgd(struct mm_struct *mm)
>>   #else
>>   static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
>>   {
>> +	struct file *exe_file;
>> +
>>   	mmap_write_lock(oldmm);
>> -	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
>> +	exe_file = get_mm_exe_file(oldmm);
>> +	RCU_INIT_POINTER(mm->exe_file, exe_file);
>> +	if (exe_file)
>> +		deny_write_access(exe_file);
>>   	mmap_write_unlock(oldmm);
>>   	return 0;
>>   }
>> @@ -1163,11 +1172,19 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
>>   	 */
>>   	old_exe_file = rcu_dereference_raw(mm->exe_file);
>>   
>> -	if (new_exe_file)
>> +	if (new_exe_file) {
>>   		get_file(new_exe_file);
>> +		/*
>> +		 * exec code is required to deny_write_access() successfully,
>> +		 * so this cannot fail
>> +		 */
>> +		deny_write_access(new_exe_file);
>> +	}
>>   	rcu_assign_pointer(mm->exe_file, new_exe_file);
>> -	if (old_exe_file)
>> +	if (old_exe_file) {
>> +		allow_write_access(old_exe_file);
>>   		fput(old_exe_file);
>> +	}
>>   }
>>   
>>   int atomic_set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
>> @@ -1194,10 +1211,22 @@ int atomic_set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
>>   	}
>>   
>>   	/* set the new file, lockless */
>> +	ret = deny_write_access(new_exe_file);
>> +	if (ret)
>> +		return -EACCES;
>>   	get_file(new_exe_file);
>> +
>>   	old_exe_file = xchg(&mm->exe_file, new_exe_file);
>> -	if (old_exe_file)
>> +	if (old_exe_file) {
>> +		/*
>> +		 * Don't race with dup_mmap() getting the file and disallowing
>> +		 * write access while someone might open the file writable.
>> +		 */
>> +		mmap_read_lock(mm);
>> +		allow_write_access(old_exe_file);
>>   		fput(old_exe_file);
>> +		mmap_read_unlock(mm);
>> +	}
>>   	return 0;
>>   }
>>   
>> -- 
>> 2.31.1
>>
>
Christian Brauner Aug. 12, 2021, 12:32 p.m. UTC | #3
On Thu, Aug 12, 2021 at 12:13:44PM +0200, David Hildenbrand wrote:
> On 12.08.21 12:05, Christian Brauner wrote:
> > [+Cc Andrei]
> > 
> > On Thu, Aug 12, 2021 at 10:43:44AM +0200, David Hildenbrand wrote:
> > > We want to remove VM_DENYWRITE only currently only used when mapping the
> > > executable during exec. During exec, we already deny_write_access() the
> > > executable, however, after exec completes the VMAs mapped
> > > with VM_DENYWRITE effectively keeps write access denied via
> > > deny_write_access().
> > > 
> > > Let's deny write access when setting the MM exe_file. With this change, we
> > > can remove VM_DENYWRITE for mapping executables.
> > > 
> > > This represents a minor user space visible change:
> > > sys_prctl(PR_SET_MM_EXE_FILE) can now fail if the file is already
> > > opened writable. Also, after sys_prctl(PR_SET_MM_EXE_FILE), the file
> > 
> > Just for completeness, this also affects PR_SET_MM_MAP when exe_fd is
> > set.
> 
> Correct.
> 
> > 
> > > cannot be opened writable. Note that we can already fail with -EACCES if
> > > the file doesn't have execute permissions.
> > > 
> > > Signed-off-by: David Hildenbrand <david@redhat.com>
> > > ---
> > 
> > The biggest user I know and that I'm involved in is CRIU which heavily
> > uses PR_SET_MM_MAP (with a fallback to PR_SET_MM_EXE_FILE on older
> > kernels) during restore. Afair, criu opens the exe fd as an O_PATH
> > during dump and thus will use the same flag during restore when
> > opening it. So that should be fine.
> 
> Yes.
> 
> > 
> > However, if I understand the consequences of this change correctly, a
> > problem could be restoring workloads that hold a writable fd open to
> > their exe file at dump time which would mean that during restore that fd
> > would be reopened writable causing CRIU to fail when setting the exe
> > file for the task to be restored.
> 
> If it's their exe file, then the existing VM_DENYWRITE handling would have
> forbidden these workloads to open the fd of their exe file writable, right?

Yes.

> At least before doing any PR_SET_MM_MAP/PR_SET_MM_EXE_FILE. But that should
> rule out quite a lot of cases we might be worried about, right?

Yes, it rules out the most obvious cases. The problem is really just
that we don't know how common weirder cases are. But that doesn't mean
we shouldn't try and risk it. This is a nice cleanup and playing
/proc/self/exe games isn't super common.

> 
> > 
> > Which honestly, no idea how many such workloads exist. (I know at least
> > of runC and LXC need to sometimes reopen to rexec themselves (weird bug
> > to protect against attacking the exe file) and thus re-open
> > /proc/self/exe but read-only.)
> > 
> > >   kernel/fork.c | 39 ++++++++++++++++++++++++++++++++++-----
> > >   1 file changed, 34 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/kernel/fork.c b/kernel/fork.c
> > > index 6bd2e52bcdfb..5d904878f19b 100644
> > > --- a/kernel/fork.c
> > > +++ b/kernel/fork.c
> > > @@ -476,6 +476,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
> > >   {
> > >   	struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
> > >   	struct rb_node **rb_link, *rb_parent;
> > > +	struct file *exe_file;
> > >   	int retval;
> > >   	unsigned long charge;
> > >   	LIST_HEAD(uf);
> > > @@ -493,7 +494,10 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
> > >   	mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
> > >   	/* No ordering required: file already has been exposed. */
> > > -	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
> > > +	exe_file = get_mm_exe_file(oldmm);
> > > +	RCU_INIT_POINTER(mm->exe_file, exe_file);
> > > +	if (exe_file)
> > > +		deny_write_access(exe_file);
> > >   	mm->total_vm = oldmm->total_vm;
> > >   	mm->data_vm = oldmm->data_vm;
> > > @@ -638,8 +642,13 @@ static inline void mm_free_pgd(struct mm_struct *mm)
> > >   #else
> > >   static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
> > >   {
> > > +	struct file *exe_file;
> > > +
> > >   	mmap_write_lock(oldmm);
> > > -	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
> > > +	exe_file = get_mm_exe_file(oldmm);
> > > +	RCU_INIT_POINTER(mm->exe_file, exe_file);
> > > +	if (exe_file)
> > > +		deny_write_access(exe_file);
> > >   	mmap_write_unlock(oldmm);
> > >   	return 0;
> > >   }
> > > @@ -1163,11 +1172,19 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
> > >   	 */
> > >   	old_exe_file = rcu_dereference_raw(mm->exe_file);
> > > -	if (new_exe_file)
> > > +	if (new_exe_file) {
> > >   		get_file(new_exe_file);
> > > +		/*
> > > +		 * exec code is required to deny_write_access() successfully,
> > > +		 * so this cannot fail
> > > +		 */
> > > +		deny_write_access(new_exe_file);
> > > +	}
> > >   	rcu_assign_pointer(mm->exe_file, new_exe_file);
> > > -	if (old_exe_file)
> > > +	if (old_exe_file) {
> > > +		allow_write_access(old_exe_file);
> > >   		fput(old_exe_file);
> > > +	}
> > >   }
> > >   int atomic_set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
> > > @@ -1194,10 +1211,22 @@ int atomic_set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
> > >   	}
> > >   	/* set the new file, lockless */
> > > +	ret = deny_write_access(new_exe_file);
> > > +	if (ret)
> > > +		return -EACCES;
> > >   	get_file(new_exe_file);
> > > +
> > >   	old_exe_file = xchg(&mm->exe_file, new_exe_file);
> > > -	if (old_exe_file)
> > > +	if (old_exe_file) {
> > > +		/*
> > > +		 * Don't race with dup_mmap() getting the file and disallowing
> > > +		 * write access while someone might open the file writable.
> > > +		 */
> > > +		mmap_read_lock(mm);
> > > +		allow_write_access(old_exe_file);
> > >   		fput(old_exe_file);
> > > +		mmap_read_unlock(mm);
> > > +	}
> > >   	return 0;
> > >   }
> > > -- 
> > > 2.31.1
> > > 
> > 
> 
> 
> -- 
> Thanks,
> 
> David / dhildenb
>
David Hildenbrand Aug. 12, 2021, 12:38 p.m. UTC | #4
On 12.08.21 14:32, Christian Brauner wrote:
> On Thu, Aug 12, 2021 at 12:13:44PM +0200, David Hildenbrand wrote:
>> On 12.08.21 12:05, Christian Brauner wrote:
>>> [+Cc Andrei]
>>>
>>> On Thu, Aug 12, 2021 at 10:43:44AM +0200, David Hildenbrand wrote:
>>>> We want to remove VM_DENYWRITE only currently only used when mapping the
>>>> executable during exec. During exec, we already deny_write_access() the
>>>> executable, however, after exec completes the VMAs mapped
>>>> with VM_DENYWRITE effectively keeps write access denied via
>>>> deny_write_access().
>>>>
>>>> Let's deny write access when setting the MM exe_file. With this change, we
>>>> can remove VM_DENYWRITE for mapping executables.
>>>>
>>>> This represents a minor user space visible change:
>>>> sys_prctl(PR_SET_MM_EXE_FILE) can now fail if the file is already
>>>> opened writable. Also, after sys_prctl(PR_SET_MM_EXE_FILE), the file
>>>
>>> Just for completeness, this also affects PR_SET_MM_MAP when exe_fd is
>>> set.
>>
>> Correct.
>>
>>>
>>>> cannot be opened writable. Note that we can already fail with -EACCES if
>>>> the file doesn't have execute permissions.
>>>>
>>>> Signed-off-by: David Hildenbrand <david@redhat.com>
>>>> ---
>>>
>>> The biggest user I know and that I'm involved in is CRIU which heavily
>>> uses PR_SET_MM_MAP (with a fallback to PR_SET_MM_EXE_FILE on older
>>> kernels) during restore. Afair, criu opens the exe fd as an O_PATH
>>> during dump and thus will use the same flag during restore when
>>> opening it. So that should be fine.
>>
>> Yes.
>>
>>>
>>> However, if I understand the consequences of this change correctly, a
>>> problem could be restoring workloads that hold a writable fd open to
>>> their exe file at dump time which would mean that during restore that fd
>>> would be reopened writable causing CRIU to fail when setting the exe
>>> file for the task to be restored.
>>
>> If it's their exe file, then the existing VM_DENYWRITE handling would have
>> forbidden these workloads to open the fd of their exe file writable, right?
> 
> Yes.
> 
>> At least before doing any PR_SET_MM_MAP/PR_SET_MM_EXE_FILE. But that should
>> rule out quite a lot of cases we might be worried about, right?
> 
> Yes, it rules out the most obvious cases. The problem is really just
> that we don't know how common weirder cases are. But that doesn't mean
> we shouldn't try and risk it. This is a nice cleanup and playing
> /proc/self/exe games isn't super common.
> 

Right, and having the file your executing opened writable isn't 
something very common as well.

If we really run into problems, we could not protect the new file when 
issuing PR_SET_MM_MAP/PR_SET_MM_EXE_FILE. But I'd like to avoid that, if 
possible, because it feels like working around something that never 
should have worked that way and is quite inconsistent.
Linus Torvalds Aug. 12, 2021, 4:51 p.m. UTC | #5
On Wed, Aug 11, 2021 at 10:45 PM David Hildenbrand <david@redhat.com> wrote:
>
>         /* No ordering required: file already has been exposed. */
> -       RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
> +       exe_file = get_mm_exe_file(oldmm);
> +       RCU_INIT_POINTER(mm->exe_file, exe_file);
> +       if (exe_file)
> +               deny_write_access(exe_file);

Can we make a helper function for this, since it's done in two different places?

> -       if (new_exe_file)
> +       if (new_exe_file) {
>                 get_file(new_exe_file);
> +               /*
> +                * exec code is required to deny_write_access() successfully,
> +                * so this cannot fail
> +                */
> +               deny_write_access(new_exe_file);
> +       }
>         rcu_assign_pointer(mm->exe_file, new_exe_file);

And the above looks positively wrong. The comment is also nonsensical,
in that it basically says "we thought this cannot fail, so we'll just
rely on it".

If it truly cannot fail, then the comment should give the reason, not
the "we depend on this not failing".

And honestly, I don't see why it couldn't fail. And if it *does* fail,
we cannot then RCU-assign the exe_file pointer with this, because
you'll get a counter imbalance when you do the allow_write_access()
later.

Anyway, do_open_execat() does do deny_write_access() with proper error
checking. I think that is the existing reference that you depend on -
so that it doesn't fail. So the comment could possibly say that the
only caller has done this, but can we not just use the reference
deny_write_access() directly, and not do a new one here?

IOW, maybe there's an extraneous 'allow_write_access()' somewhere that
should be dropped when we do the whole binprm dance in execve()?

             Linus
David Hildenbrand Aug. 12, 2021, 7:38 p.m. UTC | #6
On 12.08.21 18:51, Linus Torvalds wrote:
> On Wed, Aug 11, 2021 at 10:45 PM David Hildenbrand <david@redhat.com> wrote:
>>
>>          /* No ordering required: file already has been exposed. */
>> -       RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
>> +       exe_file = get_mm_exe_file(oldmm);
>> +       RCU_INIT_POINTER(mm->exe_file, exe_file);
>> +       if (exe_file)
>> +               deny_write_access(exe_file);
> 
> Can we make a helper function for this, since it's done in two different places?

Sure, no compelling reason not to (except finding a suitable name, but 
I'll think about that tomorrow).

> 
>> -       if (new_exe_file)
>> +       if (new_exe_file) {
>>                  get_file(new_exe_file);
>> +               /*
>> +                * exec code is required to deny_write_access() successfully,
>> +                * so this cannot fail
>> +                */
>> +               deny_write_access(new_exe_file);
>> +       }
>>          rcu_assign_pointer(mm->exe_file, new_exe_file);
> 
> And the above looks positively wrong. The comment is also nonsensical,
> in that it basically says "we thought this cannot fail, so we'll just
> rely on it".

Well, it documents the expectation towards the caller, but in a 
suboptimal way, I agree.

> 
> If it truly cannot fail, then the comment should give the reason, not
> the "we depend on this not failing".

Right, "We depend on the caller already have done a deny_write_access() 
successfully first such that this call cannot fail." combined with

if (deny_write_access(new_exe_file))
	pr_warn("Unexpected failure of deny_write_access() in %s",
                  __func__);

suggestions welcome.

> 
> And honestly, I don't see why it couldn't fail. And if it *does* fail,
> we cannot then RCU-assign the exe_file pointer with this, because
> you'll get a counter imbalance when you do the allow_write_access()
> later.

Anyone calling set_mm_exe_file() (-> begin_new_exec()) is expected to 
successfully triggered a deny_write_access() upfront such that we won't 
fail at that point.

Further, on the dup_mmap() path we are sure the previous oldmm exe_file 
properly saw a successful deny_write_access() already, because that's 
now guaranteed for any exe_file.

> 
> Anyway, do_open_execat() does do deny_write_access() with proper error
> checking. I think that is the existing reference that you depend on -
> so that it doesn't fail. So the comment could possibly say that the
> only caller has done this, but can we not just use the reference
> deny_write_access() directly, and not do a new one here?

I think that might over-complicate the exec code where we would see a 
allow_write_access() on error paths, but not on success paths. This here 
looks cleaner to me, agreeing that the comment and the error check has 
to be improved.

We handle all allow_write_access()/deny_write_access() regarding 
exe_file completely in kernel/fork.c, which is IMHO quite nice.

> 
> IOW, maybe there's an extraneous 'allow_write_access()' somewhere that
> should be dropped when we do the whole binprm dance in execve()?

fs/exec.c: free_bprm() and exec_binprm() to be precise.

Thanks!
diff mbox series

Patch

diff --git a/kernel/fork.c b/kernel/fork.c
index 6bd2e52bcdfb..5d904878f19b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -476,6 +476,7 @@  static __latent_entropy int dup_mmap(struct mm_struct *mm,
 {
 	struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
 	struct rb_node **rb_link, *rb_parent;
+	struct file *exe_file;
 	int retval;
 	unsigned long charge;
 	LIST_HEAD(uf);
@@ -493,7 +494,10 @@  static __latent_entropy int dup_mmap(struct mm_struct *mm,
 	mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
 
 	/* No ordering required: file already has been exposed. */
-	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+	exe_file = get_mm_exe_file(oldmm);
+	RCU_INIT_POINTER(mm->exe_file, exe_file);
+	if (exe_file)
+		deny_write_access(exe_file);
 
 	mm->total_vm = oldmm->total_vm;
 	mm->data_vm = oldmm->data_vm;
@@ -638,8 +642,13 @@  static inline void mm_free_pgd(struct mm_struct *mm)
 #else
 static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 {
+	struct file *exe_file;
+
 	mmap_write_lock(oldmm);
-	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+	exe_file = get_mm_exe_file(oldmm);
+	RCU_INIT_POINTER(mm->exe_file, exe_file);
+	if (exe_file)
+		deny_write_access(exe_file);
 	mmap_write_unlock(oldmm);
 	return 0;
 }
@@ -1163,11 +1172,19 @@  void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
 	 */
 	old_exe_file = rcu_dereference_raw(mm->exe_file);
 
-	if (new_exe_file)
+	if (new_exe_file) {
 		get_file(new_exe_file);
+		/*
+		 * exec code is required to deny_write_access() successfully,
+		 * so this cannot fail
+		 */
+		deny_write_access(new_exe_file);
+	}
 	rcu_assign_pointer(mm->exe_file, new_exe_file);
-	if (old_exe_file)
+	if (old_exe_file) {
+		allow_write_access(old_exe_file);
 		fput(old_exe_file);
+	}
 }
 
 int atomic_set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
@@ -1194,10 +1211,22 @@  int atomic_set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
 	}
 
 	/* set the new file, lockless */
+	ret = deny_write_access(new_exe_file);
+	if (ret)
+		return -EACCES;
 	get_file(new_exe_file);
+
 	old_exe_file = xchg(&mm->exe_file, new_exe_file);
-	if (old_exe_file)
+	if (old_exe_file) {
+		/*
+		 * Don't race with dup_mmap() getting the file and disallowing
+		 * write access while someone might open the file writable.
+		 */
+		mmap_read_lock(mm);
+		allow_write_access(old_exe_file);
 		fput(old_exe_file);
+		mmap_read_unlock(mm);
+	}
 	return 0;
 }