
[v2,3/5] mm: madvise: implement lightweight guard page mechanism

Message ID fce49bbbfe41b82161a37b022c8eb1e6c20e1d85.1729440856.git.lorenzo.stoakes@oracle.com (mailing list archive)
State New
Series: implement lightweight guard pages

Commit Message

Lorenzo Stoakes Oct. 20, 2024, 4:20 p.m. UTC
Implement a new lightweight guard page feature, that is, regions of userland
virtual memory that, when accessed, cause a fatal signal to arise.

Currently users must establish PROT_NONE ranges to achieve this.

However, this is very costly memory-wise - we need a VMA for each and every
one of these regions AND they become unmergeable with surrounding VMAs.

In addition, repeated mmap() calls require repeated kernel context switches
and contention on the mmap lock to install these ranges, potentially also
having to unmap memory if installed over existing ranges.

The lightweight guard approach eliminates the VMA cost altogether - rather
than establishing a PROT_NONE VMA, it operates at the level of page table
entries - poisoning PTEs such that accesses to them cause a fault followed
by a SIGSEGV signal being raised.

This is achieved through the PTE marker mechanism, which a previous commit
in this series extended to permit this, with the markers installed via the
generic page walking logic, itself also extended by a prior commit for this
purpose.

These poison ranges are established with MADV_GUARD_POISON, and if the
range in which they are installed contains any existing mappings, those
mappings will be zapped, i.e. the range is freed and unmapped (thus
mimicking the behaviour of MADV_DONTNEED in this respect).

Any existing poison entries will be left untouched. There is no nesting of
poisoned pages.

Poisoned ranges are NOT cleared by MADV_DONTNEED, as this would be rather
unexpected behaviour, but are cleared on process teardown or unmapping of
memory ranges.

Ranges can have the poison property removed by MADV_GUARD_UNPOISON -
'remedying' the poisoning. The ranges over which this is applied, should
they contain non-poison entries, will be left untouched; only poison
entries will be cleared.

We permit this operation on anonymous memory only, and only on VMAs which are
non-special, non-huge and not mlock()'d (if we permitted the latter we'd have
to drop locked pages, which would be rather counterintuitive).
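
As a purely illustrative sketch (not part of this patch), userspace usage
might look roughly as follows, assuming the MADV_GUARD_POISON and
MADV_GUARD_UNPOISON values proposed by this series (102/103 in the uapi
headers) with a local fallback definition:

#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_GUARD_POISON
#define MADV_GUARD_POISON   102	/* values proposed by this series */
#define MADV_GUARD_UNPOISON 103
#endif

int main(void)
{
	long pgsz = sysconf(_SC_PAGESIZE);
	/* One anonymous mapping: guard page, usable page, guard page. */
	char *buf = mmap(NULL, 3 * pgsz, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	/* Poison the first and last pages - no additional VMAs result. */
	madvise(buf, pgsz, MADV_GUARD_POISON);
	madvise(buf + 2 * pgsz, pgsz, MADV_GUARD_POISON);

	buf[pgsz] = 'x';	/* middle page: behaves as usual */
	/* buf[0] = 'x';	   guard page: would raise SIGSEGV */

	/* Remove the guards; non-guard entries are left untouched. */
	madvise(buf, pgsz, MADV_GUARD_UNPOISON);
	madvise(buf + 2 * pgsz, pgsz, MADV_GUARD_UNPOISON);
	return 0;
}

On kernels without this series, these madvise() calls would simply fail
with EINVAL, as the advice values are unknown.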

Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Suggested-by: Jann Horn <jannh@google.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
 arch/alpha/include/uapi/asm/mman.h     |   3 +
 arch/mips/include/uapi/asm/mman.h      |   3 +
 arch/parisc/include/uapi/asm/mman.h    |   3 +
 arch/xtensa/include/uapi/asm/mman.h    |   3 +
 include/uapi/asm-generic/mman-common.h |   3 +
 mm/madvise.c                           | 168 +++++++++++++++++++++++++
 mm/mprotect.c                          |   3 +-
 mm/mseal.c                             |   1 +
 8 files changed, 186 insertions(+), 1 deletion(-)

Comments

David Hildenbrand Oct. 21, 2024, 5:05 p.m. UTC | #1
On 20.10.24 18:20, Lorenzo Stoakes wrote:
> Implement a new lightweight guard page feature, that is regions of userland
> virtual memory that, when accessed, cause a fatal signal to arise.
> 
> Currently users must establish PROT_NONE ranges to achieve this.
> 
> However this is very costly memory-wise - we need a VMA for each and every
> one of these regions AND they become unmergeable with surrounding VMAs.
> 
> In addition repeated mmap() calls require repeated kernel context switches
> and contention of the mmap lock to install these ranges, potentially also
> having to unmap memory if installed over existing ranges.
> 
> The lightweight guard approach eliminates the VMA cost altogether - rather
> than establishing a PROT_NONE VMA, it operates at the level of page table
> entries - poisoning PTEs such that accesses to them cause a fault followed
> by a SIGSGEV signal being raised.
> 
> This is achieved through the PTE marker mechanism, which a previous commit
> in this series extended to permit this to be done, installed via the
> generic page walking logic, also extended by a prior commit for this
> purpose.
> 
> These poison ranges are established with MADV_GUARD_POISON, and if the
> range in which they are installed contain any existing mappings, they will
> be zapped, i.e. free the range and unmap memory (thus mimicking the
> behaviour of MADV_DONTNEED in this respect).
> 
> Any existing poison entries will be left untouched. There is no nesting of
> poisoned pages.
> 
> Poisoned ranges are NOT cleared by MADV_DONTNEED, as this would be rather
> unexpected behaviour, but are cleared on process teardown or unmapping of
> memory ranges.
> 
> Ranges can have the poison property removed by MADV_GUARD_UNPOISON -
> 'remedying' the poisoning. The ranges over which this is applied, should
> they contain non-poison entries, will be untouched, only poison entries
> will be cleared.
> 
> We permit this operation on anonymous memory only, and only VMAs which are
> non-special, non-huge and not mlock()'d (if we permitted this we'd have to
> drop locked pages which would be rather counterintuitive).
> 
> Suggested-by: Vlastimil Babka <vbabka@suse.cz>
> Suggested-by: Jann Horn <jannh@google.com>
> Suggested-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
>   arch/alpha/include/uapi/asm/mman.h     |   3 +
>   arch/mips/include/uapi/asm/mman.h      |   3 +
>   arch/parisc/include/uapi/asm/mman.h    |   3 +
>   arch/xtensa/include/uapi/asm/mman.h    |   3 +
>   include/uapi/asm-generic/mman-common.h |   3 +
>   mm/madvise.c                           | 168 +++++++++++++++++++++++++
>   mm/mprotect.c                          |   3 +-
>   mm/mseal.c                             |   1 +
>   8 files changed, 186 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
> index 763929e814e9..71e13f27742d 100644
> --- a/arch/alpha/include/uapi/asm/mman.h
> +++ b/arch/alpha/include/uapi/asm/mman.h
> @@ -78,6 +78,9 @@
>   
>   #define MADV_COLLAPSE	25		/* Synchronous hugepage collapse */
>   
> +#define MADV_GUARD_POISON 102		/* fatal signal on access to range */
> +#define MADV_GUARD_UNPOISON 103		/* revoke guard poisoning */

Just to raise it here: MADV_GUARD_INSTALL / MADV_GUARD_REMOVE or sth. 
like that would have been even clearer, at least to me.

But no strong opinion, just if somebody else reading along was wondering 
about the same.


I'm hoping to find more time to have a closer look at this this week, 
but in general, the concept sounds reasonable to me.
Lorenzo Stoakes Oct. 21, 2024, 5:15 p.m. UTC | #2
On Mon, Oct 21, 2024 at 07:05:27PM +0200, David Hildenbrand wrote:
> On 20.10.24 18:20, Lorenzo Stoakes wrote:
> > Implement a new lightweight guard page feature, that is regions of userland
> > virtual memory that, when accessed, cause a fatal signal to arise.
> >
> > Currently users must establish PROT_NONE ranges to achieve this.
> >
> > However this is very costly memory-wise - we need a VMA for each and every
> > one of these regions AND they become unmergeable with surrounding VMAs.
> >
> > In addition repeated mmap() calls require repeated kernel context switches
> > and contention of the mmap lock to install these ranges, potentially also
> > having to unmap memory if installed over existing ranges.
> >
> > The lightweight guard approach eliminates the VMA cost altogether - rather
> > than establishing a PROT_NONE VMA, it operates at the level of page table
> > entries - poisoning PTEs such that accesses to them cause a fault followed
> > by a SIGSGEV signal being raised.
> >
> > This is achieved through the PTE marker mechanism, which a previous commit
> > in this series extended to permit this to be done, installed via the
> > generic page walking logic, also extended by a prior commit for this
> > purpose.
> >
> > These poison ranges are established with MADV_GUARD_POISON, and if the
> > range in which they are installed contain any existing mappings, they will
> > be zapped, i.e. free the range and unmap memory (thus mimicking the
> > behaviour of MADV_DONTNEED in this respect).
> >
> > Any existing poison entries will be left untouched. There is no nesting of
> > poisoned pages.
> >
> > Poisoned ranges are NOT cleared by MADV_DONTNEED, as this would be rather
> > unexpected behaviour, but are cleared on process teardown or unmapping of
> > memory ranges.
> >
> > Ranges can have the poison property removed by MADV_GUARD_UNPOISON -
> > 'remedying' the poisoning. The ranges over which this is applied, should
> > they contain non-poison entries, will be untouched, only poison entries
> > will be cleared.
> >
> > We permit this operation on anonymous memory only, and only VMAs which are
> > non-special, non-huge and not mlock()'d (if we permitted this we'd have to
> > drop locked pages which would be rather counterintuitive).
> >
> > Suggested-by: Vlastimil Babka <vbabka@suse.cz>
> > Suggested-by: Jann Horn <jannh@google.com>
> > Suggested-by: David Hildenbrand <david@redhat.com>
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > ---
> >   arch/alpha/include/uapi/asm/mman.h     |   3 +
> >   arch/mips/include/uapi/asm/mman.h      |   3 +
> >   arch/parisc/include/uapi/asm/mman.h    |   3 +
> >   arch/xtensa/include/uapi/asm/mman.h    |   3 +
> >   include/uapi/asm-generic/mman-common.h |   3 +
> >   mm/madvise.c                           | 168 +++++++++++++++++++++++++
> >   mm/mprotect.c                          |   3 +-
> >   mm/mseal.c                             |   1 +
> >   8 files changed, 186 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
> > index 763929e814e9..71e13f27742d 100644
> > --- a/arch/alpha/include/uapi/asm/mman.h
> > +++ b/arch/alpha/include/uapi/asm/mman.h
> > @@ -78,6 +78,9 @@
> >   #define MADV_COLLAPSE	25		/* Synchronous hugepage collapse */
> > +#define MADV_GUARD_POISON 102		/* fatal signal on access to range */
> > +#define MADV_GUARD_UNPOISON 103		/* revoke guard poisoning */
>
> Just to raise it here: MADV_GUARD_INSTALL / MADV_GUARD_REMOVE or sth. like
> that would have been even clearer, at least to me.

:)

It still feels like poisoning to me because we're explicitly putting
something in the page tables to make a range have different fault behaviour
like a HW poisoning, and 'installing' suggests backing or something like
this, I think that's more confusing.

>
> But no strong opinion, just if somebody else reading along was wondering
> about the same.
>
>
> I'm hoping to find more time to have a closer look at this this week, but in
> general, the concept sounds reasonable to me.

Thanks!

>
> --
> Cheers,
>
> David / dhildenb
>
David Hildenbrand Oct. 21, 2024, 5:23 p.m. UTC | #3
On 21.10.24 19:15, Lorenzo Stoakes wrote:
> On Mon, Oct 21, 2024 at 07:05:27PM +0200, David Hildenbrand wrote:
>> On 20.10.24 18:20, Lorenzo Stoakes wrote:
>>> Implement a new lightweight guard page feature, that is regions of userland
>>> virtual memory that, when accessed, cause a fatal signal to arise.
>>>
>>> Currently users must establish PROT_NONE ranges to achieve this.
>>>
>>> However this is very costly memory-wise - we need a VMA for each and every
>>> one of these regions AND they become unmergeable with surrounding VMAs.
>>>
>>> In addition repeated mmap() calls require repeated kernel context switches
>>> and contention of the mmap lock to install these ranges, potentially also
>>> having to unmap memory if installed over existing ranges.
>>>
>>> The lightweight guard approach eliminates the VMA cost altogether - rather
>>> than establishing a PROT_NONE VMA, it operates at the level of page table
>>> entries - poisoning PTEs such that accesses to them cause a fault followed
>>> by a SIGSGEV signal being raised.
>>>
>>> This is achieved through the PTE marker mechanism, which a previous commit
>>> in this series extended to permit this to be done, installed via the
>>> generic page walking logic, also extended by a prior commit for this
>>> purpose.
>>>
>>> These poison ranges are established with MADV_GUARD_POISON, and if the
>>> range in which they are installed contain any existing mappings, they will
>>> be zapped, i.e. free the range and unmap memory (thus mimicking the
>>> behaviour of MADV_DONTNEED in this respect).
>>>
>>> Any existing poison entries will be left untouched. There is no nesting of
>>> poisoned pages.
>>>
>>> Poisoned ranges are NOT cleared by MADV_DONTNEED, as this would be rather
>>> unexpected behaviour, but are cleared on process teardown or unmapping of
>>> memory ranges.
>>>
>>> Ranges can have the poison property removed by MADV_GUARD_UNPOISON -
>>> 'remedying' the poisoning. The ranges over which this is applied, should
>>> they contain non-poison entries, will be untouched, only poison entries
>>> will be cleared.
>>>
>>> We permit this operation on anonymous memory only, and only VMAs which are
>>> non-special, non-huge and not mlock()'d (if we permitted this we'd have to
>>> drop locked pages which would be rather counterintuitive).
>>>
>>> Suggested-by: Vlastimil Babka <vbabka@suse.cz>
>>> Suggested-by: Jann Horn <jannh@google.com>
>>> Suggested-by: David Hildenbrand <david@redhat.com>
>>> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>>> ---
>>>    arch/alpha/include/uapi/asm/mman.h     |   3 +
>>>    arch/mips/include/uapi/asm/mman.h      |   3 +
>>>    arch/parisc/include/uapi/asm/mman.h    |   3 +
>>>    arch/xtensa/include/uapi/asm/mman.h    |   3 +
>>>    include/uapi/asm-generic/mman-common.h |   3 +
>>>    mm/madvise.c                           | 168 +++++++++++++++++++++++++
>>>    mm/mprotect.c                          |   3 +-
>>>    mm/mseal.c                             |   1 +
>>>    8 files changed, 186 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
>>> index 763929e814e9..71e13f27742d 100644
>>> --- a/arch/alpha/include/uapi/asm/mman.h
>>> +++ b/arch/alpha/include/uapi/asm/mman.h
>>> @@ -78,6 +78,9 @@
>>>    #define MADV_COLLAPSE	25		/* Synchronous hugepage collapse */
>>> +#define MADV_GUARD_POISON 102		/* fatal signal on access to range */
>>> +#define MADV_GUARD_UNPOISON 103		/* revoke guard poisoning */
>>
>> Just to raise it here: MADV_GUARD_INSTALL / MADV_GUARD_REMOVE or sth. like
>> that would have been even clearer, at least to me.
> 
> :)
> 
> It still feels like poisoning to me because we're explicitly putting
> something in the page tables to make a range have different fault behaviour
> like a HW poisoning, and 'installing' suggests backing or something like
> this, I think that's more confusing.

I connect "poison" to "SIGBUS" and "corrupt memory state", not to "there 
is nothing and there must not be anything". Thus my thinking. But again, 
not the end of the world, just wanted to raise it ...

> 
>>
>> But no strong opinion, just if somebody else reading along was wondering
>> about the same.
>>
>>
>> I'm hoping to find more time to have a closer look at this this week, but in
>> general, the concept sounds reasonable to me.
> 
> Thanks!

Thank you for implementing this and up-streaming it!
John Hubbard Oct. 21, 2024, 7:25 p.m. UTC | #4
On 10/21/24 10:23 AM, David Hildenbrand wrote:
> On 21.10.24 19:15, Lorenzo Stoakes wrote:
>> On Mon, Oct 21, 2024 at 07:05:27PM +0200, David Hildenbrand wrote:
>>> On 20.10.24 18:20, Lorenzo Stoakes wrote:
>>>> Implement a new lightweight guard page feature, that is regions of userland
>>>> virtual memory that, when accessed, cause a fatal signal to arise.
>>>>
>>>> Currently users must establish PROT_NONE ranges to achieve this.
>>>>
>>>> However this is very costly memory-wise - we need a VMA for each and every
>>>> one of these regions AND they become unmergeable with surrounding VMAs.
>>>>
>>>> In addition repeated mmap() calls require repeated kernel context switches
>>>> and contention of the mmap lock to install these ranges, potentially also
>>>> having to unmap memory if installed over existing ranges.
>>>>
>>>> The lightweight guard approach eliminates the VMA cost altogether - rather
>>>> than establishing a PROT_NONE VMA, it operates at the level of page table
>>>> entries - poisoning PTEs such that accesses to them cause a fault followed
>>>> by a SIGSGEV signal being raised.
>>>>
>>>> This is achieved through the PTE marker mechanism, which a previous commit
>>>> in this series extended to permit this to be done, installed via the
>>>> generic page walking logic, also extended by a prior commit for this
>>>> purpose.
>>>>
>>>> These poison ranges are established with MADV_GUARD_POISON, and if the
>>>> range in which they are installed contain any existing mappings, they will
>>>> be zapped, i.e. free the range and unmap memory (thus mimicking the
>>>> behaviour of MADV_DONTNEED in this respect).
>>>>
>>>> Any existing poison entries will be left untouched. There is no nesting of
>>>> poisoned pages.
>>>>
>>>> Poisoned ranges are NOT cleared by MADV_DONTNEED, as this would be rather
>>>> unexpected behaviour, but are cleared on process teardown or unmapping of
>>>> memory ranges.
>>>>
>>>> Ranges can have the poison property removed by MADV_GUARD_UNPOISON -
>>>> 'remedying' the poisoning. The ranges over which this is applied, should
>>>> they contain non-poison entries, will be untouched, only poison entries
>>>> will be cleared.
>>>>
>>>> We permit this operation on anonymous memory only, and only VMAs which are
>>>> non-special, non-huge and not mlock()'d (if we permitted this we'd have to
>>>> drop locked pages which would be rather counterintuitive).
>>>>
>>>> Suggested-by: Vlastimil Babka <vbabka@suse.cz>
>>>> Suggested-by: Jann Horn <jannh@google.com>
>>>> Suggested-by: David Hildenbrand <david@redhat.com>
>>>> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>>>> ---
>>>>    arch/alpha/include/uapi/asm/mman.h     |   3 +
>>>>    arch/mips/include/uapi/asm/mman.h      |   3 +
>>>>    arch/parisc/include/uapi/asm/mman.h    |   3 +
>>>>    arch/xtensa/include/uapi/asm/mman.h    |   3 +
>>>>    include/uapi/asm-generic/mman-common.h |   3 +
>>>>    mm/madvise.c                           | 168 +++++++++++++++++++++++++
>>>>    mm/mprotect.c                          |   3 +-
>>>>    mm/mseal.c                             |   1 +
>>>>    8 files changed, 186 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
>>>> index 763929e814e9..71e13f27742d 100644
>>>> --- a/arch/alpha/include/uapi/asm/mman.h
>>>> +++ b/arch/alpha/include/uapi/asm/mman.h
>>>> @@ -78,6 +78,9 @@
>>>>    #define MADV_COLLAPSE    25        /* Synchronous hugepage collapse */
>>>> +#define MADV_GUARD_POISON 102        /* fatal signal on access to range */
>>>> +#define MADV_GUARD_UNPOISON 103        /* revoke guard poisoning */
>>>
>>> Just to raise it here: MADV_GUARD_INSTALL / MADV_GUARD_REMOVE or sth. like
>>> that would have been even clearer, at least to me.

Yes, I think so.

>>
>> :)
>>
>> It still feels like poisoning to me because we're explicitly putting
>> something in the page tables to make a range have different fault behaviour
>> like a HW poisoning, and 'installing' suggests backing or something like
>> this, I think that's more confusing.
> 
> I connect "poison" to "SIGBUS" and "corrupt memory state", not to "there is nothing and there must not be anything". Thus my thinking. But again, not the end of the world, just wanted to raise it ...

"Poison" is used so far for fairly distinct things, and I'd very much like
to avoid extending its meaning to guard pages. It makes the other things
less unique, and it misses a naming and classification opportunity.

"Guard" and "guard page" are fairly unique names. That's valuable.


thanks,
Lorenzo Stoakes Oct. 21, 2024, 7:39 p.m. UTC | #5
On Mon, Oct 21, 2024 at 12:25:08PM -0700, John Hubbard wrote:
> On 10/21/24 10:23 AM, David Hildenbrand wrote:
[snip]
> > > > Just to raise it here: MADV_GUARD_INSTALL / MADV_GUARD_REMOVE or sth. like
> > > > that would have been even clearer, at least to me.
>
> Yes, I think so.
>
> > >
> > > :)
> > >
> > > It still feels like poisoning to me because we're explicitly putting
> > > something in the page tables to make a range have different fault behaviour
> > > like a HW poisoning, and 'installing' suggests backing or something like
> > > this, I think that's more confusing.
> >
> > I connect "poison" to "SIGBUS" and "corrupt memory state", not to "there is nothing and there must not be anything". Thus my thinking. But again, not the end of the world, just wanted to raise it ...
>
> "Poison" is used so far for fairly distinct things, and I'd very much like
> to avoid extending its meaning to guard pages. It makes the other things
> less unique, and it misses a naming and classification opportunity.
>
> "Guard" and "guard page" are fairly unique names. That's valuable.
>
>
> thanks,
> --
> John Hubbard
>

Guys you're breaking my heart... Will you not leave me with even a remnant
of a cultural reference?? [0]

[0]: https://www.youtube.com/watch?v=_mej5wS7viw
Vlastimil Babka Oct. 21, 2024, 8:11 p.m. UTC | #6
On 10/20/24 18:20, Lorenzo Stoakes wrote:
> Implement a new lightweight guard page feature, that is regions of userland
> virtual memory that, when accessed, cause a fatal signal to arise.
> 
> Currently users must establish PROT_NONE ranges to achieve this.
> 
> However this is very costly memory-wise - we need a VMA for each and every
> one of these regions AND they become unmergeable with surrounding VMAs.
> 
> In addition repeated mmap() calls require repeated kernel context switches
> and contention of the mmap lock to install these ranges, potentially also
> having to unmap memory if installed over existing ranges.
> 
> The lightweight guard approach eliminates the VMA cost altogether - rather
> than establishing a PROT_NONE VMA, it operates at the level of page table
> entries - poisoning PTEs such that accesses to them cause a fault followed
> by a SIGSGEV signal being raised.
> 
> This is achieved through the PTE marker mechanism, which a previous commit
> in this series extended to permit this to be done, installed via the
> generic page walking logic, also extended by a prior commit for this
> purpose.
> 
> These poison ranges are established with MADV_GUARD_POISON, and if the
> range in which they are installed contain any existing mappings, they will
> be zapped, i.e. free the range and unmap memory (thus mimicking the
> behaviour of MADV_DONTNEED in this respect).
> 
> Any existing poison entries will be left untouched. There is no nesting of
> poisoned pages.
> 
> Poisoned ranges are NOT cleared by MADV_DONTNEED, as this would be rather
> unexpected behaviour, but are cleared on process teardown or unmapping of
> memory ranges.
> 
> Ranges can have the poison property removed by MADV_GUARD_UNPOISON -
> 'remedying' the poisoning. The ranges over which this is applied, should
> they contain non-poison entries, will be untouched, only poison entries
> will be cleared.
> 
> We permit this operation on anonymous memory only, and only VMAs which are
> non-special, non-huge and not mlock()'d (if we permitted this we'd have to
> drop locked pages which would be rather counterintuitive).
> 
> Suggested-by: Vlastimil Babka <vbabka@suse.cz>
> Suggested-by: Jann Horn <jannh@google.com>
> Suggested-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>

<snip>

> +static long madvise_guard_poison(struct vm_area_struct *vma,
> +				 struct vm_area_struct **prev,
> +				 unsigned long start, unsigned long end)
> +{
> +	long err;
> +
> +	*prev = vma;
> +	if (!is_valid_guard_vma(vma, /* allow_locked = */false))
> +		return -EINVAL;
> +
> +	/*
> +	 * If we install poison markers, then the range is no longer
> +	 * empty from a page table perspective and therefore it's
> +	 * appropriate to have an anon_vma.
> +	 *
> +	 * This ensures that on fork, we copy page tables correctly.
> +	 */
> +	err = anon_vma_prepare(vma);
> +	if (err)
> +		return err;
> +
> +	/*
> +	 * Optimistically try to install the guard poison pages first. If any
> +	 * non-guard pages are encountered, give up and zap the range before
> +	 * trying again.
> +	 */

Should the page walker become powerful enough to handle this in one go? :)
But sure, if it's too big a task to teach it to zap ptes with all the tlb
flushing etc (I assume it's something page walkers don't do today), it makes
sense to do it this way.
Or we could require userspace to zap first (MADV_DONTNEED), but that would
unnecessarily mean extra syscalls for the use case of an allocator debug
mode that wants to turn freed memory to guards to catch use after free.
So this seems like a good compromise...
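
(Purely as an illustration of that allocator debug-mode idea - hypothetical
helper names, assuming page-aligned chunks and the MADV_GUARD_* values from
this series - the pattern might look like the sketch below.)

#include <stddef.h>
#include <sys/mman.h>

#ifndef MADV_GUARD_POISON
#define MADV_GUARD_POISON   102	/* values proposed by this series */
#define MADV_GUARD_UNPOISON 103
#endif

/* Hypothetical allocator debug hooks; ptr/len assumed page-aligned. */
static void debug_quarantine(void *ptr, size_t len)
{
	/*
	 * On free, turn the chunk into guard pages rather than recycling
	 * it: any use-after-free access now raises a fatal signal. No
	 * prior MADV_DONTNEED is needed - MADV_GUARD_POISON zaps any
	 * existing mappings in the range itself.
	 */
	madvise(ptr, len, MADV_GUARD_POISON);
}

static void debug_unquarantine(void *ptr, size_t len)
{
	/* Lift the guards again before handing the chunk back out. */
	madvise(ptr, len, MADV_GUARD_UNPOISON);
}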

> +	while (true) {
> +		/* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
> +		err = walk_page_range_mm(vma->vm_mm, start, end,
> +					 &guard_poison_walk_ops, NULL);
> +		if (err <= 0)
> +			return err;
> +
> +		/*
> +		 * OK some of the range have non-guard pages mapped, zap
> +		 * them. This leaves existing guard pages in place.
> +		 */
> +		zap_page_range_single(vma, start, end - start, NULL);

... however the potentially endless loop doesn't seem great. Could a
malicious program keep refaulting the range (ignoring any segfaults if it
loses a race) with one thread while failing to make progress here with
another thread? Is that ok because it would only punish itself?

I mean if we have to retry the guard page installation more than once, it
means the program *is* racing faults with installing guard ptes in the same
range, right? So it would be right to segfault it. But I guess when we
detect it here, we have no way to send the signal to the right thread and it
would be too late? So unless we can do the PTE zap+install marker
atomically, maybe there's no better way and this is acceptable as a
malicious program can harm only itself?

Maybe it would be just simpler to install the marker via zap_details rather
than the pagewalk?

> +
> +		if (fatal_signal_pending(current))
> +			return -EINTR;
> +		cond_resched();
> +	}
> +}
> +
> +static int guard_unpoison_pte_entry(pte_t *pte, unsigned long addr,
> +				    unsigned long next, struct mm_walk *walk)
> +{
> +	pte_t ptent = ptep_get(pte);
> +
> +	if (is_guard_pte_marker(ptent)) {
> +		/* Simply clear the PTE marker. */
> +		pte_clear_not_present_full(walk->mm, addr, pte, false);
> +		update_mmu_cache(walk->vma, addr, pte);
> +	}
> +
> +	return 0;
> +}
> +
> +static const struct mm_walk_ops guard_unpoison_walk_ops = {
> +	.pte_entry		= guard_unpoison_pte_entry,
> +	.walk_lock		= PGWALK_RDLOCK,
> +};
> +
> +static long madvise_guard_unpoison(struct vm_area_struct *vma,
> +				   struct vm_area_struct **prev,
> +				   unsigned long start, unsigned long end)
> +{
> +	*prev = vma;
> +	/*
> +	 * We're ok with unpoisoning mlock()'d ranges, as this is a
> +	 * non-destructive action.
> +	 */
> +	if (!is_valid_guard_vma(vma, /* allow_locked = */true))
> +		return -EINVAL;
> +
> +	return walk_page_range(vma->vm_mm, start, end,
> +			       &guard_unpoison_walk_ops, NULL);
> +}
> +
>  /*
>   * Apply an madvise behavior to a region of a vma.  madvise_update_vma
>   * will handle splitting a vm area into separate areas, each area with its own
> @@ -1098,6 +1260,10 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
>  		break;
>  	case MADV_COLLAPSE:
>  		return madvise_collapse(vma, prev, start, end);
> +	case MADV_GUARD_POISON:
> +		return madvise_guard_poison(vma, prev, start, end);
> +	case MADV_GUARD_UNPOISON:
> +		return madvise_guard_unpoison(vma, prev, start, end);
>  	}
>  
>  	anon_name = anon_vma_name(vma);
> @@ -1197,6 +1363,8 @@ madvise_behavior_valid(int behavior)
>  	case MADV_DODUMP:
>  	case MADV_WIPEONFORK:
>  	case MADV_KEEPONFORK:
> +	case MADV_GUARD_POISON:
> +	case MADV_GUARD_UNPOISON:
>  #ifdef CONFIG_MEMORY_FAILURE
>  	case MADV_SOFT_OFFLINE:
>  	case MADV_HWPOISON:
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 0c5d6d06107d..d0e3ebfadef8 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -236,7 +236,8 @@ static long change_pte_range(struct mmu_gather *tlb,
>  			} else if (is_pte_marker_entry(entry)) {
>  				/*
>  				 * Ignore error swap entries unconditionally,
> -				 * because any access should sigbus anyway.
> +				 * because any access should sigbus/sigsegv
> +				 * anyway.
>  				 */
>  				if (is_poisoned_swp_entry(entry))
>  					continue;
> diff --git a/mm/mseal.c b/mm/mseal.c
> index ece977bd21e1..21bf5534bcf5 100644
> --- a/mm/mseal.c
> +++ b/mm/mseal.c
> @@ -30,6 +30,7 @@ static bool is_madv_discard(int behavior)
>  	case MADV_REMOVE:
>  	case MADV_DONTFORK:
>  	case MADV_WIPEONFORK:
> +	case MADV_GUARD_POISON:
>  		return true;
>  	}
>
David Hildenbrand Oct. 21, 2024, 8:17 p.m. UTC | #7
On 21.10.24 22:11, Vlastimil Babka wrote:
> On 10/20/24 18:20, Lorenzo Stoakes wrote:
>> Implement a new lightweight guard page feature, that is regions of userland
>> virtual memory that, when accessed, cause a fatal signal to arise.
>>
>> Currently users must establish PROT_NONE ranges to achieve this.
>>
>> However this is very costly memory-wise - we need a VMA for each and every
>> one of these regions AND they become unmergeable with surrounding VMAs.
>>
>> In addition repeated mmap() calls require repeated kernel context switches
>> and contention of the mmap lock to install these ranges, potentially also
>> having to unmap memory if installed over existing ranges.
>>
>> The lightweight guard approach eliminates the VMA cost altogether - rather
>> than establishing a PROT_NONE VMA, it operates at the level of page table
>> entries - poisoning PTEs such that accesses to them cause a fault followed
>> by a SIGSGEV signal being raised.
>>
>> This is achieved through the PTE marker mechanism, which a previous commit
>> in this series extended to permit this to be done, installed via the
>> generic page walking logic, also extended by a prior commit for this
>> purpose.
>>
>> These poison ranges are established with MADV_GUARD_POISON, and if the
>> range in which they are installed contain any existing mappings, they will
>> be zapped, i.e. free the range and unmap memory (thus mimicking the
>> behaviour of MADV_DONTNEED in this respect).
>>
>> Any existing poison entries will be left untouched. There is no nesting of
>> poisoned pages.
>>
>> Poisoned ranges are NOT cleared by MADV_DONTNEED, as this would be rather
>> unexpected behaviour, but are cleared on process teardown or unmapping of
>> memory ranges.
>>
>> Ranges can have the poison property removed by MADV_GUARD_UNPOISON -
>> 'remedying' the poisoning. The ranges over which this is applied, should
>> they contain non-poison entries, will be untouched, only poison entries
>> will be cleared.
>>
>> We permit this operation on anonymous memory only, and only VMAs which are
>> non-special, non-huge and not mlock()'d (if we permitted this we'd have to
>> drop locked pages which would be rather counterintuitive).
>>
>> Suggested-by: Vlastimil Babka <vbabka@suse.cz>
>> Suggested-by: Jann Horn <jannh@google.com>
>> Suggested-by: David Hildenbrand <david@redhat.com>
>> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> 
> <snip>
> 
>> +static long madvise_guard_poison(struct vm_area_struct *vma,
>> +				 struct vm_area_struct **prev,
>> +				 unsigned long start, unsigned long end)
>> +{
>> +	long err;
>> +
>> +	*prev = vma;
>> +	if (!is_valid_guard_vma(vma, /* allow_locked = */false))
>> +		return -EINVAL;
>> +
>> +	/*
>> +	 * If we install poison markers, then the range is no longer
>> +	 * empty from a page table perspective and therefore it's
>> +	 * appropriate to have an anon_vma.
>> +	 *
>> +	 * This ensures that on fork, we copy page tables correctly.
>> +	 */
>> +	err = anon_vma_prepare(vma);
>> +	if (err)
>> +		return err;
>> +
>> +	/*
>> +	 * Optimistically try to install the guard poison pages first. If any
>> +	 * non-guard pages are encountered, give up and zap the range before
>> +	 * trying again.
>> +	 */
> 
> Should the page walker become powerful enough to handle this in one go? :)
> But sure, if it's too big a task to teach it to zap ptes with all the tlb
> flushing etc (I assume it's something page walkers don't do today), it makes
> sense to do it this way.
> Or we could require userspace to zap first (MADV_DONTNEED), but that would
> unnecessarily mean extra syscalls for the use case of an allocator debug
> mode that wants to turn freed memory to guards to catch use after free.
> So this seems like a good compromise...

Yes please, KIS. We can always implement support for that later if 
really required (leave behavior open when documenting).
David Hildenbrand Oct. 21, 2024, 8:18 p.m. UTC | #8
On 21.10.24 21:39, Lorenzo Stoakes wrote:
> On Mon, Oct 21, 2024 at 12:25:08PM -0700, John Hubbard wrote:
>> On 10/21/24 10:23 AM, David Hildenbrand wrote:
> [snip]
>>>>> Just to raise it here: MADV_GUARD_INSTALL / MADV_GUARD_REMOVE or sth. like
>>>>> that would have been even clearer, at least to me.
>>
>> Yes, I think so.
>>
>>>>
>>>> :)
>>>>
>>>> It still feels like poisoning to me because we're explicitly putting
>>>> something in the page tables to make a range have different fault behaviour
>>>> like a HW poisoning, and 'installing' suggests backing or something like
>>>> this, I think that's more confusing.
>>>
>>> I connect "poison" to "SIGBUS" and "corrupt memory state", not to "there is nothing and there must not be anything". Thus my thinking. But again, not the end of the world, just wanted to raise it ...
>>
>> "Poison" is used so far for fairly distinct things, and I'd very much like
>> to avoid extending its meaning to guard pages. It makes the other things
>> less unique, and it misses a naming and classification opportunity.
>>
>> "Guard" and "guard page" are fairly unique names. That's valuable.
>>
>>
>> thanks,
>> --
>> John Hubbard
>>
> 
> Guys you're breaking my heart... Will you not leave me with even a remnant
> of a cultural reference?? [0]

I'm sure you will recover from that loss :)
Vlastimil Babka Oct. 21, 2024, 8:25 p.m. UTC | #9
On 10/21/24 22:17, David Hildenbrand wrote:
> On 21.10.24 22:11, Vlastimil Babka wrote:
>> On 10/20/24 18:20, Lorenzo Stoakes wrote:
>> 
>> <snip>
>> 
>>> +static long madvise_guard_poison(struct vm_area_struct *vma,
>>> +				 struct vm_area_struct **prev,
>>> +				 unsigned long start, unsigned long end)
>>> +{
>>> +	long err;
>>> +
>>> +	*prev = vma;
>>> +	if (!is_valid_guard_vma(vma, /* allow_locked = */false))
>>> +		return -EINVAL;
>>> +
>>> +	/*
>>> +	 * If we install poison markers, then the range is no longer
>>> +	 * empty from a page table perspective and therefore it's
>>> +	 * appropriate to have an anon_vma.
>>> +	 *
>>> +	 * This ensures that on fork, we copy page tables correctly.
>>> +	 */
>>> +	err = anon_vma_prepare(vma);
>>> +	if (err)
>>> +		return err;
>>> +
>>> +	/*
>>> +	 * Optimistically try to install the guard poison pages first. If any
>>> +	 * non-guard pages are encountered, give up and zap the range before
>>> +	 * trying again.
>>> +	 */
>> 
>> Should the page walker become powerful enough to handle this in one go? :)
>> But sure, if it's too big a task to teach it to zap ptes with all the tlb
>> flushing etc (I assume it's something page walkers don't do today), it makes
>> sense to do it this way.
>> Or we could require userspace to zap first (MADV_DONTNEED), but that would
>> unnecessarily mean extra syscalls for the use case of an allocator debug
>> mode that wants to turn freed memory to guards to catch use after free.
>> So this seems like a good compromise...
> 
> Yes please, KIS.

You mean "require userspace to zap first (MADV_DONTNEED)" ?

I'd normally agree with the KIS principle, but..

> We can always implement support for that later if 

it would either mean later we change behavior (installing guards on
non-zapped PTEs would have to be an error now but maybe start working later,
which is user observable change thus can break somebody)

> really required (leave behavior open when documenting).

and leaving it open when documenting doesn't really mean anything for the
"we don't break userspace" promise vs what the implementation actually does.

Or the changed behavior would need to come with a new MADVISE mode. Not
appealing as it's a mess already.

So since its uapi we should aim for the best from the start.
Lorenzo Stoakes Oct. 21, 2024, 8:27 p.m. UTC | #10
On Mon, Oct 21, 2024 at 10:11:29PM +0200, Vlastimil Babka wrote:
> On 10/20/24 18:20, Lorenzo Stoakes wrote:
> > Implement a new lightweight guard page feature, that is regions of userland
> > virtual memory that, when accessed, cause a fatal signal to arise.
> >
> > Currently users must establish PROT_NONE ranges to achieve this.
> >
> > However this is very costly memory-wise - we need a VMA for each and every
> > one of these regions AND they become unmergeable with surrounding VMAs.
> >
> > In addition repeated mmap() calls require repeated kernel context switches
> > and contention of the mmap lock to install these ranges, potentially also
> > having to unmap memory if installed over existing ranges.
> >
> > The lightweight guard approach eliminates the VMA cost altogether - rather
> > than establishing a PROT_NONE VMA, it operates at the level of page table
> > entries - poisoning PTEs such that accesses to them cause a fault followed
> > by a SIGSGEV signal being raised.
> >
> > This is achieved through the PTE marker mechanism, which a previous commit
> > in this series extended to permit this to be done, installed via the
> > generic page walking logic, also extended by a prior commit for this
> > purpose.
> >
> > These poison ranges are established with MADV_GUARD_POISON, and if the
> > range in which they are installed contain any existing mappings, they will
> > be zapped, i.e. free the range and unmap memory (thus mimicking the
> > behaviour of MADV_DONTNEED in this respect).
> >
> > Any existing poison entries will be left untouched. There is no nesting of
> > poisoned pages.
> >
> > Poisoned ranges are NOT cleared by MADV_DONTNEED, as this would be rather
> > unexpected behaviour, but are cleared on process teardown or unmapping of
> > memory ranges.
> >
> > Ranges can have the poison property removed by MADV_GUARD_UNPOISON -
> > 'remedying' the poisoning. The ranges over which this is applied, should
> > they contain non-poison entries, will be untouched, only poison entries
> > will be cleared.
> >
> > We permit this operation on anonymous memory only, and only VMAs which are
> > non-special, non-huge and not mlock()'d (if we permitted this we'd have to
> > drop locked pages which would be rather counterintuitive).
> >
> > Suggested-by: Vlastimil Babka <vbabka@suse.cz>
> > Suggested-by: Jann Horn <jannh@google.com>
> > Suggested-by: David Hildenbrand <david@redhat.com>
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>
> <snip>
>
> > +static long madvise_guard_poison(struct vm_area_struct *vma,
> > +				 struct vm_area_struct **prev,
> > +				 unsigned long start, unsigned long end)
> > +{
> > +	long err;
> > +
> > +	*prev = vma;
> > +	if (!is_valid_guard_vma(vma, /* allow_locked = */false))
> > +		return -EINVAL;
> > +
> > +	/*
> > +	 * If we install poison markers, then the range is no longer
> > +	 * empty from a page table perspective and therefore it's
> > +	 * appropriate to have an anon_vma.
> > +	 *
> > +	 * This ensures that on fork, we copy page tables correctly.
> > +	 */
> > +	err = anon_vma_prepare(vma);
> > +	if (err)
> > +		return err;
> > +
> > +	/*
> > +	 * Optimistically try to install the guard poison pages first. If any
> > +	 * non-guard pages are encountered, give up and zap the range before
> > +	 * trying again.
> > +	 */
>
> Should the page walker become powerful enough to handle this in one go? :)

I can tell you've not read previous threads...

I've addressed this in discussion with Jann - we'd have to do a full fat
huge comprehensive thing to do an in-place replace.

It'd either have to be fully duplicative of the multiple copies of the very
lengthy code to do this sort of thing right (some in mm/madvise.c itself),
or I'd have to go off and do a totally new prerequisite series
centralising that in a way that people probably wouldn't accept... I'm not
sure the benefits outweigh the costs.

> But sure, if it's too big a task to teach it to zap ptes with all the tlb
> flushing etc (I assume it's something page walkers don't do today), it makes
> sense to do it this way.
> Or we could require userspace to zap first (MADV_DONTNEED), but that would
> unnecessarily mean extra syscalls for the use case of an allocator debug
> mode that wants to turn freed memory to guards to catch use after free.
> So this seems like a good compromise...

This is optimistic, as the comment says: you very often won't need to do
this, so we do a little extra work in the less likely case where you need to
zap, vs. the more likely case where you don't.

In the face of racing faults, which we can't reasonably prevent without
taking both the mmap write lock _and_ the VMA lock (an egregious
requirement), this wouldn't really save us anything anyway.

>
> > +	while (true) {
> > +		/* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
> > +		err = walk_page_range_mm(vma->vm_mm, start, end,
> > +					 &guard_poison_walk_ops, NULL);
> > +		if (err <= 0)
> > +			return err;
> > +
> > +		/*
> > +		 * OK some of the range have non-guard pages mapped, zap
> > +		 * them. This leaves existing guard pages in place.
> > +		 */
> > +		zap_page_range_single(vma, start, end - start, NULL);
>
> ... however the potentially endless loop doesn't seem great. Could a
> malicious program keep refaulting the range (ignoring any segfaults if it
> loses a race) with one thread while failing to make progress here with
> another thread? Is that ok because it would only punish itself?

Sigh. Again, I don't think you've read the previous series, have you? Or
even the changelog... I added this as Jann asked for it. Originally we'd
return -EAGAIN if we got raced. See the discussion over in v1 for details.

I did it that way specifically to avoid such things, but Jann didn't appear
to think it was a problem.

>
> I mean if we have to retry the guards page installation more than once, it
> means the program *is* racing faults with installing guard ptes in the same
> range, right? So it would be right to segfault it. But I guess when we
> detect it here, we have no way to send the signal to the right thread and it
> would be too late? So unless we can do the PTE zap+install marker
> atomically, maybe there's no better way and this is acceptable as a
> malicious program can harm only itself?

Yup you'd only be hurting yourself. I went over this with Jann, who didn't
appear to have a problem with this approach from a security perspective, in
fact he explicitly asked me to do this :)

>
> Maybe it would be just simpler to install the marker via zap_details rather
> than the pagewalk?

Ah the inevitable 'please completely rework how you do everything' comment
I was expecting at some point :)

Obviously I've considered this (and a number of other approaches); it would
fundamentally change what zap is - right now, if it can't traverse a page
table level then its job is done (its job is to remove PTEs, not create them).

We'd instead have to completely rework the logic to be able to _install_
page tables and then carefully check we don't break anything and only do it
in the specific cases we need.

Or we could add a mode that says 'replace with a poison marker' rather than
zap, but that has potential TLB concerns, splits it across two operations
(installation and zapping), and then could you really be sure that there
isn't a really really badly timed race that would mean you'd have to loop
again?

Right now it's simple, elegant, small and we can only make ourselves
wait. I don't think this is a huge problem.

I think I'll need an actual security/DoS-based justification to change this.

>
> > +
> > +		if (fatal_signal_pending(current))
> > +			return -EINTR;
> > +		cond_resched();
> > +	}
> > +}
> > +
> > +static int guard_unpoison_pte_entry(pte_t *pte, unsigned long addr,
> > +				    unsigned long next, struct mm_walk *walk)
> > +{
> > +	pte_t ptent = ptep_get(pte);
> > +
> > +	if (is_guard_pte_marker(ptent)) {
> > +		/* Simply clear the PTE marker. */
> > +		pte_clear_not_present_full(walk->mm, addr, pte, false);
> > +		update_mmu_cache(walk->vma, addr, pte);
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static const struct mm_walk_ops guard_unpoison_walk_ops = {
> > +	.pte_entry		= guard_unpoison_pte_entry,
> > +	.walk_lock		= PGWALK_RDLOCK,
> > +};
> > +
> > +static long madvise_guard_unpoison(struct vm_area_struct *vma,
> > +				   struct vm_area_struct **prev,
> > +				   unsigned long start, unsigned long end)
> > +{
> > +	*prev = vma;
> > +	/*
> > +	 * We're ok with unpoisoning mlock()'d ranges, as this is a
> > +	 * non-destructive action.
> > +	 */
> > +	if (!is_valid_guard_vma(vma, /* allow_locked = */true))
> > +		return -EINVAL;
> > +
> > +	return walk_page_range(vma->vm_mm, start, end,
> > +			       &guard_unpoison_walk_ops, NULL);
> > +}
> > +
> >  /*
> >   * Apply an madvise behavior to a region of a vma.  madvise_update_vma
> >   * will handle splitting a vm area into separate areas, each area with its own
> > @@ -1098,6 +1260,10 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
> >  		break;
> >  	case MADV_COLLAPSE:
> >  		return madvise_collapse(vma, prev, start, end);
> > +	case MADV_GUARD_POISON:
> > +		return madvise_guard_poison(vma, prev, start, end);
> > +	case MADV_GUARD_UNPOISON:
> > +		return madvise_guard_unpoison(vma, prev, start, end);
> >  	}
> >
> >  	anon_name = anon_vma_name(vma);
> > @@ -1197,6 +1363,8 @@ madvise_behavior_valid(int behavior)
> >  	case MADV_DODUMP:
> >  	case MADV_WIPEONFORK:
> >  	case MADV_KEEPONFORK:
> > +	case MADV_GUARD_POISON:
> > +	case MADV_GUARD_UNPOISON:
> >  #ifdef CONFIG_MEMORY_FAILURE
> >  	case MADV_SOFT_OFFLINE:
> >  	case MADV_HWPOISON:
> > diff --git a/mm/mprotect.c b/mm/mprotect.c
> > index 0c5d6d06107d..d0e3ebfadef8 100644
> > --- a/mm/mprotect.c
> > +++ b/mm/mprotect.c
> > @@ -236,7 +236,8 @@ static long change_pte_range(struct mmu_gather *tlb,
> >  			} else if (is_pte_marker_entry(entry)) {
> >  				/*
> >  				 * Ignore error swap entries unconditionally,
> > -				 * because any access should sigbus anyway.
> > +				 * because any access should sigbus/sigsegv
> > +				 * anyway.
> >  				 */
> >  				if (is_poisoned_swp_entry(entry))
> >  					continue;
> > diff --git a/mm/mseal.c b/mm/mseal.c
> > index ece977bd21e1..21bf5534bcf5 100644
> > --- a/mm/mseal.c
> > +++ b/mm/mseal.c
> > @@ -30,6 +30,7 @@ static bool is_madv_discard(int behavior)
> >  	case MADV_REMOVE:
> >  	case MADV_DONTFORK:
> >  	case MADV_WIPEONFORK:
> > +	case MADV_GUARD_POISON:
> >  		return true;
> >  	}
> >
>
Lorenzo Stoakes Oct. 21, 2024, 8:30 p.m. UTC | #11
On Mon, Oct 21, 2024 at 10:25:06PM +0200, Vlastimil Babka wrote:
> On 10/21/24 22:17, David Hildenbrand wrote:
> > On 21.10.24 22:11, Vlastimil Babka wrote:
> >> On 10/20/24 18:20, Lorenzo Stoakes wrote:
> >>
> >> <snip>
> >>
> >>> +static long madvise_guard_poison(struct vm_area_struct *vma,
> >>> +				 struct vm_area_struct **prev,
> >>> +				 unsigned long start, unsigned long end)
> >>> +{
> >>> +	long err;
> >>> +
> >>> +	*prev = vma;
> >>> +	if (!is_valid_guard_vma(vma, /* allow_locked = */false))
> >>> +		return -EINVAL;
> >>> +
> >>> +	/*
> >>> +	 * If we install poison markers, then the range is no longer
> >>> +	 * empty from a page table perspective and therefore it's
> >>> +	 * appropriate to have an anon_vma.
> >>> +	 *
> >>> +	 * This ensures that on fork, we copy page tables correctly.
> >>> +	 */
> >>> +	err = anon_vma_prepare(vma);
> >>> +	if (err)
> >>> +		return err;
> >>> +
> >>> +	/*
> >>> +	 * Optimistically try to install the guard poison pages first. If any
> >>> +	 * non-guard pages are encountered, give up and zap the range before
> >>> +	 * trying again.
> >>> +	 */
> >>
> >> Should the page walker become powerful enough to handle this in one go? :)
> >> But sure, if it's too big a task to teach it to zap ptes with all the tlb
> >> flushing etc (I assume it's something page walkers don't do today), it makes
> >> sense to do it this way.
> >> Or we could require userspace to zap first (MADV_DONTNEED), but that would
> >> unnecessarily mean extra syscalls for the use case of an allocator debug
> >> mode that wants to turn freed memory to guards to catch use after free.
> >> So this seems like a good compromise...
> >
> > Yes please, KIS.
>
> You mean "require userspace to zap first (MADV_DONTNEED)" ?

What on earth are you talking about? This is crazy - we can detect if we need
to zap with the page walker and then just zap? Why would we do this?

The solution as is is perfectly simple... What is the justification for
this on any level?

Again, if you think there's a _genuine_ security/DoS issue here you're
going to really need to demonstrate it rather than hand wave?

>
> I'd normally agree with the KIS principle, but..
>
> > We can always implement support for that later if
>
> it would either mean later we change behavior (installing guards on
> non-zapped PTEs would have to be an error now but maybe start working later,
> which is user observable change thus can break somebody)
>
> > really required (leave behavior open when documenting).
>
> and leaving it open when documenting doesn't really mean anything for the
> "we don't break userspace" promise vs what the implementation actually does.
>
> Or the changed behavior would need to come with a new MADVISE mode. Not
> appealing as it's a mess already.
>
> So since its uapi we should aim for the best from the start.
>
>

Best is 'call the madvise(), guard pages installed' which is what it is
now.
David Hildenbrand Oct. 21, 2024, 8:37 p.m. UTC | #12
On 21.10.24 22:25, Vlastimil Babka wrote:
> On 10/21/24 22:17, David Hildenbrand wrote:
>> On 21.10.24 22:11, Vlastimil Babka wrote:
>>> On 10/20/24 18:20, Lorenzo Stoakes wrote:
>>>
>>> <snip>
>>>
>>>> +static long madvise_guard_poison(struct vm_area_struct *vma,
>>>> +				 struct vm_area_struct **prev,
>>>> +				 unsigned long start, unsigned long end)
>>>> +{
>>>> +	long err;
>>>> +
>>>> +	*prev = vma;
>>>> +	if (!is_valid_guard_vma(vma, /* allow_locked = */false))
>>>> +		return -EINVAL;
>>>> +
>>>> +	/*
>>>> +	 * If we install poison markers, then the range is no longer
>>>> +	 * empty from a page table perspective and therefore it's
>>>> +	 * appropriate to have an anon_vma.
>>>> +	 *
>>>> +	 * This ensures that on fork, we copy page tables correctly.
>>>> +	 */
>>>> +	err = anon_vma_prepare(vma);
>>>> +	if (err)
>>>> +		return err;
>>>> +
>>>> +	/*
>>>> +	 * Optimistically try to install the guard poison pages first. If any
>>>> +	 * non-guard pages are encountered, give up and zap the range before
>>>> +	 * trying again.
>>>> +	 */
>>>
>>> Should the page walker become powerful enough to handle this in one go? :)
>>> But sure, if it's too big a task to teach it to zap ptes with all the tlb
>>> flushing etc (I assume it's something page walkers don't do today), it makes
>>> sense to do it this way.
>>> Or we could require userspace to zap first (MADV_DONTNEED), but that would
>>> unnecessarily mean extra syscalls for the use case of an allocator debug
>>> mode that wants to turn freed memory to guards to catch use after free.
>>> So this seems like a good compromise...
>>
>> Yes please, KIS.
> 
> You mean "require userspace to zap first (MADV_DONTNEED)" ?

Yes, I see from Lorenzo's reply that there is apparently some history to 
this (maybe it's all nicely summarized in the cover letter / this patch, 
have to dig further).

Not sure yet what the problem is; I would have thought it's all 
protected by the PTL, and concurrent faults are user space doing 
something stupid and we'd detect it.

Have to do some more reading on this.

> 
> I'd normally agree with the KIS principle, but..
> 
>> We can always implement support for that later if
> 
> it would either mean later we change behavior (installing guards on
> non-zapped PTEs would have to be an error now but maybe start working later,
> which is user observable change thus can break somebody)
> 
>> really required (leave behavior open when documenting).
> 
> and leaving it open when documenting doesn't really mean anything for the
> "we don't break userspace" promise vs what the implementation actually does.

Not quite, I think. You could start by returning -EEXIST or -EOPNOTSUPP and 
document that this can change in the future to succeed if there is 
already something there. User space can sense support.
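
(For illustration only, a sketch of how userspace might sense support -
hypothetical helper, assuming the MADV_GUARD_POISON value from this series;
kernels that don't know the advice value reject it with EINVAL:)

#include <stdbool.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_GUARD_POISON
#define MADV_GUARD_POISON 102	/* value proposed by this series */
#endif

/* Probe a scratch page: older kernels reject the advice with EINVAL. */
static bool guard_pages_supported(void)
{
	long pgsz = sysconf(_SC_PAGESIZE);
	void *scratch = mmap(NULL, pgsz, PROT_READ | PROT_WRITE,
			     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	bool supported;

	if (scratch == MAP_FAILED)
		return false;
	supported = madvise(scratch, pgsz, MADV_GUARD_POISON) == 0;
	munmap(scratch, pgsz);
	return supported;
}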

Something that fails today but starts working at some point is not really 
breaking user space, unless someone really *wants* the call to fail if there 
is already something there (e.g., on a concurrent fault -> bail out instead 
of hiding it).

Of course, a more elegant solution would be GUARD_INSTALL vs. 
GUARD_FORCE_INSTALL.

.. but again, there seems to be more history to this.
Vlastimil Babka Oct. 21, 2024, 8:45 p.m. UTC | #13
On 10/21/24 22:27, Lorenzo Stoakes wrote:
> On Mon, Oct 21, 2024 at 10:11:29PM +0200, Vlastimil Babka wrote:
>> On 10/20/24 18:20, Lorenzo Stoakes wrote:
>> > Implement a new lightweight guard page feature, that is regions of userland
>> > virtual memory that, when accessed, cause a fatal signal to arise.
>> >
>> > Currently users must establish PROT_NONE ranges to achieve this.
>> >
>> > However this is very costly memory-wise - we need a VMA for each and every
>> > one of these regions AND they become unmergeable with surrounding VMAs.
>> >
>> > In addition repeated mmap() calls require repeated kernel context switches
>> > and contention of the mmap lock to install these ranges, potentially also
>> > having to unmap memory if installed over existing ranges.
>> >
>> > The lightweight guard approach eliminates the VMA cost altogether - rather
>> > than establishing a PROT_NONE VMA, it operates at the level of page table
>> > entries - poisoning PTEs such that accesses to them cause a fault followed
>> > by a SIGSGEV signal being raised.
>> >
>> > This is achieved through the PTE marker mechanism, which a previous commit
>> > in this series extended to permit this to be done, installed via the
>> > generic page walking logic, also extended by a prior commit for this
>> > purpose.
>> >
>> > These poison ranges are established with MADV_GUARD_POISON, and if the
>> > range in which they are installed contain any existing mappings, they will
>> > be zapped, i.e. free the range and unmap memory (thus mimicking the
>> > behaviour of MADV_DONTNEED in this respect).
>> >
>> > Any existing poison entries will be left untouched. There is no nesting of
>> > poisoned pages.
>> >
>> > Poisoned ranges are NOT cleared by MADV_DONTNEED, as this would be rather
>> > unexpected behaviour, but are cleared on process teardown or unmapping of
>> > memory ranges.
>> >
>> > Ranges can have the poison property removed by MADV_GUARD_UNPOISON -
>> > 'remedying' the poisoning. The ranges over which this is applied, should
>> > they contain non-poison entries, will be untouched, only poison entries
>> > will be cleared.
>> >
>> > We permit this operation on anonymous memory only, and only VMAs which are
>> > non-special, non-huge and not mlock()'d (if we permitted this we'd have to
>> > drop locked pages which would be rather counterintuitive).
>> >
>> > Suggested-by: Vlastimil Babka <vbabka@suse.cz>
>> > Suggested-by: Jann Horn <jannh@google.com>
>> > Suggested-by: David Hildenbrand <david@redhat.com>
>> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>>
>> <snip>
>>
>> > +static long madvise_guard_poison(struct vm_area_struct *vma,
>> > +				 struct vm_area_struct **prev,
>> > +				 unsigned long start, unsigned long end)
>> > +{
>> > +	long err;
>> > +
>> > +	*prev = vma;
>> > +	if (!is_valid_guard_vma(vma, /* allow_locked = */false))
>> > +		return -EINVAL;
>> > +
>> > +	/*
>> > +	 * If we install poison markers, then the range is no longer
>> > +	 * empty from a page table perspective and therefore it's
>> > +	 * appropriate to have an anon_vma.
>> > +	 *
>> > +	 * This ensures that on fork, we copy page tables correctly.
>> > +	 */
>> > +	err = anon_vma_prepare(vma);
>> > +	if (err)
>> > +		return err;
>> > +
>> > +	/*
>> > +	 * Optimistically try to install the guard poison pages first. If any
>> > +	 * non-guard pages are encountered, give up and zap the range before
>> > +	 * trying again.
>> > +	 */
>>
>> Should the page walker become powerful enough to handle this in one go? :)
> 
> I can tell you've not read previous threads...

Whoops, you're right, I did read v1 but forgot about the RFC.

But we can assume people who'll only see the code after it's merged will not
have read it either, so since a potentially endless loop could be
suspicious, expanding the comment to explain how it's fine wouldn't hurt?
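
Something along these lines maybe (just a sketch of the kind of expansion I
mean, wording entirely up to you):

	/*
	 * Optimistically try to install the guard poison pages first. If any
	 * non-guard pages are encountered, give up and zap the range before
	 * trying again.
	 *
	 * We loop because a racing fault from another thread of this process
	 * (e.g. one populating a THP over part of the range) may repopulate
	 * what we just zapped. Only the process itself can cause this, so in
	 * practice we go around at most once or twice; a process that keeps
	 * refaulting the range only hurts itself, and fatal signals still
	 * terminate the loop.
	 */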

> I've addressed this in discussion with Jann - we'd have to do a full fat
> huge comprehensive thing to do an in-place replace.
> 
> It'd either have to be fully duplicative of the multiple copies of the very
> lengthy code to do this sort of thing right (some in mm/madvise.c itself)
> or I'd have to go off and do a totally new pre-requisite series
> centralising that in a way that people probably wouldn't accept... I'm not
> sure the benefits outweigh the costs.
> 
>> But sure, if it's too big a task to teach it to zap ptes with all the tlb
>> flushing etc (I assume it's something page walkers don't do today), it makes
>> sense to do it this way.
>> Or we could require userspace to zap first (MADV_DONTNEED), but that would
>> unnecessarily mean extra syscalls for the use case of an allocator debug
>> mode that wants to turn freed memory to guards to catch use after free.
>> So this seems like a good compromise...
> 
> This is optimistic as the comment says, you very often won't need to do
> this, so we do a little extra work in the case that you need to zap,
> vs. the more likely case that you don't need to.
> 
> In the face of racing faults, which we can't reasonably prevent without
> having to write _and_ VMA lock which is an egregious requirement, this
> wouldn't really save us anything anyway.

OK.

>>
>> > +	while (true) {
>> > +		/* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
>> > +		err = walk_page_range_mm(vma->vm_mm, start, end,
>> > +					 &guard_poison_walk_ops, NULL);
>> > +		if (err <= 0)
>> > +			return err;
>> > +
>> > +		/*
>> > +		 * OK some of the range have non-guard pages mapped, zap
>> > +		 * them. This leaves existing guard pages in place.
>> > +		 */
>> > +		zap_page_range_single(vma, start, end - start, NULL);
>>
>> ... however the potentially endless loop doesn't seem great. Could a
>> malicious program keep refaulting the range (ignoring any segfaults if it
>> loses a race) with one thread while failing to make progress here with
>> another thread? Is that ok because it would only punish itself?
> 
> Sigh. Again, I don't think you've read the previous series have you? Or
> even the changelog... I added this as Jann asked for it. Originally we'd
> -EAGAIN if we got raced. See the discussion over in v1 for details.
> 
> I did it that way specifically to avoid such things, but Jann didn't appear
> to think it was a problem.

If Jann is fine with this then it must be secure enough.

>>
>> I mean if we have to retry the guards page installation more than once, it
>> means the program *is* racing faults with installing guard ptes in the same
>> range, right? So it would be right to segfault it. But I guess when we
>> detect it here, we have no way to send the signal to the right thread and it
>> would be too late? So unless we can do the PTE zap+install marker
>> atomically, maybe there's no better way and this is acceptable as a
>> malicious program can harm only itself?
> 
> Yup you'd only be hurting yourself. I went over this with Jann, who didn't
> appear to have a problem with this approach from a security perspective, in
> fact he explicitly asked me to do this :)
> 
>>
>> Maybe it would be just simpler to install the marker via zap_details rather
>> than the pagewalk?
> 
> Ah the inevitable 'please completely rework how you do everything' comment
> I was expecting at some point :)

Job security :)

j/k

> Obviously I've considered this (and a number of other approaches), it would
> fundamentally change what zap is - right now if it can't traverse a page
> table level, that's job done (its job is to remove PTEs, not create them).
> 
> We'd instead have to completely rework the logic to be able to _install_
> page tables and then carefully check we don't break anything and only do it
> in the specific cases we need.
> 
> Or we could add a mode that says 'replace with a poison marker' rather than
> zap, but that has potential TLB concerns, splits it across two operations
> (installation and zapping), and then could you really be sure that there
> isn't a really really badly timed race that would mean you'd have to loop
> again?
> 
> Right now it's simple, elegant, small and we can only make ourselves
> wait. I don't think this is a huge problem.
> 
> I think I'll need an actual security/DoS-based justification to change this.
> 
>>
>> > +
>> > +		if (fatal_signal_pending(current))
>> > +			return -EINTR;
>> > +		cond_resched();
>> > +	}
>> > +}
>> > +
>> > +static int guard_unpoison_pte_entry(pte_t *pte, unsigned long addr,
>> > +				    unsigned long next, struct mm_walk *walk)
>> > +{
>> > +	pte_t ptent = ptep_get(pte);
>> > +
>> > +	if (is_guard_pte_marker(ptent)) {
>> > +		/* Simply clear the PTE marker. */
>> > +		pte_clear_not_present_full(walk->mm, addr, pte, false);
>> > +		update_mmu_cache(walk->vma, addr, pte);
>> > +	}
>> > +
>> > +	return 0;
>> > +}
>> > +
>> > +static const struct mm_walk_ops guard_unpoison_walk_ops = {
>> > +	.pte_entry		= guard_unpoison_pte_entry,
>> > +	.walk_lock		= PGWALK_RDLOCK,
>> > +};
>> > +
>> > +static long madvise_guard_unpoison(struct vm_area_struct *vma,
>> > +				   struct vm_area_struct **prev,
>> > +				   unsigned long start, unsigned long end)
>> > +{
>> > +	*prev = vma;
>> > +	/*
>> > +	 * We're ok with unpoisoning mlock()'d ranges, as this is a
>> > +	 * non-destructive action.
>> > +	 */
>> > +	if (!is_valid_guard_vma(vma, /* allow_locked = */true))
>> > +		return -EINVAL;
>> > +
>> > +	return walk_page_range(vma->vm_mm, start, end,
>> > +			       &guard_unpoison_walk_ops, NULL);
>> > +}
>> > +
>> >  /*
>> >   * Apply an madvise behavior to a region of a vma.  madvise_update_vma
>> >   * will handle splitting a vm area into separate areas, each area with its own
>> > @@ -1098,6 +1260,10 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
>> >  		break;
>> >  	case MADV_COLLAPSE:
>> >  		return madvise_collapse(vma, prev, start, end);
>> > +	case MADV_GUARD_POISON:
>> > +		return madvise_guard_poison(vma, prev, start, end);
>> > +	case MADV_GUARD_UNPOISON:
>> > +		return madvise_guard_unpoison(vma, prev, start, end);
>> >  	}
>> >
>> >  	anon_name = anon_vma_name(vma);
>> > @@ -1197,6 +1363,8 @@ madvise_behavior_valid(int behavior)
>> >  	case MADV_DODUMP:
>> >  	case MADV_WIPEONFORK:
>> >  	case MADV_KEEPONFORK:
>> > +	case MADV_GUARD_POISON:
>> > +	case MADV_GUARD_UNPOISON:
>> >  #ifdef CONFIG_MEMORY_FAILURE
>> >  	case MADV_SOFT_OFFLINE:
>> >  	case MADV_HWPOISON:
>> > diff --git a/mm/mprotect.c b/mm/mprotect.c
>> > index 0c5d6d06107d..d0e3ebfadef8 100644
>> > --- a/mm/mprotect.c
>> > +++ b/mm/mprotect.c
>> > @@ -236,7 +236,8 @@ static long change_pte_range(struct mmu_gather *tlb,
>> >  			} else if (is_pte_marker_entry(entry)) {
>> >  				/*
>> >  				 * Ignore error swap entries unconditionally,
>> > -				 * because any access should sigbus anyway.
>> > +				 * because any access should sigbus/sigsegv
>> > +				 * anyway.
>> >  				 */
>> >  				if (is_poisoned_swp_entry(entry))
>> >  					continue;
>> > diff --git a/mm/mseal.c b/mm/mseal.c
>> > index ece977bd21e1..21bf5534bcf5 100644
>> > --- a/mm/mseal.c
>> > +++ b/mm/mseal.c
>> > @@ -30,6 +30,7 @@ static bool is_madv_discard(int behavior)
>> >  	case MADV_REMOVE:
>> >  	case MADV_DONTFORK:
>> >  	case MADV_WIPEONFORK:
>> > +	case MADV_GUARD_POISON:
>> >  		return true;
>> >  	}
>> >
>>
Lorenzo Stoakes Oct. 21, 2024, 8:49 p.m. UTC | #14
On Mon, Oct 21, 2024 at 10:37:39PM +0200, David Hildenbrand wrote:
> On 21.10.24 22:25, Vlastimil Babka wrote:
> > On 10/21/24 22:17, David Hildenbrand wrote:
> > > On 21.10.24 22:11, Vlastimil Babka wrote:
> > > > On 10/20/24 18:20, Lorenzo Stoakes wrote:
> > > >
> > > > <snip>
> > > >
> > > > > +static long madvise_guard_poison(struct vm_area_struct *vma,
> > > > > +				 struct vm_area_struct **prev,
> > > > > +				 unsigned long start, unsigned long end)
> > > > > +{
> > > > > +	long err;
> > > > > +
> > > > > +	*prev = vma;
> > > > > +	if (!is_valid_guard_vma(vma, /* allow_locked = */false))
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	/*
> > > > > +	 * If we install poison markers, then the range is no longer
> > > > > +	 * empty from a page table perspective and therefore it's
> > > > > +	 * appropriate to have an anon_vma.
> > > > > +	 *
> > > > > +	 * This ensures that on fork, we copy page tables correctly.
> > > > > +	 */
> > > > > +	err = anon_vma_prepare(vma);
> > > > > +	if (err)
> > > > > +		return err;
> > > > > +
> > > > > +	/*
> > > > > +	 * Optimistically try to install the guard poison pages first. If any
> > > > > +	 * non-guard pages are encountered, give up and zap the range before
> > > > > +	 * trying again.
> > > > > +	 */
> > > >
> > > > Should the page walker become powerful enough to handle this in one go? :)
> > > > But sure, if it's too big a task to teach it to zap ptes with all the tlb
> > > > flushing etc (I assume it's something page walkers don't do today), it makes
> > > > sense to do it this way.
> > > > Or we could require userspace to zap first (MADV_DONTNEED), but that would
> > > > unnecessarily mean extra syscalls for the use case of an allocator debug
> > > > mode that wants to turn freed memory to guards to catch use after free.
> > > > So this seems like a good compromise...
> > >
> > > Yes please, KIS.
> >
> > You mean "require userspace to zap first (MADV_DONTNEED)" ?
>
> Yes, I see from Lorenzo's reply that there is apparently some history to
> this (maybe it's all nicely summarized in the cover letter / this patch,
> have to dig further).
>
> Not sure yet what the problem is, I would have thought it's all protected by
> the PTL, and concurrent faults are user space doing something stupid and
> we'd detect it.

The looping mechanism is fine for dealing with concurrent faults. There's
no actual _race_ due to PTL, it's just that a user could repeatedly
populate stuff stupidly in a range that is meant to have poison markers put
in.

It's not likely and would be kind of an abuse of the interface, and it'd
really be a process just hurting itself.

In nearly all cases you won't zap at all. The whole point is it's
optimistic. In 99.99% of others you zap once...

>
> Have to do some more reading on this.

May I suggest a book on the history of the prodigy?

>
> >
> > I'd normally agree with the KIS principle, but..
> >
> > > We can always implement support for that later if
> >
> > it would either mean later we change behavior (installing guards on
> > non-zapped PTEs would have to be an error now but maybe start working later,
> > which is user observable change thus can break somebody)
> >
> > > really required (leave behavior open when documenting).
> >
> > and leaving it open when documenting doesn't really mean anything for the
> > "we don't break userspace" promise vs what the implementation actually does.
>
> Not quite I think. You could start return -EEXIST or -EOPNOTSUPP and
> document that this can change in the future to succeed if there is
> something. User space can sense support.

Yeah I mean originally I had a -EAGAIN which was sort of equivalent of this
but Jann pointed out you're just shifting work to userland who would loop
and repeat.
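
Something like this on the userland side, i.e. exactly the loop we'd rather
keep in the kernel (hypothetical sketch assuming the old -EAGAIN semantics,
not anything in this series):

	while (madvise(addr, len, MADV_GUARD_POISON) == -1) {
		if (errno != EAGAIN)
			break;	/* real error - handle it */
		/* raced with a fault populating the range - just go again */
	}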

I just don't see why we'd do this.

In fact I was looking at the series and thinking 'wow it's actually a
really small delta' and being proud but... still not KIS enough apparently
;)

>
> Something failing that at one point starts working is not really breaking
> user space, unless someone really *wants* to fail if there is already
> something (e.g., concurrent fault -> bail out instead of hiding it).
>
> Of course, a more elegant solution would be GUARD_INSTALL vs.
> GUARD_FORCE_INSTALL.
>
> .. but again, there seems to be more history to this.

I don't think there's really any value in that. There's just no sensible
situation in which a user would care about this I don't think.

And if you're saying 'hey do MADV_DONTNEED if this fails and keep trying!'
then why not just do that in the kernel?

Trying to explain to a user 'hey this is for installing guard pages but if
there's a racing fault it'll fail and that could keep happening and then
you'll have to zap and maybe in a loop' just... seems like a bloody awful
interface?

I prefer 'here's an interface for installing and removing guard pages,
enjoy!' :)
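
i.e. the whole story for the user is just (sketch, assuming the new MADV_*
values are in their headers):

	/* install guard pages across [addr, addr + len) */
	madvise(addr, len, MADV_GUARD_POISON);

	/* ...any access in the range now raises a fatal SIGSEGV... */

	/* remove them again when done */
	madvise(addr, len, MADV_GUARD_UNPOISON);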

>
> --
> Cheers,
>
> David / dhildenb
>
David Hildenbrand Oct. 21, 2024, 9:20 p.m. UTC | #15
>> Yes, I see from Lorenzo's reply that there is apparently some history to
>> this (maybe it's all nicely summarized in the cover letter / this patch,
>> have to dig further).
>>
>> Not sure yet what the problem is, I would have thought it's all protected by
>> the PTL, and concurrent faults are user space doing something stupid and
>> we'd detect it.
> 
> The looping mechanism is fine for dealing with concurrent faults. There's
> no actual _race_ due to PTL, it's just that a user could repeatedly
> populate stuff stupidly in a range that is meant to have poison markers put
> in.
> 
> It's not likely and would be kind of an abuse of the interface, and it'd
> really be a process just hurting itself.
> 
> In nearly all cases you won't zap at all. The whole point is it's
> optimistic. In 99.99% of others you zap once...

Exactly! And that's why I am questioning whether the kernel should care 
about that. See below.

> 
>>
>> Have to do some more reading on this.
> 
> May I suggest a book on the history of the prodigy?

:D

> 
>>
>>>
>>> I'd normally agree with the KIS principle, but..
>>>
>>>> We can always implement support for that later if
>>>
>>> it would either mean later we change behavior (installing guards on
>>> non-zapped PTEs would have to be an error now but maybe start working later,
>>> which is user observable change thus can break somebody)
>>>
>>>> really required (leave behavior open when documenting).
>>>
>>> and leaving it open when documenting doesn't really mean anything for the
>>> "we don't break userspace" promise vs what the implementation actually does.
>>
>> Not quite I think. You could start return -EEXIST or -EOPNOTSUPP and
>> document that this can change in the future to succeed if there is
>> something. User space can sense support.
> 
> Yeah I mean originally I had a -EAGAIN which was sort of equivalent of this
> but Jann pointed out you're just shifting work to userland who would loop
> and repeat.
> 
> I just don't see why we'd do this.
> 
> In fact I was looking at the series and thinking 'wow it's actually a
> really small delta' and being proud but... still not KIS enough apparently
> ;)

You know, I read a lot of kernel code ... and mfill_atomic_install_pte() 
is what popped in my head: if there is already something, let user space 
handle it, because it is unexpected.

The uffd interface is slightly better, as it gives you the number of 
processed PTEs back, which madvise() is not designed for.

But maybe this (returning how many we already processed) is not required 
due to the nature of guard pages (below).

> 
>>
>> Something failing that at one point starts working is not really breaking
>> user space, unless someone really *wants* to fail if there is already
>> something (e.g., concurrent fault -> bail out instead of hiding it).
>>
>> Of course, a more elegant solution would be GUARD_INSTALL vs.
>> GUARD_FORCE_INSTALL.
>>
>> .. but again, there seems to be more history to this.
> 
> I don't think there's really any value in that. There's just no sensible
> situation in which a user would care about this I don't think.

Making sure nobody touches an area, and while doing that somebody already
touched that area? I guess it could be worked around by 
mprotect(PROT_NONE),madvise(GUARD),mprotect(PROT_READ|PROT_WRITE) ... 
which is not particularly nice :)
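
Roughly this, as a sketch:

	/* block any concurrent populator, install the guards, then restore
	 * access - works, but may split VMAs and costs three syscalls */
	mprotect(addr, len, PROT_NONE);
	madvise(addr, len, MADV_GUARD_POISON);
	mprotect(addr, len, PROT_READ | PROT_WRITE);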

> 
> And if you're saying 'hey do MADV_DONTNEED if this fails and keep trying!'
> then why not just do that in the kernel?

Heh, no!

If user space doesn't expect there to be something, it should *fail*. 
That's likely going to be the majority of use cases for guard pages 
(happy to be told otherwise). No retry.

And if user space expects there to be something it should zap ahead of 
time (which some allocators maybe already do to free up memory after 
free()) to then install the guard. No retry.

There is this case where user space might be unsure. There, it might 
make sense to retry exactly once.
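
In code terms, the two patterns I have in mind would be (sketch only, names
are placeholders, and this assumes the fail-instead-of-zap semantics argued
for above rather than what the patch currently does):

	/* fresh guard region, nothing expected to be there - a failure
	 * would indicate a bug, so just report it: */
	if (madvise(guard, guard_len, MADV_GUARD_POISON))
		err(1, "guard install");

	/* allocator free() path, memory expected to be populated - zap
	 * ahead of time, then install, no retry needed: */
	madvise(ptr, size, MADV_DONTNEED);
	madvise(ptr, size, MADV_GUARD_POISON);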

> 
> Trying to explain to a user 'hey this is for installing guard pages but if
> there's a racing fault it'll fail and that could keep happening and then
> you'll have to zap and maybe in a loop' just... seems like a bloody awful
> interface?

Hope my example above made it clearer. This "retry forever until it 
works" use case doesn't quite make sense to me, but I might just be 
missing something important.

But again, I have to do more reading on the history of the current 
approach ... and it's fairly late here.
Lorenzo Stoakes Oct. 21, 2024, 9:33 p.m. UTC | #16
On Mon, Oct 21, 2024 at 11:20:03PM +0200, David Hildenbrand wrote:
> > > Yes, I see from Lorenzo's reply that there is apparently some history to
> > > this (maybe it's all nicely summarized in the cover letter / this patch,
> > > have to dig further).
> > >
> > > Not sure yet what the problem is, I would have thought it's all protected by
> > > the PTL, and concurrent faults are user space doing something stupid and
> > > we'd detect it.
> >
> > The looping mechanism is fine for dealing with concurrent faults. There's
> > no actual _race_ due to PTL, it's just that a user could repeatedly
> > populate stuff stupidly in a range that is meant to have poison markers put
> > in.
> >
> > It's not likely and would be kind of an abuse of the interface, and it'd
> > really be a process just hurting itself.
> >
> > In nearly all cases you won't zap at all. The whole point is it's
> > optimistic. In 99.99% of others you zap once...
>
> Exactly! And that's why I am questioning whether the kernel should care
> about that. See below.
>
> >
> > >
> > > Have to do some more reading on this.
> >
> > May I suggest a book on the history of the prodigy?
>
> :D
>
> >
> > >
> > > >
> > > > I'd normally agree with the KIS principle, but..
> > > >
> > > > > We can always implement support for that later if
> > > >
> > > > it would either mean later we change behavior (installing guards on
> > > > non-zapped PTEs would have to be an error now but maybe start working later,
> > > > which is user observable change thus can break somebody)
> > > >
> > > > > really required (leave behavior open when documenting).
> > > >
> > > > and leaving it open when documenting doesn't really mean anything for the
> > > > "we don't break userspace" promise vs what the implementation actually does.
> > >
> > > Not quite I think. You could start return -EEXIST or -EOPNOTSUPP and
> > > document that this can change in the future to succeed if there is
> > > something. User space can sense support.
> >
> > Yeah I mean originally I had a -EAGAIN which was sort of equivalent of this
> > but Jann pointed out you're just shifting work to userland who would loop
> > and repeat.
> >
> > I just don't see why we'd do this.
> >
> > In fact I was looking at the series and thinking 'wow it's actually a
> > really small delta' and being proud but... still not KIS enough apparently
> > ;)
>
> You know, I read a lot of kernel code ... and mfill_atomic_install_pte() is
> what popped in my head: if there is already something, let user space handle
> it, because it is unexpected.
>
> The uffd interface is slightly better, as it gives you the number of
> processed PTEs back, which madvise() is not designed for.
>
> But maybe this (returning how many we already processed) is not required due
> to the nature of guard pages (below).
>
> >
> > >
> > > Something failing that at one point starts working is not really breaking
> > > user space, unless someone really *wants* to fail if there is already
> > > something (e.g., concurrent fault -> bail out instead of hiding it).
> > >
> > > Of course, a more elegant solution would be GUARD_INSTALL vs.
> > > GUARD_FORCE_INSTALL.
> > >
> > > .. but again, there seems to be more history to this.
> >
> > I don't think there's really any value in that. There's just no sensible
> > situation in which a user would care about this I don't think.
>
> Making sure nobody touches an area, and while doing that somebody already
> touched that area? I guess it could be worked around by
> mprotect(PROT_NONE),madvise(GUARD),mprotect(PROT_READ|PROT_WRITE) ... which
> is not particularly nice :)
>
> >
> > And if you're saying 'hey do MADV_DONTNEED if this fails and keep trying!'
> > then why not just do that in the kernel?
>
> Heh, no!
>
> If user space doesn't expect there to be something, it should *fail*. That's
> likely going to be the majority of use cases for guard pages (happy to be
> told otherwise). No retry.
>
> And if user space expects there to be something it should zap ahead of time
> (which some allocators maybe already do to free up memory after free()) to
> then install the guard. No retry.
>
> There is this case where user space might be unsure. There, it might make
> sense to retry exactly once.
>
> >
> > Trying to explain to a user 'hey this is for installing guard pages but if
> > there's a racing fault it'll fail and that could keep happening and then
> > you'll have to zap and maybe in a loop' just... seems like a bloody awful
> > interface?
>
> Hope my example above made it clearer. This "retry forever until it works"
> use case doesn't quite make sense to me, but I might just be missing
> something important.

It won't be forever in any case other than a process that is abusing itself.

Then it's a security question and... see below!

>
> But again, I have to do more reading on the history of the current approach
> ... and it's fairly late here.

Yes late for me too and I've been working since 8am pretty solid so... :)

>
>
> --
> Cheers,
>
> David / dhildenb
>

So we all agree it's very unlikely, we all agree that even if it happens it'll
happen only once, so what is the problem with the loop exactly? Other than
philosophical?

We do have other loops in the kernel like this... it'd really be the same as the
process throttling itself and whether it loops in userland or kernel land is
kind of immaterial really.

So to me this comes down fundamentally - is this a genuine security/abuse
concern?

Since Jann suggested this, I find it unlikely. But I have pinged him for his
input.

Otherwise I don't know what we're arguing about... and I think we're fine!
Vlastimil Babka Oct. 21, 2024, 9:35 p.m. UTC | #17
On 10/21/24 23:20, David Hildenbrand wrote:
>> I don't think there's really any value in that. There's just no sensible
>> situation in which a user would care about this I don't think.
> 
> Making sure nobody touches an area, and while doing that somebody already
> touched that area? I guess it could be worked around by 
> mprotect(PROT_NONE),madvise(GUARD),mprotect(PROT_READ|PROT_WRITE) ... 
> which is not particularly nice :)
> 
>> 
>> And if you're saying 'hey do MADV_DONTNEED if this fails and keep trying!'
>> then why not just do that in the kernel?
> 
> Heh, no!
> 
> If user space doesn't expect there to be something, it should *fail*. 
> That's likely going to be the majority of use cases for guard pages 
> (happy to be told otherwise). No retry.
> 
> And if user space expects there to be something it should zap ahead of 
> time (which some allocators maybe already do to free up memory after 
> free()) to then install the guard. No retry.
> 
> There is this case where user space might be unsure. There, it might 
> make sense to retry exactly once.

I've thought so too and the RFC was implemented like this, but Jann came up
with a scenario where a THP can cause the range including our
to-be-installed guard pte to be populated even if the userspace is not
trying to access that exact address, see here:

https://lore.kernel.org/all/CAG48ez3vqbqyWb4bLdpqSUnhwqGo2OQetecNhEGPdCGDr94nbQ@mail.gmail.com/

So unless we can *reliably* detect that userspace is really shooting
itself in the foot and return a failure to install the guard pte *only* in that
case (which would be useful), without horribly complicating everything to
ensure that reliability and to avoid false positives due to races with
THPs, then it's probably better to just retry as this version does.
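
To illustrate how easily the "false positive" case is hit, a minimal
standalone sketch (assumes 4kB pages, 2MB THPs and THP enabled; it
deliberately doesn't use MADV_GUARD_POISON so it builds against current
headers, error handling omitted):

#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t thp = 2UL << 20;
	unsigned char vec[512];
	/* map 4MB so we can pick a 2MB-aligned window inside it */
	char *raw = mmap(NULL, 2 * thp, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char *p = (char *)(((uintptr_t)raw + thp - 1) & ~(thp - 1));

	madvise(p, thp, MADV_HUGEPAGE);
	p[0] = 1;		/* fault in a single byte... */
	mincore(p, thp, vec);	/* ...and see what is resident now */
	/* with THP this frequently prints 1: the neighbouring page got
	 * populated without ever being touched, which is exactly what a
	 * concurrent MADV_GUARD_POISON on that page would then trip over */
	printf("page 1 resident: %d\n", vec[1] & 1);
	return 0;
}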

>> 
>> Trying to explain to a user 'hey this is for installing guard pages but if
>> there's a racing fault it'll fail and that could keep happening and then
>> you'll have to zap and maybe in a loop' just... seems like a bloody awful
>> interface?
> 
> Hope my example above made it clearer. This "retry forever until it 
> works" use case doesn't quite make sense to me, but I might just be 
> missing something important.
> 
> But again, I have to do more reading on the history of the current 
> approach ... and it's fairly late here.

Yeah see the RFC thread linked above.
Lorenzo Stoakes Oct. 21, 2024, 9:46 p.m. UTC | #18
On Mon, Oct 21, 2024 at 11:35:24PM +0200, Vlastimil Babka wrote:
> On 10/21/24 23:20, David Hildenbrand wrote:
> >> I don't think there's really any value in that. There's just no sensible
> >> situation in which a user would care about this I don't think.
> >
> > Making sure nobody touches an area, and while doing that somebody already
> > touched that area? I guess it could be worked around by
> > mprotect(PROT_NONE),madvise(GUARD),mprotect(PROT_READ|PROT_WRITE) ...
> > which is not particularly nice :)
> >
> >>
> >> And if you're saying 'hey do MADV_DONTNEED if this fails and keep trying!'
> >> then why not just do that in the kernel?
> >
> > Heh, no!
> >
> > If user space doesn't expect there to be something, it should *fail*.
> > That's likely going to be the majority of use cases for guard pages
> > (happy to be told otherwise). No retry.
> >
> > And if user space expects there to be something it should zap ahead of
> > time (which some allocators maybe already do to free up memory after
> > free()) to then install the guard. No retry.
> >
> > There is this case where user space might be unsure. There, it might
> > make sense to retry exactly once.
>
> I've thought so too and the RFC was implemented like this, but Jann came up
> with a scenario where a THP can cause the range including our
> to-be-installed guard pte to be populated even if the userspace is not
> trying to access that exact address, see here:
>
> https://lore.kernel.org/all/CAG48ez3vqbqyWb4bLdpqSUnhwqGo2OQetecNhEGPdCGDr94nbQ@mail.gmail.com/
>
> So unless we can *reliably* detect that userspace is really shooting
> itself in the foot and return a failure to install the guard pte *only* in that
> case (which would be useful), without horribly complicating everything to
> ensure that reliability and to avoid false positives due to races with
> THPs, then it's probably better to just retry as this version does.

It would be complicated, and I'd really like to avoid trying to detect
this. It feels a bit whack-a-mole because maybe there are other scenarios
we've not thought about that could be equally problematic?

>
> >>
> >> Trying to explain to a user 'hey this is for installing guard pages but if
> >> there's a racing fault it'll fail and that could keep happening and then
> >> you'll have to zap and maybe in a loop' just... seems like a bloody awful
> >> interface?
> >
> > Hope my example above made it clearer. This "retry forever until it
> > works" use case doesn't quite make sense to me, but I might just be
> > missing something important.
> >
> > But again, I have to do more reading on the history of the current
> > approach ... and it's fairly late here.
>
> Yeah see the RFC thread linked above.
>

Right, but I don't think this is the only scenario that can happen, and I
think, FWIW, yet again the fundamental point comes down to 'is it a
problem?'

Because if looping like this isn't a problem, then problem solved, we can all
high five and go home listening to the prodigy in full happiness.

If not then we can revisit.

And how could it be a problem? Surely only security or DoS
potential. Hopefully Jann can give some positive feedback on that.

We could also, and I hate to say it, but... try to find some means of
checking on reasonable forward progress in the loop if we had to or some
other 'reasonable attempt'.

But let's see...
Jann Horn Oct. 22, 2024, 7:08 p.m. UTC | #19
On Mon, Oct 21, 2024 at 10:46 PM Vlastimil Babka <vbabka@suse.cz> wrote:
> On 10/21/24 22:27, Lorenzo Stoakes wrote:
> > On Mon, Oct 21, 2024 at 10:11:29PM +0200, Vlastimil Babka wrote:
> >> On 10/20/24 18:20, Lorenzo Stoakes wrote:
> >> > +  while (true) {
> >> > +          /* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
> >> > +          err = walk_page_range_mm(vma->vm_mm, start, end,
> >> > +                                   &guard_poison_walk_ops, NULL);
> >> > +          if (err <= 0)
> >> > +                  return err;
> >> > +
> >> > +          /*
> >> > +           * OK some of the range have non-guard pages mapped, zap
> >> > +           * them. This leaves existing guard pages in place.
> >> > +           */
> >> > +          zap_page_range_single(vma, start, end - start, NULL);
> >>
> >> ... however the potentially endless loop doesn't seem great. Could a
> >> malicious program keep refaulting the range (ignoring any segfaults if it
> >> loses a race) with one thread while failing to make progress here with
> >> another thread? Is that ok because it would only punish itself?
> >
> > Sigh. Again, I don't think you've read the previous series have you? Or
> > even the changelog... I added this as Jann asked for it. Originally we'd
> > -EAGAIN if we got raced. See the discussion over in v1 for details.
> >
> > I did it that way specifically to avoid such things, but Jann didn't appear
> > to think it was a problem.
>
> If Jann is fine with this then it must be secure enough.

My thinking there was:

We can legitimately race with adjacent faults populating the area
we're operating on with THP pages; as long as the zapping and
poison-marker-setting are separate, *someone* will have to do the
retry. Either we do it in the kernel, or we tell userspace to handle
it, but having the kernel take care of it is preferable because it
makes the stable UAPI less messy.

One easy way to do it in the kernel would be to return -ERESTARTNOINTR
after the zap_page_range_single() instead of jumping back up, which in
terms of locking and signal handling and such would be equivalent to
looping in userspace (because really that's what -ERESTARTNOINTR does
- it returns out to userspace and moves the instruction pointer back
to restart the syscall). Though if we do that immediately, it might
make MADV_POISON unnecessarily slow, so we should probably retry once
before doing that. The other easy way is to just loop here.
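
Something roughly like this, as a sketch against the loop quoted above (the
retry count and its name are made up):

	int i;

	for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) {
		/* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
		err = walk_page_range_mm(vma->vm_mm, start, end,
					 &guard_poison_walk_ops, NULL);
		if (err <= 0)
			return err;

		/*
		 * Some of the range has non-guard pages mapped - zap them
		 * (leaving existing guard pages in place) and retry.
		 */
		zap_page_range_single(vma, start, end - start, NULL);
	}

	/*
	 * Still racing after a few attempts: restart the syscall from
	 * userspace instead, which drops mmap_lock and handles signals in
	 * between, so a misbehaving process can't hog the lock in here.
	 */
	return -ERESTARTNOINTR;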

The cond_resched() and pending fatal signal check mean that (except on
CONFIG_PREEMPT_NONE) the only differences between the current
implementation and looping in userspace are that we don't handle
non-fatal signals in between iterations and that we keep hogging the
mmap_lock in read mode. We do already have a bunch of codepaths that
retry on concurrent page table changes, like when zap_pte_range()
encounters a pte_offset_map_lock() failure; though I guess the
difference is that the retry on those is just a couple instructions,
which would be harder to race consistently, while here we redo walks
across the entire range, which should be fairly easy to race
repeatedly.

So I guess you have a point that this might be the easiest way to
stall other tasks that are trying to take mmap_lock for an extended
amount of time, I did not fully consider that... and then I guess you
could use that to slow down usercopy fault handling (once the lock
switches to handoff mode because of a stalled writer?) or slow down
other processes trying to read /proc/$pid/cmdline?

You can already indefinitely hog the mmap_lock with FUSE, though that
requires that you can mount a FUSE filesystem (which you wouldn't be
able to in reasonably sandboxed code) and that you can find something
like a pin_user_pages() call that can't drop the mmap lock in between,
and there aren't actually that many of those...

So I guess you have a point and the -ERESTARTNOINTR approach would be
a little bit nicer, as long as it's easy to implement.
David Hildenbrand Oct. 22, 2024, 7:18 p.m. UTC | #20
On 21.10.24 23:35, Vlastimil Babka wrote:
> On 10/21/24 23:20, David Hildenbrand wrote:
>>> I don't think there's really any value in that. There's just no sensible
>>> situation in which a user would care about this I don't think.
>>
>> Making sure nobody touches an area, and while doing that somebody already
>> touched that area? I guess it could be worked around by
>> mprotect(PROT_NONE),madvise(GUARD),mprotect(PROT_READ|PROT_WRITE) ...
>> which is not particularly nice :)
>>
>>>
>>> And if you're saying 'hey do MADV_DONTNEED if this fails and keep trying!'
>>> then why not just do that in the kernel?
>>
>> Heh, no!
>>
>> If user space doesn't expect there to be something, it should *fail*.
>> That's likely going to be the majority of use cases for guard pages
>> (happy to be told otherwise). No retry.
>>
>> And if user space expects there to be something it should zap ahead of
>> time (which some allocators maybe already do to free up memory after
>> free()) to then install the guard. No retry.
>>
>> There is this case where user space might be unsure. There, it might
>> make sense to retry exactly once.
> 
> I've thought so too and the RFC was implemented like this, but Jann came up
> with a scenario where a THP can cause the range including our
> to-be-installed guard pte to be populated even if the userspace is not
> trying to access that exact address, see here:
> 
> https://lore.kernel.org/all/CAG48ez3vqbqyWb4bLdpqSUnhwqGo2OQetecNhEGPdCGDr94nbQ@mail.gmail.com/

Ah, THP, I should have realized that myself. Yes indeed, in some cases 
we'll have to zap because something was already populated. Not sure how 
often it will happen in practice, will depend on the use case.

For use cases like "one static guard page every 2MiB", I would 
assume we install the guard pages early, before expecting any page 
faults in that area. Likely there are other ones where it might happen 
more frequently.
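
For the former, that would be a one-off at setup time, before anything in the
area is ever faulted in (sketch, constants made up, MADV_GUARD_POISON assumed
to be available in headers):

	/* one guard page at the top of every 2MiB chunk, installed up front
	 * so the optimistic install never finds anything to zap */
	size_t chunk = 2UL << 20, pg = 4096, total = 256 * chunk;
	char *area = mmap(NULL, total, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);

	for (size_t off = chunk - pg; off < total; off += chunk)
		madvise(area + off, pg, MADV_GUARD_POISON);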

For uffd that does not apply, because khugepaged backs off with uffd 
enabled and the only way to resolve a fault is using uffd -- which 
places exactly what was requested by user space. So, no populated PTEs 
without actual page faults on the corresponding virtual addresses.
Lorenzo Stoakes Oct. 22, 2024, 7:35 p.m. UTC | #21
On Tue, Oct 22, 2024 at 09:08:53PM +0200, Jann Horn wrote:
> On Mon, Oct 21, 2024 at 10:46 PM Vlastimil Babka <vbabka@suse.cz> wrote:
> > On 10/21/24 22:27, Lorenzo Stoakes wrote:
> > > On Mon, Oct 21, 2024 at 10:11:29PM +0200, Vlastimil Babka wrote:
> > >> On 10/20/24 18:20, Lorenzo Stoakes wrote:
> > >> > +  while (true) {
> > >> > +          /* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
> > >> > +          err = walk_page_range_mm(vma->vm_mm, start, end,
> > >> > +                                   &guard_poison_walk_ops, NULL);
> > >> > +          if (err <= 0)
> > >> > +                  return err;
> > >> > +
> > >> > +          /*
> > >> > +           * OK some of the range have non-guard pages mapped, zap
> > >> > +           * them. This leaves existing guard pages in place.
> > >> > +           */
> > >> > +          zap_page_range_single(vma, start, end - start, NULL);
> > >>
> > >> ... however the potentially endless loop doesn't seem great. Could a
> > >> malicious program keep refaulting the range (ignoring any segfaults if it
> > >> loses a race) with one thread while failing to make progress here with
> > >> another thread? Is that ok because it would only punish itself?
> > >
> > > Sigh. Again, I don't think you've read the previous series have you? Or
> > > even the changelog... I added this as Jann asked for it. Originally we'd
> > > -EAGAIN if we got raced. See the discussion over in v1 for details.
> > >
> > > I did it that way specifically to avoid such things, but Jann didn't appear
> > > to think it was a problem.
> >
> > If Jann is fine with this then it must be secure enough.
>
> My thinking there was:
>
> We can legitimately race with adjacent faults populating the area
> we're operating on with THP pages; as long as the zapping and
> poison-marker-setting are separate, *someone* will have to do the
> retry. Either we do it in the kernel, or we tell userspace to handle
> it, but having the kernel take care of it is preferable because it
> makes the stable UAPI less messy.
>
> One easy way to do it in the kernel would be to return -ERESTARTNOINTR
> after the zap_page_range_single() instead of jumping back up, which in
> terms of locking and signal handling and such would be equivalent to
> looping in userspace (because really that's what -ERESTARTNOINTR does
> - it returns out to userspace and moves the instruction pointer back
> to restart the syscall). Though if we do that immediately, it might
> make MADV_POISON unnecessarily slow, so we should probably retry once
> before doing that. The other easy way is to just loop here.

Yes we should definitely retry probably a few times to cover the rare
situation of a THP race as you describe under non-abusive circumstances.

>
> The cond_resched() and pending fatal signal check mean that (except on
> CONFIG_PREEMPT_NONE) the only differences between the current
> implementation and looping in userspace are that we don't handle
> non-fatal signals in between iterations and that we keep hogging the
> mmap_lock in read mode. We do already have a bunch of codepaths that
> retry on concurrent page table changes, like when zap_pte_range()
> encounters a pte_offset_map_lock() failure; though I guess the
> difference is that the retry on those is just a couple instructions,
> which would be harder to race consistently, while here we redo walks
> across the entire range, which should be fairly easy to race
> repeatedly.
>
> So I guess you have a point that this might be the easiest way to
> stall other tasks that are trying to take mmap_lock for an extended
> amount of time, I did not fully consider that... and then I guess you
> could use that to slow down usercopy fault handling (once the lock
> switches to handoff mode because of a stalled writer?) or slow down
> other processes trying to read /proc/$pid/cmdline?

Hm does that need a write lock?

>
> You can already indefinitely hog the mmap_lock with FUSE, though that
> requires that you can mount a FUSE filesystem (which you wouldn't be
> able in reasonably sandboxed code) and that you can find something
> like a pin_user_pages() call that can't drop the mmap lock in between,
> and there aren't actually that many of those...
>
> So I guess you have a point and the -ERESTARTNOINTR approach would be
> a little bit nicer, as long as it's easy to implement.

I can go ahead and do it that way if nobody objects, with a few loops
before we do it... which hopefully covers off all the concerns?

Thanks
Jann Horn Oct. 22, 2024, 7:57 p.m. UTC | #22
On Tue, Oct 22, 2024 at 9:35 PM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
> On Tue, Oct 22, 2024 at 09:08:53PM +0200, Jann Horn wrote:
> > On Mon, Oct 21, 2024 at 10:46 PM Vlastimil Babka <vbabka@suse.cz> wrote:
> > > On 10/21/24 22:27, Lorenzo Stoakes wrote:
> > > > On Mon, Oct 21, 2024 at 10:11:29PM +0200, Vlastimil Babka wrote:
> > > >> On 10/20/24 18:20, Lorenzo Stoakes wrote:
> > > >> > +  while (true) {
> > > >> > +          /* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
> > > >> > +          err = walk_page_range_mm(vma->vm_mm, start, end,
> > > >> > +                                   &guard_poison_walk_ops, NULL);
> > > >> > +          if (err <= 0)
> > > >> > +                  return err;
> > > >> > +
> > > >> > +          /*
> > > >> > +           * OK some of the range have non-guard pages mapped, zap
> > > >> > +           * them. This leaves existing guard pages in place.
> > > >> > +           */
> > > >> > +          zap_page_range_single(vma, start, end - start, NULL);
> > > >>
> > > >> ... however the potentially endless loop doesn't seem great. Could a
> > > >> malicious program keep refaulting the range (ignoring any segfaults if it
> > > >> loses a race) with one thread while failing to make progress here with
> > > >> another thread? Is that ok because it would only punish itself?
> > > >
> > > > Sigh. Again, I don't think you've read the previous series have you? Or
> > > > even the changelog... I added this as Jann asked for it. Originally we'd
> > > > -EAGAIN if we got raced. See the discussion over in v1 for details.
> > > >
> > > > I did it that way specifically to avoid such things, but Jann didn't appear
> > > > to think it was a problem.
> > >
> > > If Jann is fine with this then it must be secure enough.
> >
> > My thinking there was:
> >
> > We can legitimately race with adjacent faults populating the area
> > we're operating on with THP pages; as long as the zapping and
> > poison-marker-setting are separate, *someone* will have to do the
> > retry. Either we do it in the kernel, or we tell userspace to handle
> > it, but having the kernel take care of it is preferable because it
> > makes the stable UAPI less messy.
> >
> > One easy way to do it in the kernel would be to return -ERESTARTNOINTR
> > after the zap_page_range_single() instead of jumping back up, which in
> > terms of locking and signal handling and such would be equivalent to
> > looping in userspace (because really that's what -ERESTARTNOINTR does
> > - it returns out to userspace and moves the instruction pointer back
> > to restart the syscall). Though if we do that immediately, it might
> > make MADV_POISON unnecessarily slow, so we should probably retry once
> > before doing that. The other easy way is to just loop here.
>
> Yes we should definitely retry probably a few times to cover the rare
> situation of a THP race as you describe under non-abusive circumstances.
>
> >
> > The cond_resched() and pending fatal signal check mean that (except on
> > CONFIG_PREEMPT_NONE) the only differences between the current
> > implementation and looping in userspace are that we don't handle
> > non-fatal signals in between iterations and that we keep hogging the
> > mmap_lock in read mode. We do already have a bunch of codepaths that
> > retry on concurrent page table changes, like when zap_pte_range()
> > encounters a pte_offset_map_lock() failure; though I guess the
> > difference is that the retry on those is just a couple instructions,
> > which would be harder to race consistently, while here we redo walks
> > across the entire range, which should be fairly easy to race
> > repeatedly.
> >
> > So I guess you have a point that this might be the easiest way to
> > stall other tasks that are trying to take mmap_lock for an extended
> > amount of time, I did not fully consider that... and then I guess you
> > could use that to slow down usercopy fault handling (once the lock
> > switches to handoff mode because of a stalled writer?) or slow down
> > other processes trying to read /proc/$pid/cmdline?
>
> Hm does that need a write lock?

No, but if you have one reader that is hogging the rwsem, and then a
writer is queued up on the rwsem afterwards, I think new readers will
sometimes be queued up behind the writer. So even though the rwsem is
only actually held by a reader, new readers can't immediately take the
rwsem because the rwsem code thinks that would be unfair to a pending
writer who just wants to make some quick change. I'm not super
familiar with this code, but basically I think it works roughly like
this:

If the rwsem code notices that a bunch of readers are preventing a
writer from taking the lock, the rwsem code will start queuing new
readers behind the queued writer. You can see in rwsem_read_trylock()
that the trylock fastpath is skipped if anyone is waiting on the rwsem
or the handoff flag is set, and in rwsem_down_read_slowpath() the
"reader optimistic lock stealing" path is skipped if the lock is
currently held by multiple readers or if the handoff bit is set.

The handoff bit can be set in rwsem_try_write_lock() by a writer "if
it is an RT task or wait in the wait queue for too long". Basically I
think it means something like "I think other users of the lock are
hogging it more than they should, stop stealing the lock from me".
And the RWSEM_WAIT_TIMEOUT for triggering handoff mode is pretty
short, RWSEM_WAIT_TIMEOUT is defined to something like 4ms, so I think
that's how long writers tolerate the lock being hogged by readers
before they prevent new readers from stealing the lock.
Lorenzo Stoakes Oct. 22, 2024, 8:45 p.m. UTC | #23
On Tue, Oct 22, 2024 at 09:57:39PM +0200, Jann Horn wrote:
> On Tue, Oct 22, 2024 at 9:35 PM Lorenzo Stoakes
> <lorenzo.stoakes@oracle.com> wrote:
> > On Tue, Oct 22, 2024 at 09:08:53PM +0200, Jann Horn wrote:
> > > On Mon, Oct 21, 2024 at 10:46 PM Vlastimil Babka <vbabka@suse.cz> wrote:
> > > > On 10/21/24 22:27, Lorenzo Stoakes wrote:
> > > > > On Mon, Oct 21, 2024 at 10:11:29PM +0200, Vlastimil Babka wrote:
> > > > >> On 10/20/24 18:20, Lorenzo Stoakes wrote:
> > > > >> > +  while (true) {
> > > > >> > +          /* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
> > > > >> > +          err = walk_page_range_mm(vma->vm_mm, start, end,
> > > > >> > +                                   &guard_poison_walk_ops, NULL);
> > > > >> > +          if (err <= 0)
> > > > >> > +                  return err;
> > > > >> > +
> > > > >> > +          /*
> > > > >> > +           * OK some of the range have non-guard pages mapped, zap
> > > > >> > +           * them. This leaves existing guard pages in place.
> > > > >> > +           */
> > > > >> > +          zap_page_range_single(vma, start, end - start, NULL);
> > > > >>
> > > > >> ... however the potentially endless loop doesn't seem great. Could a
> > > > >> malicious program keep refaulting the range (ignoring any segfaults if it
> > > > >> loses a race) with one thread while failing to make progress here with
> > > > >> another thread? Is that ok because it would only punish itself?
> > > > >
> > > > > Sigh. Again, I don't think you've read the previous series have you? Or
> > > > > even the changelog... I added this as Jann asked for it. Originally we'd
> > > > > -EAGAIN if we got raced. See the discussion over in v1 for details.
> > > > >
> > > > > I did it that way specifically to avoid such things, but Jann didn't appear
> > > > > to think it was a problem.
> > > >
> > > > If Jann is fine with this then it must be secure enough.
> > >
> > > My thinking there was:
> > >
> > > We can legitimately race with adjacent faults populating the area
> > > we're operating on with THP pages; as long as the zapping and
> > > poison-marker-setting are separate, *someone* will have to do the
> > > retry. Either we do it in the kernel, or we tell userspace to handle
> > > it, but having the kernel take care of it is preferable because it
> > > makes the stable UAPI less messy.
> > >
> > > One easy way to do it in the kernel would be to return -ERESTARTNOINTR
> > > after the zap_page_range_single() instead of jumping back up, which in
> > > terms of locking and signal handling and such would be equivalent to
> > > looping in userspace (because really that's what -ERESTARTNOINTR does
> > > - it returns out to userspace and moves the instruction pointer back
> > > to restart the syscall). Though if we do that immediately, it might
> > > make MADV_POISON unnecessarily slow, so we should probably retry once
> > > before doing that. The other easy way is to just loop here.
> >
> > Yes we should definitely retry probably a few times to cover the rare
> > situation of a THP race as you describe under non-abusive circumstances.
> >
> > >
> > > The cond_resched() and pending fatal signal check mean that (except on
> > > CONFIG_PREEMPT_NONE) the only differences between the current
> > > implementation and looping in userspace are that we don't handle
> > > non-fatal signals in between iterations and that we keep hogging the
> > > mmap_lock in read mode. We do already have a bunch of codepaths that
> > > retry on concurrent page table changes, like when zap_pte_range()
> > > encounters a pte_offset_map_lock() failure; though I guess the
> > > difference is that the retry on those is just a couple instructions,
> > > which would be harder to race consistently, while here we redo walks
> > > across the entire range, which should be fairly easy to race
> > > repeatedly.
> > >
> > > So I guess you have a point that this might be the easiest way to
> > > stall other tasks that are trying to take mmap_lock for an extended
> > > amount of time, I did not fully consider that... and then I guess you
> > > could use that to slow down usercopy fault handling (once the lock
> > > switches to handoff mode because of a stalled writer?) or slow down
> > > other processes trying to read /proc/$pid/cmdline?
> >
> > Hm does that need a write lock?
>
> No, but if you have one reader that is hogging the rwsem, and then a
> writer is queued up on the rwsem afterwards, I think new readers will
> sometimes be queued up behind the writer. So even though the rwsem is
> only actually held by a reader, new readers can't immediately take the
> rwsem because the rwsem code thinks that would be unfair to a pending
> writer who just wants to make some quick change. I'm not super
> familiar with this code, but basically I think it works roughly like
> this:
>
> If the rwsem code notices that a bunch of readers are preventing a
> writer from taking the lock, the rwsem code will start queuing new
> readers behind the queued writer. You can see in rwsem_read_trylock()
> that the trylock fastpath is skipped if anyone is waiting on the rwsem
> or the handoff flag is set, and in rwsem_down_read_slowpath() the
> "reader optimistic lock stealing" path is skipped if the lock is
> currently held by multiple readers or if the handoff bit is set.
>
> The handoff bit can be set in rwsem_try_write_lock() by a writer "if
> it is an RT task or wait in the wait queue for too long". Basically I
> think it means something like "I think other users of the lock are
> hogging it more than they should, stop stealing the lock from me".
> And the RWSEM_WAIT_TIMEOUT for triggering handoff mode is pretty
> short, RWSEM_WAIT_TIMEOUT is defined to something like 4ms, so I think
> that's how long writers tolerate the lock being hogged by readers
> before they prevent new readers from stealing the lock.

Ack, makes sense! -ERESTARTNOINTR should help resolve this, so unless anybody
has any objection to that I'll go ahead and do a respin taking that approach
(+ all other fixes) for v2.

Thanks for your input!

Patch

diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index 763929e814e9..71e13f27742d 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -78,6 +78,9 @@ 
 
 #define MADV_COLLAPSE	25		/* Synchronous hugepage collapse */
 
+#define MADV_GUARD_POISON 102		/* fatal signal on access to range */
+#define MADV_GUARD_UNPOISON 103		/* revoke guard poisoning */
+
 /* compatibility flags */
 #define MAP_FILE	0
 
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index 9c48d9a21aa0..1a2222322f77 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -105,6 +105,9 @@ 
 
 #define MADV_COLLAPSE	25		/* Synchronous hugepage collapse */
 
+#define MADV_GUARD_POISON 102		/* fatal signal on access to range */
+#define MADV_GUARD_UNPOISON 103		/* revoke guard poisoning */
+
 /* compatibility flags */
 #define MAP_FILE	0
 
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index 68c44f99bc93..380905522397 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -75,6 +75,9 @@ 
 #define MADV_HWPOISON     100		/* poison a page for testing */
 #define MADV_SOFT_OFFLINE 101		/* soft offline page for testing */
 
+#define MADV_GUARD_POISON 102		/* fatal signal on access to range */
+#define MADV_GUARD_UNPOISON 103		/* revoke guard poisoning */
+
 /* compatibility flags */
 #define MAP_FILE	0
 
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index 1ff0c858544f..e8d5affceb28 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -113,6 +113,9 @@ 
 
 #define MADV_COLLAPSE	25		/* Synchronous hugepage collapse */
 
+#define MADV_GUARD_POISON 102		/* fatal signal on access to range */
+#define MADV_GUARD_UNPOISON 103		/* revoke guard poisoning */
+
 /* compatibility flags */
 #define MAP_FILE	0
 
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 6ce1f1ceb432..5dfd3d442de4 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -79,6 +79,9 @@ 
 
 #define MADV_COLLAPSE	25		/* Synchronous hugepage collapse */
 
+#define MADV_GUARD_POISON 102		/* fatal signal on access to range */
+#define MADV_GUARD_UNPOISON 103		/* revoke guard poisoning */
+
 /* compatibility flags */
 #define MAP_FILE	0
 
diff --git a/mm/madvise.c b/mm/madvise.c
index e871a72a6c32..7b9a357b84d2 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -60,6 +60,8 @@  static int madvise_need_mmap_write(int behavior)
 	case MADV_POPULATE_READ:
 	case MADV_POPULATE_WRITE:
 	case MADV_COLLAPSE:
+	case MADV_GUARD_POISON:
+	case MADV_GUARD_UNPOISON:
 		return 0;
 	default:
 		/* be safe, default to 1. list exceptions explicitly */
@@ -1017,6 +1019,166 @@  static long madvise_remove(struct vm_area_struct *vma,
 	return error;
 }
 
+static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked)
+{
+	vm_flags_t disallowed = VM_SPECIAL | VM_HUGETLB;
+
+	/*
+	 * A user could lock after poisoning but that's fine, as they'd not be
+	 * able to fault in. The issue arises when we try to zap existing locked
+	 * VMAs. We don't want to do that.
+	 */
+	if (!allow_locked)
+		disallowed |= VM_LOCKED;
+
+	if (!vma_is_anonymous(vma))
+		return false;
+
+	if ((vma->vm_flags & (VM_MAYWRITE | disallowed)) != VM_MAYWRITE)
+		return false;
+
+	return true;
+}
+
+static bool is_guard_pte_marker(pte_t ptent)
+{
+	return is_pte_marker(ptent) &&
+		is_guard_swp_entry(pte_to_swp_entry(ptent));
+}
+
+static int guard_poison_pud_entry(pud_t *pud, unsigned long addr, unsigned long next,
+				  struct mm_walk *walk)
+{
+	pud_t pudval = pudp_get(pud);
+
+	/* Do not split a huge pud - we do nothing with these so just ignore. */
+	if (pud_trans_huge(pudval) || pud_devmap(pudval))
+		walk->action = ACTION_CONTINUE;
+
+	return 0;
+}
+
+static int guard_poison_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
+				  struct mm_walk *walk)
+{
+	pmd_t pmdval = pmdp_get(pmd);
+
+	/* Do not split a huge pmd - we do nothing with these so just ignore. */
+	if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval))
+		walk->action = ACTION_CONTINUE;
+
+	return 0;
+}
+
+static int guard_poison_pte_entry(pte_t *pte, unsigned long addr,
+				  unsigned long next, struct mm_walk *walk)
+{
+	pte_t pteval = ptep_get(pte);
+
+	/*
+	 * If not a guard marker, simply abort the operation. We return a value
+	 * > 0 indicating a non-error abort.
+	 */
+	return !is_guard_pte_marker(pteval);
+}
+
+static int guard_poison_install_pte(unsigned long addr, unsigned long next,
+				    pte_t *ptep, struct mm_walk *walk)
+{
+	/* Simply install a PTE marker, this causes segfault on access. */
+	*ptep = make_pte_marker(PTE_MARKER_GUARD);
+
+	return 0;
+}
+
+static const struct mm_walk_ops guard_poison_walk_ops = {
+	.pud_entry		= guard_poison_pud_entry,
+	.pmd_entry		= guard_poison_pmd_entry,
+	.pte_entry		= guard_poison_pte_entry,
+	.install_pte		= guard_poison_install_pte,
+	.walk_lock		= PGWALK_RDLOCK,
+};
+
+static long madvise_guard_poison(struct vm_area_struct *vma,
+				 struct vm_area_struct **prev,
+				 unsigned long start, unsigned long end)
+{
+	long err;
+
+	*prev = vma;
+	if (!is_valid_guard_vma(vma, /* allow_locked = */false))
+		return -EINVAL;
+
+	/*
+	 * If we install poison markers, then the range is no longer
+	 * empty from a page table perspective and therefore it's
+	 * appropriate to have an anon_vma.
+	 *
+	 * This ensures that on fork, we copy page tables correctly.
+	 */
+	err = anon_vma_prepare(vma);
+	if (err)
+		return err;
+
+	/*
+	 * Optimistically try to install the guard poison pages first. If any
+	 * non-guard pages are encountered, give up and zap the range before
+	 * trying again.
+	 */
+	while (true) {
+		/* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
+		err = walk_page_range_mm(vma->vm_mm, start, end,
+					 &guard_poison_walk_ops, NULL);
+		if (err <= 0)
+			return err;
+
+		/*
+		 * OK, some of the range has non-guard pages mapped, so zap
+		 * them. This leaves existing guard pages in place.
+		 */
+		zap_page_range_single(vma, start, end - start, NULL);
+
+		if (fatal_signal_pending(current))
+			return -EINTR;
+		cond_resched();
+	}
+}
+
+static int guard_unpoison_pte_entry(pte_t *pte, unsigned long addr,
+				    unsigned long next, struct mm_walk *walk)
+{
+	pte_t ptent = ptep_get(pte);
+
+	if (is_guard_pte_marker(ptent)) {
+		/* Simply clear the PTE marker. */
+		pte_clear_not_present_full(walk->mm, addr, pte, false);
+		update_mmu_cache(walk->vma, addr, pte);
+	}
+
+	return 0;
+}
+
+static const struct mm_walk_ops guard_unpoison_walk_ops = {
+	.pte_entry		= guard_unpoison_pte_entry,
+	.walk_lock		= PGWALK_RDLOCK,
+};
+
+static long madvise_guard_unpoison(struct vm_area_struct *vma,
+				   struct vm_area_struct **prev,
+				   unsigned long start, unsigned long end)
+{
+	*prev = vma;
+	/*
+	 * We're ok with unpoisoning mlock()'d ranges, as this is a
+	 * non-destructive action.
+	 */
+	if (!is_valid_guard_vma(vma, /* allow_locked = */true))
+		return -EINVAL;
+
+	return walk_page_range(vma->vm_mm, start, end,
+			       &guard_unpoison_walk_ops, NULL);
+}
+
 /*
  * Apply an madvise behavior to a region of a vma.  madvise_update_vma
  * will handle splitting a vm area into separate areas, each area with its own
@@ -1098,6 +1260,10 @@  static int madvise_vma_behavior(struct vm_area_struct *vma,
 		break;
 	case MADV_COLLAPSE:
 		return madvise_collapse(vma, prev, start, end);
+	case MADV_GUARD_POISON:
+		return madvise_guard_poison(vma, prev, start, end);
+	case MADV_GUARD_UNPOISON:
+		return madvise_guard_unpoison(vma, prev, start, end);
 	}
 
 	anon_name = anon_vma_name(vma);
@@ -1197,6 +1363,8 @@  madvise_behavior_valid(int behavior)
 	case MADV_DODUMP:
 	case MADV_WIPEONFORK:
 	case MADV_KEEPONFORK:
+	case MADV_GUARD_POISON:
+	case MADV_GUARD_UNPOISON:
 #ifdef CONFIG_MEMORY_FAILURE
 	case MADV_SOFT_OFFLINE:
 	case MADV_HWPOISON:
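
Again purely illustrative (not from the patch): a hedged sketch of the userspace-visible semantics the madvise.c changes above implement - touching a poisoned range faults, and after MADV_GUARD_UNPOISON the same page faults in as ordinary zero-filled anonymous memory. Catching SIGSEGV here is only to keep the demo in a single process; the fallback #defines are assumptions.

#define _GNU_SOURCE
#include <setjmp.h>
#include <signal.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_GUARD_POISON
#define MADV_GUARD_POISON	102	/* values from the UAPI hunks above */
#define MADV_GUARD_UNPOISON	103
#endif

static sigjmp_buf env;

static void on_segv(int sig)
{
	(void)sig;
	siglongjmp(env, 1);
}

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	struct sigaction sa = { .sa_handler = on_segv };
	char *p;

	p = mmap(NULL, page, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED || madvise(p, page, MADV_GUARD_POISON))
		return 1;

	sigaction(SIGSEGV, &sa, NULL);

	if (sigsetjmp(env, 1) == 0) {
		(void)*(volatile char *)p;	/* expected to fault on the guard marker */
		puts("unexpected: guarded page was readable");
	} else {
		puts("got SIGSEGV on the guarded page, as expected");
	}

	/* Remove the guard: the page should now fault in as ordinary zeroed memory. */
	if (madvise(p, page, MADV_GUARD_UNPOISON))
		return 1;
	printf("after unpoison, *p == %d\n", *p);

	munmap(p, page);
	return 0;
}
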
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 0c5d6d06107d..d0e3ebfadef8 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -236,7 +236,8 @@  static long change_pte_range(struct mmu_gather *tlb,
 			} else if (is_pte_marker_entry(entry)) {
 				/*
 				 * Ignore error swap entries unconditionally,
-				 * because any access should sigbus anyway.
+				 * because any access should sigbus/sigsegv
+				 * anyway.
 				 */
 				if (is_poisoned_swp_entry(entry))
 					continue;
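
One more hedged sketch (not from the patch) for the mseal.c hunk that follows: adding MADV_GUARD_POISON to is_madv_discard() should make sealed mappings refuse guard installation, since installing guards can zap existing pages. Exactly which sealed mappings are covered, and which errno is returned, is decided by mseal's own checks outside this series, so the sketch only reports the outcome; the raw __NR_mseal syscall and the fallback #define are assumptions.

#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef MADV_GUARD_POISON
#define MADV_GUARD_POISON 102	/* value from the UAPI hunks above */
#endif

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);

	/* Read-only private anonymous memory, the sort of mapping mseal guards. */
	char *p = mmap(NULL, page, PROT_READ,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;

#ifdef __NR_mseal
	/* mseal(2) may lack a libc wrapper; the raw syscall is an assumption. */
	if (syscall(__NR_mseal, p, page, 0))
		perror("mseal");
#endif

	/* With the mseal.c hunk below, this is expected to be refused (likely EPERM). */
	if (madvise(p, page, MADV_GUARD_POISON))
		printf("MADV_GUARD_POISON refused: %s\n", strerror(errno));
	else
		printf("MADV_GUARD_POISON succeeded\n");

	return 0;
}
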
diff --git a/mm/mseal.c b/mm/mseal.c
index ece977bd21e1..21bf5534bcf5 100644
--- a/mm/mseal.c
+++ b/mm/mseal.c
@@ -30,6 +30,7 @@  static bool is_madv_discard(int behavior)
 	case MADV_REMOVE:
 	case MADV_DONTFORK:
 	case MADV_WIPEONFORK:
+	case MADV_GUARD_POISON:
 		return true;
 	}