diff mbox series

[v2,2/5] mm: add PTE_MARKER_GUARD PTE marker

Message ID 081837b697a98c7fa5832542b20f603d49e0b557.1729440856.git.lorenzo.stoakes@oracle.com (mailing list archive)
State New
Headers show
Series implement lightweight guard pages | expand

Commit Message

Lorenzo Stoakes Oct. 20, 2024, 4:20 p.m. UTC
Add a new PTE marker that results in any access causing the accessing
process to segfault.

This is preferable to PTE_MARKER_POISONED, which results in the same
handling as hardware poisoned memory, and is thus undesirable for cases
where we simply wish to 'soft' poison a range.

This is in preparation for implementing the ability to specify guard pages
at the page table level, i.e. ranges that, when accessed, should cause
process termination.

Additionally, rename zap_drop_file_uffd_wp() to zap_drop_markers() - the
function checks the ZAP_FLAG_DROP_MARKER flag so naming it for this single
purpose was simply incorrect.

We then reuse the same logic to determine whether a zap should clear a
guard entry - this should only be performed on teardown and never on
MADV_DONTNEED or the like.

Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
 include/linux/mm_inline.h |  2 +-
 include/linux/swapops.h   | 26 ++++++++++++++++++++++++--
 mm/hugetlb.c              |  3 +++
 mm/memory.c               | 18 +++++++++++++++---
 4 files changed, 43 insertions(+), 6 deletions(-)

Comments

Vlastimil Babka Oct. 21, 2024, 1:45 p.m. UTC | #1
On 10/20/24 18:20, Lorenzo Stoakes wrote:
> Add a new PTE marker that results in any access causing the accessing
> process to segfault.

Should we distinguish it from other segfaults? Is there a way? I can see
memory protection keys use SEGV_PKUERR, but no idea if we have any free values.

> This is preferable to PTE_MARKER_POISONED, which results in the same
> handling as hardware poisoned memory, and is thus undesirable for cases
> where we simply wish to 'soft' poison a range.
> 
> This is in preparation for implementing the ability to specify guard pages
> at the page table level, i.e. ranges that, when accessed, should cause
> process termination.
> 
> Additionally, rename zap_drop_file_uffd_wp() to zap_drop_markers() - the
> function checks the ZAP_FLAG_DROP_MARKER flag so naming it for this single
> purpose was simply incorrect.
> 
> We then reuse the same logic to determine whether a zap should clear a
> guard entry - this should only be performed on teardown and never on
> MADV_DONTNEED or the like.
> 
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>

Acked-by: Vlastimil Babka <vbabka@suse.cz>

A nit below:

> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 906294ac85dc..50e3f6ed73ac 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -6353,6 +6353,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  				ret = VM_FAULT_HWPOISON_LARGE |
>  				      VM_FAULT_SET_HINDEX(hstate_index(h));
>  				goto out_mutex;
> +			} else if (marker & PTE_MARKER_GUARD) {
> +				ret = VM_FAULT_SIGSEGV;
> +				goto out_mutex;

Given we don't support hugetlb, should we WARN_ON_ONCE() if such unexpected
marker appears there?

>  			}
>  		}
>  
> diff --git a/mm/memory.c b/mm/memory.c
> index 0f614523b9f4..551455cd453f 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1455,7 +1455,7 @@ static inline bool should_zap_folio(struct zap_details *details,
>  	return !folio_test_anon(folio);
>  }
>  
> -static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
> +static inline bool zap_drop_markers(struct zap_details *details)
>  {
>  	if (!details)
>  		return false;
> @@ -1476,7 +1476,7 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
>  	if (vma_is_anonymous(vma))
>  		return;
>  
> -	if (zap_drop_file_uffd_wp(details))
> +	if (zap_drop_markers(details))
>  		return;
>  
>  	for (;;) {
> @@ -1671,7 +1671,15 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
>  			 * drop the marker if explicitly requested.
>  			 */
>  			if (!vma_is_anonymous(vma) &&
> -			    !zap_drop_file_uffd_wp(details))
> +			    !zap_drop_markers(details))
> +				continue;
> +		} else if (is_guard_swp_entry(entry)) {
> +			/*
> +			 * Ordinary zapping should not remove guard PTE
> +			 * markers. Only do so if we should remove PTE markers
> +			 * in general.
> +			 */
> +			if (!zap_drop_markers(details))
>  				continue;
>  		} else if (is_hwpoison_entry(entry) ||
>  			   is_poisoned_swp_entry(entry)) {
> @@ -4003,6 +4011,10 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
>  	if (marker & PTE_MARKER_POISONED)
>  		return VM_FAULT_HWPOISON;
>  
> +	/* Hitting a guard page is always a fatal condition. */
> +	if (marker & PTE_MARKER_GUARD)
> +		return VM_FAULT_SIGSEGV;
> +
>  	if (pte_marker_entry_uffd_wp(entry))
>  		return pte_marker_handle_uffd_wp(vmf);
>
Vlastimil Babka Oct. 21, 2024, 2:13 p.m. UTC | #2
On 10/20/24 18:20, Lorenzo Stoakes wrote:
> Add a new PTE marker that results in any access causing the accessing
> process to segfault.
> 
> This is preferable to PTE_MARKER_POISONED, which results in the same
> handling as hardware poisoned memory, and is thus undesirable for cases
> where we simply wish to 'soft' poison a range.
> 
> This is in preparation for implementing the ability to specify guard pages
> at the page table level, i.e. ranges that, when accessed, should cause
> process termination.
> 
> Additionally, rename zap_drop_file_uffd_wp() to zap_drop_markers() - the
> function checks the ZAP_FLAG_DROP_MARKER flag so naming it for this single
> purpose was simply incorrect.
> 
> We then reuse the same logic to determine whether a zap should clear a
> guard entry - this should only be performed on teardown and never on
> MADV_DONTNEED or the like.

Since I would have personally put MADV_FREE among "or the like" here, it's
surprising to me that it in fact it's tearing down the guard entries now. Is
that intentional? It should be at least mentioned very explicitly. But I'd
really argue against it, as MADV_FREE is to me a weaker form of
MADV_DONTNEED - the existing pages are not zapped immediately but
prioritized for reclaim. If MADV_DONTNEED leaves guard PTEs in place, why
shouldn't MADV_FREE too?

Seems to me rather currently an artifact of MADV_FREE implementation - if it
encounters hwpoison entries it will tear them down because why not, we have
detected a hw memory error and are lucky the program wants to discard the
pages and not access them, so best use the opportunity and get rid of the
PTE entries immediately (if MADV_DONTNEED doesn't do that too, it certainly
could).

But to extend this to guard PTEs which are result of an explicit userspace
action feels wrong, unless the semantics is the same for MADV_DONTEED. The
semantics chosen for MADV_DONTNEED makes sense, so MADV_FREE should behave
the same?

> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
>  include/linux/mm_inline.h |  2 +-
>  include/linux/swapops.h   | 26 ++++++++++++++++++++++++--
>  mm/hugetlb.c              |  3 +++
>  mm/memory.c               | 18 +++++++++++++++---
>  4 files changed, 43 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
> index 355cf46a01a6..1b6a917fffa4 100644
> --- a/include/linux/mm_inline.h
> +++ b/include/linux/mm_inline.h
> @@ -544,7 +544,7 @@ static inline pte_marker copy_pte_marker(
>  {
>  	pte_marker srcm = pte_marker_get(entry);
>  	/* Always copy error entries. */
> -	pte_marker dstm = srcm & PTE_MARKER_POISONED;
> +	pte_marker dstm = srcm & (PTE_MARKER_POISONED | PTE_MARKER_GUARD);
>  
>  	/* Only copy PTE markers if UFFD register matches. */
>  	if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma))
> diff --git a/include/linux/swapops.h b/include/linux/swapops.h
> index cb468e418ea1..4d0606df0791 100644
> --- a/include/linux/swapops.h
> +++ b/include/linux/swapops.h
> @@ -426,9 +426,15 @@ typedef unsigned long pte_marker;
>   * "Poisoned" here is meant in the very general sense of "future accesses are
>   * invalid", instead of referring very specifically to hardware memory errors.
>   * This marker is meant to represent any of various different causes of this.
> + *
> + * Note that, when encountered by the faulting logic, PTEs with this marker will
> + * result in VM_FAULT_HWPOISON and thus regardless trigger hardware memory error
> + * logic.
>   */
>  #define  PTE_MARKER_POISONED			BIT(1)
> -#define  PTE_MARKER_MASK			(BIT(2) - 1)
> +/* Indicates that, on fault, this PTE will case a SIGSEGV signal to be sent. */
> +#define  PTE_MARKER_GUARD			BIT(2)
> +#define  PTE_MARKER_MASK			(BIT(3) - 1)
>  
>  static inline swp_entry_t make_pte_marker_entry(pte_marker marker)
>  {
> @@ -461,9 +467,25 @@ static inline swp_entry_t make_poisoned_swp_entry(void)
>  }
>  
>  static inline int is_poisoned_swp_entry(swp_entry_t entry)
> +{
> +	/*
> +	 * We treat guard pages as poisoned too as these have the same semantics
> +	 * as poisoned ranges, only with different fault handling.
> +	 */
> +	return is_pte_marker_entry(entry) &&
> +		(pte_marker_get(entry) &
> +		 (PTE_MARKER_POISONED | PTE_MARKER_GUARD));
> +}
> +
> +static inline swp_entry_t make_guard_swp_entry(void)
> +{
> +	return make_pte_marker_entry(PTE_MARKER_GUARD);
> +}
> +
> +static inline int is_guard_swp_entry(swp_entry_t entry)
>  {
>  	return is_pte_marker_entry(entry) &&
> -	    (pte_marker_get(entry) & PTE_MARKER_POISONED);
> +		(pte_marker_get(entry) & PTE_MARKER_GUARD);
>  }
>  
>  /*
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 906294ac85dc..50e3f6ed73ac 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -6353,6 +6353,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  				ret = VM_FAULT_HWPOISON_LARGE |
>  				      VM_FAULT_SET_HINDEX(hstate_index(h));
>  				goto out_mutex;
> +			} else if (marker & PTE_MARKER_GUARD) {
> +				ret = VM_FAULT_SIGSEGV;
> +				goto out_mutex;
>  			}
>  		}
>  
> diff --git a/mm/memory.c b/mm/memory.c
> index 0f614523b9f4..551455cd453f 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1455,7 +1455,7 @@ static inline bool should_zap_folio(struct zap_details *details,
>  	return !folio_test_anon(folio);
>  }
>  
> -static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
> +static inline bool zap_drop_markers(struct zap_details *details)
>  {
>  	if (!details)
>  		return false;
> @@ -1476,7 +1476,7 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
>  	if (vma_is_anonymous(vma))
>  		return;
>  
> -	if (zap_drop_file_uffd_wp(details))
> +	if (zap_drop_markers(details))
>  		return;
>  
>  	for (;;) {
> @@ -1671,7 +1671,15 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
>  			 * drop the marker if explicitly requested.
>  			 */
>  			if (!vma_is_anonymous(vma) &&
> -			    !zap_drop_file_uffd_wp(details))
> +			    !zap_drop_markers(details))
> +				continue;
> +		} else if (is_guard_swp_entry(entry)) {
> +			/*
> +			 * Ordinary zapping should not remove guard PTE
> +			 * markers. Only do so if we should remove PTE markers
> +			 * in general.
> +			 */
> +			if (!zap_drop_markers(details))
>  				continue;
>  		} else if (is_hwpoison_entry(entry) ||
>  			   is_poisoned_swp_entry(entry)) {
> @@ -4003,6 +4011,10 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
>  	if (marker & PTE_MARKER_POISONED)
>  		return VM_FAULT_HWPOISON;
>  
> +	/* Hitting a guard page is always a fatal condition. */
> +	if (marker & PTE_MARKER_GUARD)
> +		return VM_FAULT_SIGSEGV;
> +
>  	if (pte_marker_entry_uffd_wp(entry))
>  		return pte_marker_handle_uffd_wp(vmf);
>
Lorenzo Stoakes Oct. 21, 2024, 2:33 p.m. UTC | #3
On Mon, Oct 21, 2024 at 04:13:34PM +0200, Vlastimil Babka wrote:
> On 10/20/24 18:20, Lorenzo Stoakes wrote:
> > Add a new PTE marker that results in any access causing the accessing
> > process to segfault.
> >
> > This is preferable to PTE_MARKER_POISONED, which results in the same
> > handling as hardware poisoned memory, and is thus undesirable for cases
> > where we simply wish to 'soft' poison a range.
> >
> > This is in preparation for implementing the ability to specify guard pages
> > at the page table level, i.e. ranges that, when accessed, should cause
> > process termination.
> >
> > Additionally, rename zap_drop_file_uffd_wp() to zap_drop_markers() - the
> > function checks the ZAP_FLAG_DROP_MARKER flag so naming it for this single
> > purpose was simply incorrect.
> >
> > We then reuse the same logic to determine whether a zap should clear a
> > guard entry - this should only be performed on teardown and never on
> > MADV_DONTNEED or the like.
>
> Since I would have personally put MADV_FREE among "or the like" here, it's
> surprising to me that it in fact it's tearing down the guard entries now. Is
> that intentional? It should be at least mentioned very explicitly. But I'd
> really argue against it, as MADV_FREE is to me a weaker form of
> MADV_DONTNEED - the existing pages are not zapped immediately but
> prioritized for reclaim. If MADV_DONTNEED leaves guard PTEs in place, why
> shouldn't MADV_FREE too?

That is not, as I understand it, what MADV_FREE is, semantically. From the
man pages:

       MADV_FREE (since Linux 4.5)

              The application no longer requires the pages in the range
              specified by addr and len.  The kernel can thus free these
              pages, but the freeing could be delayed until memory pressure
              occurs.

       MADV_DONTNEED

              Do not expect access in the near future.  (For the time
              being, the application is finished with the given range, so
              the kernel can free resources associated with it.)

MADV_FREE is 'we are completely done with this range'. MADV_DONTNEED is 'we
don't expect to use it in the near future'.

>
> Seems to me rather currently an artifact of MADV_FREE implementation - if it
> encounters hwpoison entries it will tear them down because why not, we have
> detected a hw memory error and are lucky the program wants to discard the
> pages and not access them, so best use the opportunity and get rid of the
> PTE entries immediately (if MADV_DONTNEED doesn't do that too, it certainly
> could).

Right, but we explicitly do not tear them down in the case of MADV_DONTNEED
which matches the description in the manpages that the user _might_ come
back to the range, whereas MADV_FREE means they are truly done but just
don't want the overhead of actually unmapping at this point.

Seems to be this is moreso that MADV_FREE is a not-really-as-efficient
version of what Rik wants to do with his MADV_LAZYFREE thing.

>
> But to extend this to guard PTEs which are result of an explicit userspace
> action feels wrong, unless the semantics is the same for MADV_DONTEED. The
> semantics chosen for MADV_DONTNEED makes sense, so MADV_FREE should behave
> the same?

My understanding from the above is that MADV_FREE is a softer version of
munmap(), i.e. 'totally done with this range', whereas MADV_DONTNEED is a
'revert state to when I first mapped this stuff because I'm done with it
for now but might use it later'.

Obviously happy to be corrected if my understanding of this is off-the-mark
but this is what motivated me to do it this way!

>
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > ---
> >  include/linux/mm_inline.h |  2 +-
> >  include/linux/swapops.h   | 26 ++++++++++++++++++++++++--
> >  mm/hugetlb.c              |  3 +++
> >  mm/memory.c               | 18 +++++++++++++++---
> >  4 files changed, 43 insertions(+), 6 deletions(-)
> >
> > diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
> > index 355cf46a01a6..1b6a917fffa4 100644
> > --- a/include/linux/mm_inline.h
> > +++ b/include/linux/mm_inline.h
> > @@ -544,7 +544,7 @@ static inline pte_marker copy_pte_marker(
> >  {
> >  	pte_marker srcm = pte_marker_get(entry);
> >  	/* Always copy error entries. */
> > -	pte_marker dstm = srcm & PTE_MARKER_POISONED;
> > +	pte_marker dstm = srcm & (PTE_MARKER_POISONED | PTE_MARKER_GUARD);
> >
> >  	/* Only copy PTE markers if UFFD register matches. */
> >  	if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma))
> > diff --git a/include/linux/swapops.h b/include/linux/swapops.h
> > index cb468e418ea1..4d0606df0791 100644
> > --- a/include/linux/swapops.h
> > +++ b/include/linux/swapops.h
> > @@ -426,9 +426,15 @@ typedef unsigned long pte_marker;
> >   * "Poisoned" here is meant in the very general sense of "future accesses are
> >   * invalid", instead of referring very specifically to hardware memory errors.
> >   * This marker is meant to represent any of various different causes of this.
> > + *
> > + * Note that, when encountered by the faulting logic, PTEs with this marker will
> > + * result in VM_FAULT_HWPOISON and thus regardless trigger hardware memory error
> > + * logic.
> >   */
> >  #define  PTE_MARKER_POISONED			BIT(1)
> > -#define  PTE_MARKER_MASK			(BIT(2) - 1)
> > +/* Indicates that, on fault, this PTE will case a SIGSEGV signal to be sent. */
> > +#define  PTE_MARKER_GUARD			BIT(2)
> > +#define  PTE_MARKER_MASK			(BIT(3) - 1)
> >
> >  static inline swp_entry_t make_pte_marker_entry(pte_marker marker)
> >  {
> > @@ -461,9 +467,25 @@ static inline swp_entry_t make_poisoned_swp_entry(void)
> >  }
> >
> >  static inline int is_poisoned_swp_entry(swp_entry_t entry)
> > +{
> > +	/*
> > +	 * We treat guard pages as poisoned too as these have the same semantics
> > +	 * as poisoned ranges, only with different fault handling.
> > +	 */
> > +	return is_pte_marker_entry(entry) &&
> > +		(pte_marker_get(entry) &
> > +		 (PTE_MARKER_POISONED | PTE_MARKER_GUARD));
> > +}
> > +
> > +static inline swp_entry_t make_guard_swp_entry(void)
> > +{
> > +	return make_pte_marker_entry(PTE_MARKER_GUARD);
> > +}
> > +
> > +static inline int is_guard_swp_entry(swp_entry_t entry)
> >  {
> >  	return is_pte_marker_entry(entry) &&
> > -	    (pte_marker_get(entry) & PTE_MARKER_POISONED);
> > +		(pte_marker_get(entry) & PTE_MARKER_GUARD);
> >  }
> >
> >  /*
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 906294ac85dc..50e3f6ed73ac 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -6353,6 +6353,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> >  				ret = VM_FAULT_HWPOISON_LARGE |
> >  				      VM_FAULT_SET_HINDEX(hstate_index(h));
> >  				goto out_mutex;
> > +			} else if (marker & PTE_MARKER_GUARD) {
> > +				ret = VM_FAULT_SIGSEGV;
> > +				goto out_mutex;
> >  			}
> >  		}
> >
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 0f614523b9f4..551455cd453f 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -1455,7 +1455,7 @@ static inline bool should_zap_folio(struct zap_details *details,
> >  	return !folio_test_anon(folio);
> >  }
> >
> > -static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
> > +static inline bool zap_drop_markers(struct zap_details *details)
> >  {
> >  	if (!details)
> >  		return false;
> > @@ -1476,7 +1476,7 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
> >  	if (vma_is_anonymous(vma))
> >  		return;
> >
> > -	if (zap_drop_file_uffd_wp(details))
> > +	if (zap_drop_markers(details))
> >  		return;
> >
> >  	for (;;) {
> > @@ -1671,7 +1671,15 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
> >  			 * drop the marker if explicitly requested.
> >  			 */
> >  			if (!vma_is_anonymous(vma) &&
> > -			    !zap_drop_file_uffd_wp(details))
> > +			    !zap_drop_markers(details))
> > +				continue;
> > +		} else if (is_guard_swp_entry(entry)) {
> > +			/*
> > +			 * Ordinary zapping should not remove guard PTE
> > +			 * markers. Only do so if we should remove PTE markers
> > +			 * in general.
> > +			 */
> > +			if (!zap_drop_markers(details))
> >  				continue;
> >  		} else if (is_hwpoison_entry(entry) ||
> >  			   is_poisoned_swp_entry(entry)) {
> > @@ -4003,6 +4011,10 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
> >  	if (marker & PTE_MARKER_POISONED)
> >  		return VM_FAULT_HWPOISON;
> >
> > +	/* Hitting a guard page is always a fatal condition. */
> > +	if (marker & PTE_MARKER_GUARD)
> > +		return VM_FAULT_SIGSEGV;
> > +
> >  	if (pte_marker_entry_uffd_wp(entry))
> >  		return pte_marker_handle_uffd_wp(vmf);
> >
>
Vlastimil Babka Oct. 21, 2024, 2:54 p.m. UTC | #4
On 10/21/24 16:33, Lorenzo Stoakes wrote:
> On Mon, Oct 21, 2024 at 04:13:34PM +0200, Vlastimil Babka wrote:
>> On 10/20/24 18:20, Lorenzo Stoakes wrote:
>> > Add a new PTE marker that results in any access causing the accessing
>> > process to segfault.
>> >
>> > This is preferable to PTE_MARKER_POISONED, which results in the same
>> > handling as hardware poisoned memory, and is thus undesirable for cases
>> > where we simply wish to 'soft' poison a range.
>> >
>> > This is in preparation for implementing the ability to specify guard pages
>> > at the page table level, i.e. ranges that, when accessed, should cause
>> > process termination.
>> >
>> > Additionally, rename zap_drop_file_uffd_wp() to zap_drop_markers() - the
>> > function checks the ZAP_FLAG_DROP_MARKER flag so naming it for this single
>> > purpose was simply incorrect.
>> >
>> > We then reuse the same logic to determine whether a zap should clear a
>> > guard entry - this should only be performed on teardown and never on
>> > MADV_DONTNEED or the like.
>>
>> Since I would have personally put MADV_FREE among "or the like" here, it's
>> surprising to me that it in fact it's tearing down the guard entries now. Is
>> that intentional? It should be at least mentioned very explicitly. But I'd
>> really argue against it, as MADV_FREE is to me a weaker form of
>> MADV_DONTNEED - the existing pages are not zapped immediately but
>> prioritized for reclaim. If MADV_DONTNEED leaves guard PTEs in place, why
>> shouldn't MADV_FREE too?
> 
> That is not, as I understand it, what MADV_FREE is, semantically. From the
> man pages:
> 
>        MADV_FREE (since Linux 4.5)
> 
>               The application no longer requires the pages in the range
>               specified by addr and len.  The kernel can thus free these
>               pages, but the freeing could be delayed until memory pressure
>               occurs.
> 
>        MADV_DONTNEED
> 
>               Do not expect access in the near future.  (For the time
>               being, the application is finished with the given range, so
>               the kernel can free resources associated with it.)
> 
> MADV_FREE is 'we are completely done with this range'. MADV_DONTNEED is 'we
> don't expect to use it in the near future'.

I think the description gives a wrong impression. What I think matters it
what happens (limited to anon private case as MADV_FREE doesn't support any
other)

MADV_DONTNEED - pages discarded immediately, further access gives new
zero-filled pages

MADV_FREE - pages prioritized for discarding, if that happens before next
write, it gets zero-filled page on next access, but a write done soon enough
 can cancel the upcoming discard.

In that sense, MADV_FREE is a weaker form of DONTNEED, no?

>>
>> Seems to me rather currently an artifact of MADV_FREE implementation - if it
>> encounters hwpoison entries it will tear them down because why not, we have
>> detected a hw memory error and are lucky the program wants to discard the
>> pages and not access them, so best use the opportunity and get rid of the
>> PTE entries immediately (if MADV_DONTNEED doesn't do that too, it certainly
>> could).
> 
> Right, but we explicitly do not tear them down in the case of MADV_DONTNEED
> which matches the description in the manpages that the user _might_ come
> back to the range, whereas MADV_FREE means they are truly done but just
> don't want the overhead of actually unmapping at this point.

But it's also defined what happens if user comes back to the range after a
MADV_FREE. I think the overhead saved happens in the case of actually coming
back soon enough to prevent the discard. With MADV_DONTNEED its immediate
and unconditional.

> Seems to be this is moreso that MADV_FREE is a not-really-as-efficient
> version of what Rik wants to do with his MADV_LAZYFREE thing.

I think that further optimizes MADV_FREE, which is already more optimized
than MADV_DONTNEED.

>>
>> But to extend this to guard PTEs which are result of an explicit userspace
>> action feels wrong, unless the semantics is the same for MADV_DONTEED. The
>> semantics chosen for MADV_DONTNEED makes sense, so MADV_FREE should behave
>> the same?
> 
> My understanding from the above is that MADV_FREE is a softer version of
> munmap(), i.e. 'totally done with this range', whereas MADV_DONTNEED is a
> 'revert state to when I first mapped this stuff because I'm done with it
> for now but might use it later'.

From the implementation I get the opposite understanding. Neither tears down
the vma like a proper unmap(). MADV_DONTNEED zaps page tables immediately,
MADV_FREE effectively too but with a delay depending on memory pressure.
Lorenzo Stoakes Oct. 21, 2024, 3:33 p.m. UTC | #5
On Mon, Oct 21, 2024 at 04:54:06PM +0200, Vlastimil Babka wrote:
> On 10/21/24 16:33, Lorenzo Stoakes wrote:
> > On Mon, Oct 21, 2024 at 04:13:34PM +0200, Vlastimil Babka wrote:
> >> On 10/20/24 18:20, Lorenzo Stoakes wrote:
> >> > Add a new PTE marker that results in any access causing the accessing
> >> > process to segfault.
> >> >
> >> > This is preferable to PTE_MARKER_POISONED, which results in the same
> >> > handling as hardware poisoned memory, and is thus undesirable for cases
> >> > where we simply wish to 'soft' poison a range.
> >> >
> >> > This is in preparation for implementing the ability to specify guard pages
> >> > at the page table level, i.e. ranges that, when accessed, should cause
> >> > process termination.
> >> >
> >> > Additionally, rename zap_drop_file_uffd_wp() to zap_drop_markers() - the
> >> > function checks the ZAP_FLAG_DROP_MARKER flag so naming it for this single
> >> > purpose was simply incorrect.
> >> >
> >> > We then reuse the same logic to determine whether a zap should clear a
> >> > guard entry - this should only be performed on teardown and never on
> >> > MADV_DONTNEED or the like.
> >>
> >> Since I would have personally put MADV_FREE among "or the like" here, it's
> >> surprising to me that it in fact it's tearing down the guard entries now. Is
> >> that intentional? It should be at least mentioned very explicitly. But I'd
> >> really argue against it, as MADV_FREE is to me a weaker form of
> >> MADV_DONTNEED - the existing pages are not zapped immediately but
> >> prioritized for reclaim. If MADV_DONTNEED leaves guard PTEs in place, why
> >> shouldn't MADV_FREE too?
> >
> > That is not, as I understand it, what MADV_FREE is, semantically. From the
> > man pages:
> >
> >        MADV_FREE (since Linux 4.5)
> >
> >               The application no longer requires the pages in the range
> >               specified by addr and len.  The kernel can thus free these
> >               pages, but the freeing could be delayed until memory pressure
> >               occurs.
> >
> >        MADV_DONTNEED
> >
> >               Do not expect access in the near future.  (For the time
> >               being, the application is finished with the given range, so
> >               the kernel can free resources associated with it.)
> >
> > MADV_FREE is 'we are completely done with this range'. MADV_DONTNEED is 'we
> > don't expect to use it in the near future'.
>
> I think the description gives a wrong impression. What I think matters it
> what happens (limited to anon private case as MADV_FREE doesn't support any
> other)
>
> MADV_DONTNEED - pages discarded immediately, further access gives new
> zero-filled pages
>
> MADV_FREE - pages prioritized for discarding, if that happens before next
> write, it gets zero-filled page on next access, but a write done soon enough
>  can cancel the upcoming discard.
>
> In that sense, MADV_FREE is a weaker form of DONTNEED, no?
>
> >>
> >> Seems to me rather currently an artifact of MADV_FREE implementation - if it
> >> encounters hwpoison entries it will tear them down because why not, we have
> >> detected a hw memory error and are lucky the program wants to discard the
> >> pages and not access them, so best use the opportunity and get rid of the
> >> PTE entries immediately (if MADV_DONTNEED doesn't do that too, it certainly
> >> could).
> >
> > Right, but we explicitly do not tear them down in the case of MADV_DONTNEED
> > which matches the description in the manpages that the user _might_ come
> > back to the range, whereas MADV_FREE means they are truly done but just
> > don't want the overhead of actually unmapping at this point.
>
> But it's also defined what happens if user comes back to the range after a
> MADV_FREE. I think the overhead saved happens in the case of actually coming
> back soon enough to prevent the discard. With MADV_DONTNEED its immediate
> and unconditional.
>
> > Seems to be this is moreso that MADV_FREE is a not-really-as-efficient
> > version of what Rik wants to do with his MADV_LAZYFREE thing.
>
> I think that further optimizes MADV_FREE, which is already more optimized
> than MADV_DONTNEED.
>
> >>
> >> But to extend this to guard PTEs which are result of an explicit userspace
> >> action feels wrong, unless the semantics is the same for MADV_DONTEED. The
> >> semantics chosen for MADV_DONTNEED makes sense, so MADV_FREE should behave
> >> the same?
> >
> > My understanding from the above is that MADV_FREE is a softer version of
> > munmap(), i.e. 'totally done with this range', whereas MADV_DONTNEED is a
> > 'revert state to when I first mapped this stuff because I'm done with it
> > for now but might use it later'.
>
> From the implementation I get the opposite understanding. Neither tears down
> the vma like a proper unmap(). MADV_DONTNEED zaps page tables immediately,
> MADV_FREE effectively too but with a delay depending on memory pressure.
>

OK so based on IRC chat I think the conclusion here is TL;DR yes we have to
change this, you're right :)

To summarise for on-list:

* MADV_FREE, while ostensibly being a 'lazy free' mechanism, has the
  ability to be 'cancelled' if you write to the memory. Also, after the
  freeing is complete, you can write to the memory to reuse it, the mapping
  is still there.

* For hardware poison markers it makes sense to drop them as you're
  effectively saying 'I am done with this range that is now unbacked and
  expect to get an empty page should I use it now'. UFFD WP I am not sure
  about but presumably also fine.

* However, guard pages are different - if you 'cancel' and you are left
  with a block of memory allocated to you by a pthread or userland
  allocator implementation, you don't want to then no longer be protected
  from overrunning into other thread memory.

So, while I implemented this based on a. the semantics as apparently
expressed in the man page and b. existing PTE marker behaviour, it seems
that it would actually be problematic to do so.

So yeah, I'll change that in v3!
Lorenzo Stoakes Oct. 21, 2024, 3:41 p.m. UTC | #6
On Mon, Oct 21, 2024 at 04:33:19PM +0100, Lorenzo Stoakes wrote:
[snip]


> * For hardware poison markers it makes sense to drop them as you're
>   effectively saying 'I am done with this range that is now unbacked and
>   expect to get an empty page should I use it now'. UFFD WP I am not sure
>   about but presumably also fine.

[snip]

CORRECTION: (sorry multitasking atm now I have a non-melted CPU) UFFD WP is
            safe from MADV_FREE as it checks is_poisoned_swp_entry() which
            this patch updates to include guard pages, a change I will undo
            in v3. So disregard comment on UFFD WP here.
David Hildenbrand Oct. 21, 2024, 4 p.m. UTC | #7
On 21.10.24 17:33, Lorenzo Stoakes wrote:
> On Mon, Oct 21, 2024 at 04:54:06PM +0200, Vlastimil Babka wrote:
>> On 10/21/24 16:33, Lorenzo Stoakes wrote:
>>> On Mon, Oct 21, 2024 at 04:13:34PM +0200, Vlastimil Babka wrote:
>>>> On 10/20/24 18:20, Lorenzo Stoakes wrote:
>>>>> Add a new PTE marker that results in any access causing the accessing
>>>>> process to segfault.
>>>>>
>>>>> This is preferable to PTE_MARKER_POISONED, which results in the same
>>>>> handling as hardware poisoned memory, and is thus undesirable for cases
>>>>> where we simply wish to 'soft' poison a range.
>>>>>
>>>>> This is in preparation for implementing the ability to specify guard pages
>>>>> at the page table level, i.e. ranges that, when accessed, should cause
>>>>> process termination.
>>>>>
>>>>> Additionally, rename zap_drop_file_uffd_wp() to zap_drop_markers() - the
>>>>> function checks the ZAP_FLAG_DROP_MARKER flag so naming it for this single
>>>>> purpose was simply incorrect.
>>>>>
>>>>> We then reuse the same logic to determine whether a zap should clear a
>>>>> guard entry - this should only be performed on teardown and never on
>>>>> MADV_DONTNEED or the like.
>>>>
>>>> Since I would have personally put MADV_FREE among "or the like" here, it's
>>>> surprising to me that it in fact it's tearing down the guard entries now. Is
>>>> that intentional? It should be at least mentioned very explicitly. But I'd
>>>> really argue against it, as MADV_FREE is to me a weaker form of
>>>> MADV_DONTNEED - the existing pages are not zapped immediately but
>>>> prioritized for reclaim. If MADV_DONTNEED leaves guard PTEs in place, why
>>>> shouldn't MADV_FREE too?
>>>
>>> That is not, as I understand it, what MADV_FREE is, semantically. From the
>>> man pages:
>>>
>>>         MADV_FREE (since Linux 4.5)
>>>
>>>                The application no longer requires the pages in the range
>>>                specified by addr and len.  The kernel can thus free these
>>>                pages, but the freeing could be delayed until memory pressure
>>>                occurs.
>>>
>>>         MADV_DONTNEED
>>>
>>>                Do not expect access in the near future.  (For the time
>>>                being, the application is finished with the given range, so
>>>                the kernel can free resources associated with it.)
>>>
>>> MADV_FREE is 'we are completely done with this range'. MADV_DONTNEED is 'we
>>> don't expect to use it in the near future'.
>>
>> I think the description gives a wrong impression. What I think matters it
>> what happens (limited to anon private case as MADV_FREE doesn't support any
>> other)
>>
>> MADV_DONTNEED - pages discarded immediately, further access gives new
>> zero-filled pages
>>
>> MADV_FREE - pages prioritized for discarding, if that happens before next
>> write, it gets zero-filled page on next access, but a write done soon enough
>>   can cancel the upcoming discard.
>>
>> In that sense, MADV_FREE is a weaker form of DONTNEED, no?
>>
>>>>
>>>> Seems to me rather currently an artifact of MADV_FREE implementation - if it
>>>> encounters hwpoison entries it will tear them down because why not, we have
>>>> detected a hw memory error and are lucky the program wants to discard the
>>>> pages and not access them, so best use the opportunity and get rid of the
>>>> PTE entries immediately (if MADV_DONTNEED doesn't do that too, it certainly
>>>> could).
>>>
>>> Right, but we explicitly do not tear them down in the case of MADV_DONTNEED
>>> which matches the description in the manpages that the user _might_ come
>>> back to the range, whereas MADV_FREE means they are truly done but just
>>> don't want the overhead of actually unmapping at this point.
>>
>> But it's also defined what happens if user comes back to the range after a
>> MADV_FREE. I think the overhead saved happens in the case of actually coming
>> back soon enough to prevent the discard. With MADV_DONTNEED its immediate
>> and unconditional.
>>
>>> Seems to be this is moreso that MADV_FREE is a not-really-as-efficient
>>> version of what Rik wants to do with his MADV_LAZYFREE thing.
>>
>> I think that further optimizes MADV_FREE, which is already more optimized
>> than MADV_DONTNEED.
>>
>>>>
>>>> But to extend this to guard PTEs which are result of an explicit userspace
>>>> action feels wrong, unless the semantics is the same for MADV_DONTEED. The
>>>> semantics chosen for MADV_DONTNEED makes sense, so MADV_FREE should behave
>>>> the same?
>>>
>>> My understanding from the above is that MADV_FREE is a softer version of
>>> munmap(), i.e. 'totally done with this range', whereas MADV_DONTNEED is a
>>> 'revert state to when I first mapped this stuff because I'm done with it
>>> for now but might use it later'.
>>
>>  From the implementation I get the opposite understanding. Neither tears down
>> the vma like a proper unmap(). MADV_DONTNEED zaps page tables immediately,
>> MADV_FREE effectively too but with a delay depending on memory pressure.
>>
> 
> OK so based on IRC chat I think the conclusion here is TL;DR yes we have to
> change this, you're right :)
> 
> To summarise for on-list:
> 
> * MADV_FREE, while ostensibly being a 'lazy free' mechanism, has the
>    ability to be 'cancelled' if you write to the memory. Also, after the
>    freeing is complete, you can write to the memory to reuse it, the mapping
>    is still there.
> 
> * For hardware poison markers it makes sense to drop them as you're
>    effectively saying 'I am done with this range that is now unbacked and
>    expect to get an empty page should I use it now'. UFFD WP I am not sure
>    about but presumably also fine.
> 
> * However, guard pages are different - if you 'cancel' and you are left
>    with a block of memory allocated to you by a pthread or userland
>    allocator implementation, you don't want to then no longer be protected
>    from overrunning into other thread memory.

Agreed. What happens on MADV_DONTNEED/MADV_FREE on guard pages? Ignored 
or error? It sounds like a usage "error" to me (in contrast to munmap()).
Lorenzo Stoakes Oct. 21, 2024, 4:23 p.m. UTC | #8
On Mon, Oct 21, 2024 at 06:00:20PM +0200, David Hildenbrand wrote:
[snip]
> >
> > To summarise for on-list:
> >
> > * MADV_FREE, while ostensibly being a 'lazy free' mechanism, has the
> >    ability to be 'cancelled' if you write to the memory. Also, after the
> >    freeing is complete, you can write to the memory to reuse it, the mapping
> >    is still there.
> >
> > * For hardware poison markers it makes sense to drop them as you're
> >    effectively saying 'I am done with this range that is now unbacked and
> >    expect to get an empty page should I use it now'. UFFD WP I am not sure
> >    about but presumably also fine.
> >
> > * However, guard pages are different - if you 'cancel' and you are left
> >    with a block of memory allocated to you by a pthread or userland
> >    allocator implementation, you don't want to then no longer be protected
> >    from overrunning into other thread memory.
>
> Agreed. What happens on MADV_DONTNEED/MADV_FREE on guard pages? Ignored or
> error? It sounds like a usage "error" to me (in contrast to munmap()).

It's ignored, no errror. On MADV_DONTNEED we already left the guard pages in
place, from v3 we will do the same for MADV_FREE.

I'm not sure I'd say it's an error per se, as somebody might have a use case
where they want to zap over a range but keep guard pages, perhaps an allocator
or something?

Also the existing logic is that existing markers (HW poison, uffd-simulated HW
poison, uffd wp marker) are retained and no error raised on MADV_DONTNEED, and
no error on MADV_FREE either, so it'd be consistent with existing behaviour.

Also semantically you are achieving what the calls expect you are freeing the
ranges since the guard page regions are unbacked so are already freed... so yeah
I don't think an error really makes sense here.

We might also be limiting use cases by assuming they might _only_ be used for
allocators and such.

>
> --
> Cheers,
>
> David / dhildenb
>
David Hildenbrand Oct. 21, 2024, 4:44 p.m. UTC | #9
On 21.10.24 18:23, Lorenzo Stoakes wrote:
> On Mon, Oct 21, 2024 at 06:00:20PM +0200, David Hildenbrand wrote:
> [snip]
>>>
>>> To summarise for on-list:
>>>
>>> * MADV_FREE, while ostensibly being a 'lazy free' mechanism, has the
>>>     ability to be 'cancelled' if you write to the memory. Also, after the
>>>     freeing is complete, you can write to the memory to reuse it, the mapping
>>>     is still there.
>>>
>>> * For hardware poison markers it makes sense to drop them as you're
>>>     effectively saying 'I am done with this range that is now unbacked and
>>>     expect to get an empty page should I use it now'. UFFD WP I am not sure
>>>     about but presumably also fine.
>>>
>>> * However, guard pages are different - if you 'cancel' and you are left
>>>     with a block of memory allocated to you by a pthread or userland
>>>     allocator implementation, you don't want to then no longer be protected
>>>     from overrunning into other thread memory.
>>
>> Agreed. What happens on MADV_DONTNEED/MADV_FREE on guard pages? Ignored or
>> error? It sounds like a usage "error" to me (in contrast to munmap()).
> 
> It's ignored, no errror. On MADV_DONTNEED we already left the guard pages in
> place, from v3 we will do the same for MADV_FREE.
> 
> I'm not sure I'd say it's an error per se, as somebody might have a use case
> where they want to zap over a range but keep guard pages, perhaps an allocator
> or something?

Hm, not sure I see use for that.

Staring at madvise_walk_vmas(), we return ENOMEM on VMA holes, but would 
process PROT_NONE. So current behavior is at least consistent with 
PROT_NONE handling (where something could be mapped, though).

No strong opinion.

> 
> Also the existing logic is that existing markers (HW poison, uffd-simulated HW
> poison, uffd wp marker) are retained and no error raised on MADV_DONTNEED, and
> no error on MADV_FREE either, so it'd be consistent with existing behaviour.


HW poison / uffd-simulated HW poison are expected to be zapped: it's 
just like a mapped page with HWPOISON. So that is correct.
	
UFFD-WP behavior is ... weird. Would not expect MADV_DONTNEED to zap 
uffd-wp entries.

> 
> Also semantically you are achieving what the calls expect you are freeing the
> ranges since the guard page regions are unbacked so are already freed... so yeah
> I don't think an error really makes sense here.

I you compare it to a VMA hole, it make sense to fail. If we treat it 
like PROT_NONE, it make sense to skip them.

> 
> We might also be limiting use cases by assuming they might _only_ be used for
> allocators and such.

I don't buy that as an argument, sorry :)

"Let's map the kernel writable into all user space because otherwise we 
might be limiting use cases"


:P
Lorenzo Stoakes Oct. 21, 2024, 4:51 p.m. UTC | #10
On Mon, Oct 21, 2024 at 06:44:04PM +0200, David Hildenbrand wrote:
> On 21.10.24 18:23, Lorenzo Stoakes wrote:
> > On Mon, Oct 21, 2024 at 06:00:20PM +0200, David Hildenbrand wrote:
> > [snip]
> > > >
> > > > To summarise for on-list:
> > > >
> > > > * MADV_FREE, while ostensibly being a 'lazy free' mechanism, has the
> > > >     ability to be 'cancelled' if you write to the memory. Also, after the
> > > >     freeing is complete, you can write to the memory to reuse it, the mapping
> > > >     is still there.
> > > >
> > > > * For hardware poison markers it makes sense to drop them as you're
> > > >     effectively saying 'I am done with this range that is now unbacked and
> > > >     expect to get an empty page should I use it now'. UFFD WP I am not sure
> > > >     about but presumably also fine.
> > > >
> > > > * However, guard pages are different - if you 'cancel' and you are left
> > > >     with a block of memory allocated to you by a pthread or userland
> > > >     allocator implementation, you don't want to then no longer be protected
> > > >     from overrunning into other thread memory.
> > >
> > > Agreed. What happens on MADV_DONTNEED/MADV_FREE on guard pages? Ignored or
> > > error? It sounds like a usage "error" to me (in contrast to munmap()).
> >
> > It's ignored, no errror. On MADV_DONTNEED we already left the guard pages in
> > place, from v3 we will do the same for MADV_FREE.
> >
> > I'm not sure I'd say it's an error per se, as somebody might have a use case
> > where they want to zap over a range but keep guard pages, perhaps an allocator
> > or something?
>
> Hm, not sure I see use for that.
>
> Staring at madvise_walk_vmas(), we return ENOMEM on VMA holes, but would
> process PROT_NONE. So current behavior is at least consistent with PROT_NONE
> handling (where something could be mapped, though).

Err, the handling of holes is terrible, yes we return ENOMEM, but we _carry out
the whole procedure_ then return an error, an error _indistinguishable from an
error arising from any of the individual parts_.

Which is just, awful.

>
> No strong opinion.

Well you used up your strong opinion on the naming ;)

>
> >
> > Also the existing logic is that existing markers (HW poison, uffd-simulated HW
> > poison, uffd wp marker) are retained and no error raised on MADV_DONTNEED, and
> > no error on MADV_FREE either, so it'd be consistent with existing behaviour.
>
>
> HW poison / uffd-simulated HW poison are expected to be zapped: it's just
> like a mapped page with HWPOISON. So that is correct.

Well, poison is _not_ zapped on MADV_DONTNEED but _is_ on MADV_FREE :) anyway, I
mean the MADV flags are a confusing mess generally, as per Vlasta's comments
which to begin with I strongly disagreed with then, discussing further, realsed
that no this is just a bit insane and had driven _me_ insane.

>
> UFFD-WP behavior is ... weird. Would not expect MADV_DONTNEED to zap uffd-wp
> entries.
>
> >
> > Also semantically you are achieving what the calls expect you are freeing the
> > ranges since the guard page regions are unbacked so are already freed... so yeah
> > I don't think an error really makes sense here.
>
> I you compare it to a VMA hole, it make sense to fail. If we treat it like
> PROT_NONE, it make sense to skip them.
>
> >
> > We might also be limiting use cases by assuming they might _only_ be used for
> > allocators and such.
>
> I don't buy that as an argument, sorry :)
>
> "Let's map the kernel writable into all user space because otherwise we
> might be limiting use cases"

That's a great idea! Patch series incoming, 1st April 2025... :>)
>
>
> :P
>
> --
> Cheers,
>
> David / dhildenb
>

Overall I think just always leaving in place except on remedy err sorry sorry
unpoison and munmap and not returning an error if encountered elsewhere (other
than, of course, GUP) is the right way forward and most in line with user
expectation and practical usage.
David Hildenbrand Oct. 21, 2024, 5 p.m. UTC | #11
On 21.10.24 18:51, Lorenzo Stoakes wrote:
> On Mon, Oct 21, 2024 at 06:44:04PM +0200, David Hildenbrand wrote:
>> On 21.10.24 18:23, Lorenzo Stoakes wrote:
>>> On Mon, Oct 21, 2024 at 06:00:20PM +0200, David Hildenbrand wrote:
>>> [snip]
>>>>>
>>>>> To summarise for on-list:
>>>>>
>>>>> * MADV_FREE, while ostensibly being a 'lazy free' mechanism, has the
>>>>>      ability to be 'cancelled' if you write to the memory. Also, after the
>>>>>      freeing is complete, you can write to the memory to reuse it, the mapping
>>>>>      is still there.
>>>>>
>>>>> * For hardware poison markers it makes sense to drop them as you're
>>>>>      effectively saying 'I am done with this range that is now unbacked and
>>>>>      expect to get an empty page should I use it now'. UFFD WP I am not sure
>>>>>      about but presumably also fine.
>>>>>
>>>>> * However, guard pages are different - if you 'cancel' and you are left
>>>>>      with a block of memory allocated to you by a pthread or userland
>>>>>      allocator implementation, you don't want to then no longer be protected
>>>>>      from overrunning into other thread memory.
>>>>
>>>> Agreed. What happens on MADV_DONTNEED/MADV_FREE on guard pages? Ignored or
>>>> error? It sounds like a usage "error" to me (in contrast to munmap()).
>>>
>>> It's ignored, no errror. On MADV_DONTNEED we already left the guard pages in
>>> place, from v3 we will do the same for MADV_FREE.
>>>
>>> I'm not sure I'd say it's an error per se, as somebody might have a use case
>>> where they want to zap over a range but keep guard pages, perhaps an allocator
>>> or something?
>>
>> Hm, not sure I see use for that.
>>
>> Staring at madvise_walk_vmas(), we return ENOMEM on VMA holes, but would
>> process PROT_NONE. So current behavior is at least consistent with PROT_NONE
>> handling (where something could be mapped, though).
> 
> Err, the handling of holes is terrible, yes we return ENOMEM, but we _carry out
> the whole procedure_ then return an error, an error _indistinguishable from an
> error arising from any of the individual parts_.
> 
> Which is just, awful.

Yes, absolutely. I don't know why we decided to continue. And why we 
return ENOMEM ...

> 
>>
>> No strong opinion.
> 
> Well you used up your strong opinion on the naming ;)

He, and I am out of energy for this year ;)

In retrospective, "install or remove a guard PTE" is just much better 
than anything else ...

So I should never have been mislead to suggest poison/unpoison as a 
replacement for poison/remedy :P

> 
>>
>>>
>>> Also the existing logic is that existing markers (HW poison, uffd-simulated HW
>>> poison, uffd wp marker) are retained and no error raised on MADV_DONTNEED, and
>>> no error on MADV_FREE either, so it'd be consistent with existing behaviour.
>>
>>
>> HW poison / uffd-simulated HW poison are expected to be zapped: it's just
>> like a mapped page with HWPOISON. So that is correct.
> 
> Well, poison is _not_ zapped on MADV_DONTNEED but _is_ on MADV_FREE :) anyway, I

Huh?

madvise_dontneed_single_vma()->zap_page_range_single(details=NULL)->unmap_single_vma(details=NULL) 
... zap_pte_range()

} else if (is_hwpoison_entry(entry) ||
	   is_poisoned_swp_entry(entry)) {
	if (!should_zap_cows(details))
		continue;
	...

Should just zap them.

What am I missing?

> mean the MADV flags are a confusing mess generally, as per Vlasta's comments
> which to begin with I strongly disagreed with then, discussing further, realsed
> that no this is just a bit insane and had driven _me_ insane.
> 
>>
>> UFFD-WP behavior is ... weird. Would not expect MADV_DONTNEED to zap uffd-wp
>> entries.
>>
>>>
>>> Also semantically you are achieving what the calls expect you are freeing the
>>> ranges since the guard page regions are unbacked so are already freed... so yeah
>>> I don't think an error really makes sense here.
>>
>> I you compare it to a VMA hole, it make sense to fail. If we treat it like
>> PROT_NONE, it make sense to skip them.
>>
>>>
>>> We might also be limiting use cases by assuming they might _only_ be used for
>>> allocators and such.
>>
>> I don't buy that as an argument, sorry :)
>>
>> "Let's map the kernel writable into all user space because otherwise we
>> might be limiting use cases"
> 
> That's a great idea! Patch series incoming, 1st April 2025... :>)

:) Just flip the bit on x86 and we're done!

>>
>>
>> :P
>>
>> --
>> Cheers,
>>
>> David / dhildenb
>>
> 
> Overall I think just always leaving in place except on remedy err sorry sorry
> unpoison and munmap and not returning an error if encountered elsewhere (other
> than, of course, GUP) is the right way forward and most in line with user
> expectation and practical usage.


Fine with me, make sure to document that is behaves like a PROT_NONE 
VMA, not like a memory hole, except when something would trigger a fault 
(GUP etc).
Lorenzo Stoakes Oct. 21, 2024, 5:14 p.m. UTC | #12
On Mon, Oct 21, 2024 at 07:00:53PM +0200, David Hildenbrand wrote:
> On 21.10.24 18:51, Lorenzo Stoakes wrote:
> > On Mon, Oct 21, 2024 at 06:44:04PM +0200, David Hildenbrand wrote:
> > > On 21.10.24 18:23, Lorenzo Stoakes wrote:
> > > > On Mon, Oct 21, 2024 at 06:00:20PM +0200, David Hildenbrand wrote:
> > > > [snip]
> > > > > >
> > > > > > To summarise for on-list:
> > > > > >
> > > > > > * MADV_FREE, while ostensibly being a 'lazy free' mechanism, has the
> > > > > >      ability to be 'cancelled' if you write to the memory. Also, after the
> > > > > >      freeing is complete, you can write to the memory to reuse it, the mapping
> > > > > >      is still there.
> > > > > >
> > > > > > * For hardware poison markers it makes sense to drop them as you're
> > > > > >      effectively saying 'I am done with this range that is now unbacked and
> > > > > >      expect to get an empty page should I use it now'. UFFD WP I am not sure
> > > > > >      about but presumably also fine.
> > > > > >
> > > > > > * However, guard pages are different - if you 'cancel' and you are left
> > > > > >      with a block of memory allocated to you by a pthread or userland
> > > > > >      allocator implementation, you don't want to then no longer be protected
> > > > > >      from overrunning into other thread memory.
> > > > >
> > > > > Agreed. What happens on MADV_DONTNEED/MADV_FREE on guard pages? Ignored or
> > > > > error? It sounds like a usage "error" to me (in contrast to munmap()).
> > > >
> > > > It's ignored, no errror. On MADV_DONTNEED we already left the guard pages in
> > > > place, from v3 we will do the same for MADV_FREE.
> > > >
> > > > I'm not sure I'd say it's an error per se, as somebody might have a use case
> > > > where they want to zap over a range but keep guard pages, perhaps an allocator
> > > > or something?
> > >
> > > Hm, not sure I see use for that.
> > >
> > > Staring at madvise_walk_vmas(), we return ENOMEM on VMA holes, but would
> > > process PROT_NONE. So current behavior is at least consistent with PROT_NONE
> > > handling (where something could be mapped, though).
> >
> > Err, the handling of holes is terrible, yes we return ENOMEM, but we _carry out
> > the whole procedure_ then return an error, an error _indistinguishable from an
> > error arising from any of the individual parts_.
> >
> > Which is just, awful.
>
> Yes, absolutely. I don't know why we decided to continue. And why we return
> ENOMEM ...

Anyway UAPI so no turning back now :)

>
> >
> > >
> > > No strong opinion.
> >
> > Well you used up your strong opinion on the naming ;)
>
> He, and I am out of energy for this year ;)

Haha understandable...

>
> In retrospective, "install or remove a guard PTE" is just much better than
> anything else ...
>
> So I should never have been mislead to suggest poison/unpoison as a
> replacement for poison/remedy :P

You know the reason ;)

>
> >
> > >
> > > >
> > > > Also the existing logic is that existing markers (HW poison, uffd-simulated HW
> > > > poison, uffd wp marker) are retained and no error raised on MADV_DONTNEED, and
> > > > no error on MADV_FREE either, so it'd be consistent with existing behaviour.
> > >
> > >
> > > HW poison / uffd-simulated HW poison are expected to be zapped: it's just
> > > like a mapped page with HWPOISON. So that is correct.
> >
> > Well, poison is _not_ zapped on MADV_DONTNEED but _is_ on MADV_FREE :) anyway, I
>
> Huh?
>
> madvise_dontneed_single_vma()->zap_page_range_single(details=NULL)->unmap_single_vma(details=NULL)
> ... zap_pte_range()
>
> } else if (is_hwpoison_entry(entry) ||
> 	   is_poisoned_swp_entry(entry)) {
> 	if (!should_zap_cows(details))
> 		continue;
> 	...
>
> Should just zap them.
>
> What am I missing?

Yeah ok it's me who's missing something here, I hadn't noticed details == NULL
so should_zap_cows() is true, my mistake!

In any case we explicitly add code here to prevent guard pages from going. I
will correct everything where I wrongly say otherwise, doh!

>
> > mean the MADV flags are a confusing mess generally, as per Vlasta's comments
> > which to begin with I strongly disagreed with then, discussing further, realsed
> > that no this is just a bit insane and had driven _me_ insane.
> >
> > >
> > > UFFD-WP behavior is ... weird. Would not expect MADV_DONTNEED to zap uffd-wp
> > > entries.
> > >
> > > >
> > > > Also semantically you are achieving what the calls expect you are freeing the
> > > > ranges since the guard page regions are unbacked so are already freed... so yeah
> > > > I don't think an error really makes sense here.
> > >
> > > I you compare it to a VMA hole, it make sense to fail. If we treat it like
> > > PROT_NONE, it make sense to skip them.
> > >
> > > >
> > > > We might also be limiting use cases by assuming they might _only_ be used for
> > > > allocators and such.
> > >
> > > I don't buy that as an argument, sorry :)
> > >
> > > "Let's map the kernel writable into all user space because otherwise we
> > > might be limiting use cases"
> >
> > That's a great idea! Patch series incoming, 1st April 2025... :>)
>
> :) Just flip the bit on x86 and we're done!

;)

>
> > >
> > >
> > > :P
> > >
> > > --
> > > Cheers,
> > >
> > > David / dhildenb
> > >
> >
> > Overall I think just always leaving in place except on remedy err sorry sorry
> > unpoison and munmap and not returning an error if encountered elsewhere (other
> > than, of course, GUP) is the right way forward and most in line with user
> > expectation and practical usage.
>
>
> Fine with me, make sure to document that is behaves like a PROT_NONE VMA,
> not like a memory hole, except when something would trigger a fault (GUP
> etc).

Ack will make sure to document.

>
>
> --
> Cheers,
>
> David / dhildenb
>
David Hildenbrand Oct. 21, 2024, 5:21 p.m. UTC | #13
>>
>>>
>>>>
>>>>>
>>>>> Also the existing logic is that existing markers (HW poison, uffd-simulated HW
>>>>> poison, uffd wp marker) are retained and no error raised on MADV_DONTNEED, and
>>>>> no error on MADV_FREE either, so it'd be consistent with existing behaviour.
>>>>
>>>>
>>>> HW poison / uffd-simulated HW poison are expected to be zapped: it's just
>>>> like a mapped page with HWPOISON. So that is correct.
>>>
>>> Well, poison is _not_ zapped on MADV_DONTNEED but _is_ on MADV_FREE :) anyway, I
>>
>> Huh?
>>
>> madvise_dontneed_single_vma()->zap_page_range_single(details=NULL)->unmap_single_vma(details=NULL)
>> ... zap_pte_range()
>>
>> } else if (is_hwpoison_entry(entry) ||
>> 	   is_poisoned_swp_entry(entry)) {
>> 	if (!should_zap_cows(details))
>> 		continue;
>> 	...
>>
>> Should just zap them.
>>
>> What am I missing?
> 
> Yeah ok it's me who's missing something here, I hadn't noticed details == NULL
> so should_zap_cows() is true, my mistake!

It's confusing code ... I once had a patch to call it 
"should_zap_anon_folios" etc, but it would only slightly make it clearer 
what is actually happening.

> 
> In any case we explicitly add code here to prevent guard pages from going. I
> will correct everything where I wrongly say otherwise, doh!

Right, that's also one of the reasons I don't think "poison" terminology 
is the best fit, because there are som subtle differences. At least the 
marker does not mention "poison" but PTE_MARKER_GUARD, which I think is 
a very good naming :)
Vlastimil Babka Oct. 21, 2024, 5:26 p.m. UTC | #14
On 10/21/24 19:14, Lorenzo Stoakes wrote:
> On Mon, Oct 21, 2024 at 07:00:53PM +0200, David Hildenbrand wrote:
>>
>> >
>> > >
>> > > >
>> > > > Also the existing logic is that existing markers (HW poison, uffd-simulated HW
>> > > > poison, uffd wp marker) are retained and no error raised on MADV_DONTNEED, and
>> > > > no error on MADV_FREE either, so it'd be consistent with existing behaviour.
>> > >
>> > >
>> > > HW poison / uffd-simulated HW poison are expected to be zapped: it's just
>> > > like a mapped page with HWPOISON. So that is correct.
>> >
>> > Well, poison is _not_ zapped on MADV_DONTNEED but _is_ on MADV_FREE :) anyway, I
>>
>> Huh?
>>
>> madvise_dontneed_single_vma()->zap_page_range_single(details=NULL)->unmap_single_vma(details=NULL)
>> ... zap_pte_range()
>>
>> } else if (is_hwpoison_entry(entry) ||
>> 	   is_poisoned_swp_entry(entry)) {
>> 	if (!should_zap_cows(details))
>> 		continue;
>> 	...
>>
>> Should just zap them.
>>
>> What am I missing?
> 
> Yeah ok it's me who's missing something here, I hadn't noticed details == NULL
> so should_zap_cows() is true, my mistake!

Well, good to know it's consistent then. As I've explained I see why zapping
actual hwpoison makes sense for MADV_DONTNEED/MADV_FREE. That it's done also
for uffd poison is not completely clear, but maybe it was just easier to
implement. But it doesn't mean we have to do the same for GUARD PTEs. Either
behavior of zap/ignore/error could be valid, we just have to pick one and
then live with it as it can't change :) Zapping guards on DONTNEED/FREE
seems wrong to me, so it's between error (and potentially catching some
misuse) and ignore (potentially more performant in case somebody wants to
DOTNEED/FREE an area that contains scattered guards).

And the impossibility to meaningfully unwind on errors in the middle of the
operation (unless we pre-scan for guards) isn't exactly nice, so maybe just
ignore, i.e. the current approach?

> In any case we explicitly add code here to prevent guard pages from going. I
> will correct everything where I wrongly say otherwise, doh!
> 
>>
>> > mean the MADV flags are a confusing mess generally, as per Vlasta's comments
>> > which to begin with I strongly disagreed with then, discussing further, realsed
>> > that no this is just a bit insane and had driven _me_ insane.
>> >
>> > >
>> > > UFFD-WP behavior is ... weird. Would not expect MADV_DONTNEED to zap uffd-wp
>> > > entries.
>> > >
>> > > >
>> > > > Also semantically you are achieving what the calls expect you are freeing the
>> > > > ranges since the guard page regions are unbacked so are already freed... so yeah
>> > > > I don't think an error really makes sense here.
>> > >
>> > > I you compare it to a VMA hole, it make sense to fail. If we treat it like
>> > > PROT_NONE, it make sense to skip them.
>> > >
>> > > >
>> > > > We might also be limiting use cases by assuming they might _only_ be used for
>> > > > allocators and such.
>> > >
>> > > I don't buy that as an argument, sorry :)
>> > >
>> > > "Let's map the kernel writable into all user space because otherwise we
>> > > might be limiting use cases"
>> >
>> > That's a great idea! Patch series incoming, 1st April 2025... :>)
>>
>> :) Just flip the bit on x86 and we're done!
> 
> ;)
> 
>>
>> > >
>> > >
>> > > :P
>> > >
>> > > --
>> > > Cheers,
>> > >
>> > > David / dhildenb
>> > >
>> >
>> > Overall I think just always leaving in place except on remedy err sorry sorry
>> > unpoison and munmap and not returning an error if encountered elsewhere (other
>> > than, of course, GUP) is the right way forward and most in line with user
>> > expectation and practical usage.
>>
>>
>> Fine with me, make sure to document that is behaves like a PROT_NONE VMA,
>> not like a memory hole, except when something would trigger a fault (GUP
>> etc).
> 
> Ack will make sure to document.
> 
>>
>>
>> --
>> Cheers,
>>
>> David / dhildenb
>>
Lorenzo Stoakes Oct. 21, 2024, 7:57 p.m. UTC | #15
On Mon, Oct 21, 2024 at 03:45:31PM +0200, Vlastimil Babka wrote:
[snip]
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>
> Acked-by: Vlastimil Babka <vbabka@suse.cz>

Thanks!

>
> A nit below:
>
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 906294ac85dc..50e3f6ed73ac 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -6353,6 +6353,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> >  				ret = VM_FAULT_HWPOISON_LARGE |
> >  				      VM_FAULT_SET_HINDEX(hstate_index(h));
> >  				goto out_mutex;
> > +			} else if (marker & PTE_MARKER_GUARD) {
> > +				ret = VM_FAULT_SIGSEGV;
> > +				goto out_mutex;
>
> Given we don't support hugetlb, should we WARN_ON_ONCE() if such unexpected
> marker appears there?

Yes agreed, will add.

[snip]
Lorenzo Stoakes Oct. 21, 2024, 8:42 p.m. UTC | #16
On Mon, Oct 21, 2024 at 03:45:31PM +0200, Vlastimil Babka wrote:
> On 10/20/24 18:20, Lorenzo Stoakes wrote:
> > Add a new PTE marker that results in any access causing the accessing
> > process to segfault.
>
> Should we distinguish it from other segfaults? Is there a way? I can see
> memory protection keys use SEGV_PKUERR, but no idea if we have any free values.

Wasn't even aware that existed!!

I'm not sure a process can do anything particularly useful with this
information though?  Hitting a guard page would indicate a programming
error rather than something that might allow meaningful feedback to a user
like memory protection keys.

Do you think there's enough value int his to warrant digging in? And indeed
I imagine bits are in short supply for this and would need a strong
argument to get... so yeah I don't think too worthwhile most likely!

Thanks for the suggestion though!

>
> > This is preferable to PTE_MARKER_POISONED, which results in the same
> > handling as hardware poisoned memory, and is thus undesirable for cases
> > where we simply wish to 'soft' poison a range.
> >
> > This is in preparation for implementing the ability to specify guard pages
> > at the page table level, i.e. ranges that, when accessed, should cause
> > process termination.
> >
> > Additionally, rename zap_drop_file_uffd_wp() to zap_drop_markers() - the
> > function checks the ZAP_FLAG_DROP_MARKER flag so naming it for this single
> > purpose was simply incorrect.
> >
> > We then reuse the same logic to determine whether a zap should clear a
> > guard entry - this should only be performed on teardown and never on
> > MADV_DONTNEED or the like.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>
> Acked-by: Vlastimil Babka <vbabka@suse.cz>
>
> A nit below:
>
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 906294ac85dc..50e3f6ed73ac 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -6353,6 +6353,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> >  				ret = VM_FAULT_HWPOISON_LARGE |
> >  				      VM_FAULT_SET_HINDEX(hstate_index(h));
> >  				goto out_mutex;
> > +			} else if (marker & PTE_MARKER_GUARD) {
> > +				ret = VM_FAULT_SIGSEGV;
> > +				goto out_mutex;
>
> Given we don't support hugetlb, should we WARN_ON_ONCE() if such unexpected
> marker appears there?
>
> >  			}
> >  		}
> >
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 0f614523b9f4..551455cd453f 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -1455,7 +1455,7 @@ static inline bool should_zap_folio(struct zap_details *details,
> >  	return !folio_test_anon(folio);
> >  }
> >
> > -static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
> > +static inline bool zap_drop_markers(struct zap_details *details)
> >  {
> >  	if (!details)
> >  		return false;
> > @@ -1476,7 +1476,7 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
> >  	if (vma_is_anonymous(vma))
> >  		return;
> >
> > -	if (zap_drop_file_uffd_wp(details))
> > +	if (zap_drop_markers(details))
> >  		return;
> >
> >  	for (;;) {
> > @@ -1671,7 +1671,15 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
> >  			 * drop the marker if explicitly requested.
> >  			 */
> >  			if (!vma_is_anonymous(vma) &&
> > -			    !zap_drop_file_uffd_wp(details))
> > +			    !zap_drop_markers(details))
> > +				continue;
> > +		} else if (is_guard_swp_entry(entry)) {
> > +			/*
> > +			 * Ordinary zapping should not remove guard PTE
> > +			 * markers. Only do so if we should remove PTE markers
> > +			 * in general.
> > +			 */
> > +			if (!zap_drop_markers(details))
> >  				continue;
> >  		} else if (is_hwpoison_entry(entry) ||
> >  			   is_poisoned_swp_entry(entry)) {
> > @@ -4003,6 +4011,10 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
> >  	if (marker & PTE_MARKER_POISONED)
> >  		return VM_FAULT_HWPOISON;
> >
> > +	/* Hitting a guard page is always a fatal condition. */
> > +	if (marker & PTE_MARKER_GUARD)
> > +		return VM_FAULT_SIGSEGV;
> > +
> >  	if (pte_marker_entry_uffd_wp(entry))
> >  		return pte_marker_handle_uffd_wp(vmf);
> >
>
Lorenzo Stoakes Oct. 21, 2024, 9:13 p.m. UTC | #17
+cc Dave Hansen

On Mon, Oct 21, 2024 at 09:42:53PM +0100, Lorenzo Stoakes wrote:
> On Mon, Oct 21, 2024 at 03:45:31PM +0200, Vlastimil Babka wrote:
> > On 10/20/24 18:20, Lorenzo Stoakes wrote:
> > > Add a new PTE marker that results in any access causing the accessing
> > > process to segfault.
> >
> > Should we distinguish it from other segfaults? Is there a way? I can see
> > memory protection keys use SEGV_PKUERR, but no idea if we have any free values.
>
> Wasn't even aware that existed!!
>
> I'm not sure a process can do anything particularly useful with this
> information though?  Hitting a guard page would indicate a programming
> error rather than something that might allow meaningful feedback to a user
> like memory protection keys.
>
> Do you think there's enough value int his to warrant digging in? And indeed
> I imagine bits are in short supply for this and would need a strong
> argument to get... so yeah I don't think too worthwhile most likely!
>
> Thanks for the suggestion though!

To put it on list - Dave Hansen commented on IRC that it would be safer to
avoid this for now due to this being an ABI change, and reasonable to
perhaps add it later if required, so that seems a sensible way forward.

Thanks!

[snip]
Dave Hansen Oct. 21, 2024, 9:20 p.m. UTC | #18
On 10/21/24 14:13, Lorenzo Stoakes wrote:
>> Do you think there's enough value int his to warrant digging in? And indeed
>> I imagine bits are in short supply for this and would need a strong
>> argument to get... so yeah I don't think too worthwhile most likely!
>>
>> Thanks for the suggestion though!
> To put it on list - Dave Hansen commented on IRC that it would be safer to
> avoid this for now due to this being an ABI change, and reasonable to
> perhaps add it later if required, so that seems a sensible way forward.

We added SEGV_PKUERR because we really did expect signal handlers to
want to do something new and special, including consuming si_pkey.  Old
signal handlers would probably be pretty confused.

So, yeah, if it's not crystal clear that new signal handler code is
needed, than I'd punt on adding a new SEGV_* code for now.

BTW, SEGV_* codes are sequentially assigned.  It isn't a bitfield and
there are no space constraints.  We've only used a dozen or so of them
and ->si_code is an int.
David Hildenbrand Oct. 22, 2024, 7:13 p.m. UTC | #19
On 21.10.24 19:26, Vlastimil Babka wrote:
> On 10/21/24 19:14, Lorenzo Stoakes wrote:
>> On Mon, Oct 21, 2024 at 07:00:53PM +0200, David Hildenbrand wrote:
>>>
>>>>
>>>>>
>>>>>>
>>>>>> Also the existing logic is that existing markers (HW poison, uffd-simulated HW
>>>>>> poison, uffd wp marker) are retained and no error raised on MADV_DONTNEED, and
>>>>>> no error on MADV_FREE either, so it'd be consistent with existing behaviour.
>>>>>
>>>>>
>>>>> HW poison / uffd-simulated HW poison are expected to be zapped: it's just
>>>>> like a mapped page with HWPOISON. So that is correct.
>>>>
>>>> Well, poison is _not_ zapped on MADV_DONTNEED but _is_ on MADV_FREE :) anyway, I
>>>
>>> Huh?
>>>
>>> madvise_dontneed_single_vma()->zap_page_range_single(details=NULL)->unmap_single_vma(details=NULL)
>>> ... zap_pte_range()
>>>
>>> } else if (is_hwpoison_entry(entry) ||
>>> 	   is_poisoned_swp_entry(entry)) {
>>> 	if (!should_zap_cows(details))
>>> 		continue;
>>> 	...
>>>
>>> Should just zap them.
>>>
>>> What am I missing?
>>
>> Yeah ok it's me who's missing something here, I hadn't noticed details == NULL
>> so should_zap_cows() is true, my mistake!
> 
> Well, good to know it's consistent then. As I've explained I see why zapping
> actual hwpoison makes sense for MADV_DONTNEED/MADV_FREE. That it's done also
> for uffd poison is not completely clear, but maybe it was just easier to
> implement. 

Note that in VM context "uffd poison" really just is "this was hwpoison 
on the source VM, so we mimic that on the destination VM, because the 
data *is* lost" -- so you want the exact same behavior.

For example, when a VM reboots you might just want to ZAP these hwpoison 
entries, and get fresh pages on next access.

So to me it makes sense that they are treated equally.
diff mbox series

Patch

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 355cf46a01a6..1b6a917fffa4 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -544,7 +544,7 @@  static inline pte_marker copy_pte_marker(
 {
 	pte_marker srcm = pte_marker_get(entry);
 	/* Always copy error entries. */
-	pte_marker dstm = srcm & PTE_MARKER_POISONED;
+	pte_marker dstm = srcm & (PTE_MARKER_POISONED | PTE_MARKER_GUARD);
 
 	/* Only copy PTE markers if UFFD register matches. */
 	if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma))
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index cb468e418ea1..4d0606df0791 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -426,9 +426,15 @@  typedef unsigned long pte_marker;
  * "Poisoned" here is meant in the very general sense of "future accesses are
  * invalid", instead of referring very specifically to hardware memory errors.
  * This marker is meant to represent any of various different causes of this.
+ *
+ * Note that, when encountered by the faulting logic, PTEs with this marker will
+ * result in VM_FAULT_HWPOISON and thus regardless trigger hardware memory error
+ * logic.
  */
 #define  PTE_MARKER_POISONED			BIT(1)
-#define  PTE_MARKER_MASK			(BIT(2) - 1)
+/* Indicates that, on fault, this PTE will case a SIGSEGV signal to be sent. */
+#define  PTE_MARKER_GUARD			BIT(2)
+#define  PTE_MARKER_MASK			(BIT(3) - 1)
 
 static inline swp_entry_t make_pte_marker_entry(pte_marker marker)
 {
@@ -461,9 +467,25 @@  static inline swp_entry_t make_poisoned_swp_entry(void)
 }
 
 static inline int is_poisoned_swp_entry(swp_entry_t entry)
+{
+	/*
+	 * We treat guard pages as poisoned too as these have the same semantics
+	 * as poisoned ranges, only with different fault handling.
+	 */
+	return is_pte_marker_entry(entry) &&
+		(pte_marker_get(entry) &
+		 (PTE_MARKER_POISONED | PTE_MARKER_GUARD));
+}
+
+static inline swp_entry_t make_guard_swp_entry(void)
+{
+	return make_pte_marker_entry(PTE_MARKER_GUARD);
+}
+
+static inline int is_guard_swp_entry(swp_entry_t entry)
 {
 	return is_pte_marker_entry(entry) &&
-	    (pte_marker_get(entry) & PTE_MARKER_POISONED);
+		(pte_marker_get(entry) & PTE_MARKER_GUARD);
 }
 
 /*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 906294ac85dc..50e3f6ed73ac 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6353,6 +6353,9 @@  vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 				ret = VM_FAULT_HWPOISON_LARGE |
 				      VM_FAULT_SET_HINDEX(hstate_index(h));
 				goto out_mutex;
+			} else if (marker & PTE_MARKER_GUARD) {
+				ret = VM_FAULT_SIGSEGV;
+				goto out_mutex;
 			}
 		}
 
diff --git a/mm/memory.c b/mm/memory.c
index 0f614523b9f4..551455cd453f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1455,7 +1455,7 @@  static inline bool should_zap_folio(struct zap_details *details,
 	return !folio_test_anon(folio);
 }
 
-static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
+static inline bool zap_drop_markers(struct zap_details *details)
 {
 	if (!details)
 		return false;
@@ -1476,7 +1476,7 @@  zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
 	if (vma_is_anonymous(vma))
 		return;
 
-	if (zap_drop_file_uffd_wp(details))
+	if (zap_drop_markers(details))
 		return;
 
 	for (;;) {
@@ -1671,7 +1671,15 @@  static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			 * drop the marker if explicitly requested.
 			 */
 			if (!vma_is_anonymous(vma) &&
-			    !zap_drop_file_uffd_wp(details))
+			    !zap_drop_markers(details))
+				continue;
+		} else if (is_guard_swp_entry(entry)) {
+			/*
+			 * Ordinary zapping should not remove guard PTE
+			 * markers. Only do so if we should remove PTE markers
+			 * in general.
+			 */
+			if (!zap_drop_markers(details))
 				continue;
 		} else if (is_hwpoison_entry(entry) ||
 			   is_poisoned_swp_entry(entry)) {
@@ -4003,6 +4011,10 @@  static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
 	if (marker & PTE_MARKER_POISONED)
 		return VM_FAULT_HWPOISON;
 
+	/* Hitting a guard page is always a fatal condition. */
+	if (marker & PTE_MARKER_GUARD)
+		return VM_FAULT_SIGSEGV;
+
 	if (pte_marker_entry_uffd_wp(entry))
 		return pte_marker_handle_uffd_wp(vmf);