[v5,02/10] hugetlb/userfaultfd: Forbid huge pmd sharing when uffd enabled

Message ID 20210210212200.1097784-3-axelrasmussen@google.com (mailing list archive)
State New, archived
Series userfaultfd: add minor fault handling

Commit Message

Axel Rasmussen Feb. 10, 2021, 9:21 p.m. UTC
From: Peter Xu <peterx@redhat.com>

Huge pmd sharing can cause problems for userfaultfd.  Userfaultfd runs its
logic based on special bits in the page table entries, while huge pmd
sharing can make different address ranges share the same page table
entries.  That can cause issues in either of these cases:

  - When sharing huge pmd page tables for an uffd write-protected range, the
    newly mapped huge pmd range will also be unexpectedly write-protected, or,

  - When we try to write-protect a range within a huge pmd shared region,
    we'll first do huge_pmd_unshare() in hugetlb_change_protection(); however,
    that also means UFFDIO_WRITEPROTECT could be silently skipped for the
    shared region, which could lead to data loss.
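
For illustration, once uffd-wp is wired up for hugetlbfs, a userspace
sequence along these lines could hit the second case.  This is a
hypothetical sketch (error handling omitted); uffd, addr and len are
assumed to be set up already, with addr/len covering a huge pmd that is
shared with another process mapping the same hugetlbfs file:

	struct uffdio_register reg = {
		.range = { .start = (unsigned long)addr, .len = len },
		.mode  = UFFDIO_REGISTER_MODE_WP,
	};
	struct uffdio_writeprotect wp = {
		.range = { .start = (unsigned long)addr, .len = len },
		.mode  = UFFDIO_WRITEPROTECT_MODE_WP,
	};

	ioctl(uffd, UFFDIO_REGISTER, &reg);
	/*
	 * Without this patch, hugetlb_change_protection() can first
	 * huge_pmd_unshare() the shared pmd and then skip the entries it
	 * covered, silently dropping the write protection below.
	 */
	ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);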

While at it, a few other things are done as well:

  - Move want_pmd_share() from mm/hugetlb.c into linux/hugetlb.h, because
    that's definitely something that arch code would like to use too.

  - ARM64 currently checks directly against CONFIG_ARCH_WANT_HUGE_PMD_SHARE
    when trying to share a huge pmd.  Switch to the want_pmd_share() helper.

Also, move vma_shareable() from huge_pmd_share() into want_pmd_share().

Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
---
 arch/arm64/mm/hugetlbpage.c   |  3 +--
 include/linux/hugetlb.h       |  2 ++
 include/linux/userfaultfd_k.h |  9 +++++++++
 mm/hugetlb.c                  | 20 ++++++++++++++------
 4 files changed, 26 insertions(+), 8 deletions(-)

Comments

Mike Kravetz Feb. 12, 2021, 12:19 a.m. UTC | #1
On 2/10/21 1:21 PM, Axel Rasmussen wrote:
> From: Peter Xu <peterx@redhat.com>
> 
> Huge pmd sharing can cause problems for userfaultfd.  Userfaultfd runs its
> logic based on special bits in the page table entries, while huge pmd
> sharing can make different address ranges share the same page table
> entries.  That can cause issues in either of these cases:
> 
>   - When sharing huge pmd page tables for an uffd write-protected range, the
>     newly mapped huge pmd range will also be unexpectedly write-protected, or,
> 
>   - When we try to write-protect a range within a huge pmd shared region,
>     we'll first do huge_pmd_unshare() in hugetlb_change_protection(); however,
>     that also means UFFDIO_WRITEPROTECT could be silently skipped for the
>     shared region, which could lead to data loss.
> 
> While at it, a few other things are done as well:
> 
>   - Move want_pmd_share() from mm/hugetlb.c into linux/hugetlb.h, because
>     that's definitely something that arch code would like to use too.
> 
>   - ARM64 currently checks directly against CONFIG_ARCH_WANT_HUGE_PMD_SHARE
>     when trying to share a huge pmd.  Switch to the want_pmd_share() helper.
> 
> Also, move vma_shareable() from huge_pmd_share() into want_pmd_share().
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>
> Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
> ---
>  arch/arm64/mm/hugetlbpage.c   |  3 +--
>  include/linux/hugetlb.h       |  2 ++
>  include/linux/userfaultfd_k.h |  9 +++++++++
>  mm/hugetlb.c                  | 20 ++++++++++++++------
>  4 files changed, 26 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
> index 6e3bcffe2837..58987a98e179 100644
> --- a/arch/arm64/mm/hugetlbpage.c
> +++ b/arch/arm64/mm/hugetlbpage.c
> @@ -284,8 +284,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
>  		 */
>  		ptep = pte_alloc_map(mm, pmdp, addr);
>  	} else if (sz == PMD_SIZE) {
> -		if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
> -		    pud_none(READ_ONCE(*pudp)))
> +		if (want_pmd_share(vma, addr) && pud_none(READ_ONCE(*pudp)))
>  			ptep = huge_pmd_share(mm, vma, addr, pudp);
>  		else
>  			ptep = (pte_t *)pmd_alloc(mm, pudp, addr);
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index ca6e5ba56f73..d971e7efd17d 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -1030,4 +1030,6 @@ static inline __init void hugetlb_cma_check(void)
>  }
>  #endif
>  
> +bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr);
> +
>  #endif /* _LINUX_HUGETLB_H */
> diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> index a8e5f3ea9bb2..c63ccdae3eab 100644
> --- a/include/linux/userfaultfd_k.h
> +++ b/include/linux/userfaultfd_k.h
> @@ -52,6 +52,15 @@ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
>  	return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx;
>  }
>  
> +/*
> + * Never enable huge pmd sharing on uffd-wp registered vmas, because uffd-wp
> + * protect information is per pgtable entry.
> + */
> +static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma)
> +{
> +	return vma->vm_flags & VM_UFFD_WP;
> +}
> +
>  static inline bool userfaultfd_missing(struct vm_area_struct *vma)
>  {
>  	return vma->vm_flags & VM_UFFD_MISSING;
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 32d4d2e277ad..5710286e1984 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -5245,6 +5245,18 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
>  	return false;
>  }
>  
> +bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
> +{
> +#ifndef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
> +	return false;
> +#endif
> +#ifdef CONFIG_USERFAULTFD
> +	if (uffd_disable_huge_pmd_share(vma))
> +		return false;
> +#endif
> +	return vma_shareable(vma, addr);
> +}
> +

This code certainly does the right thing; however, I wonder if it should
be structured a little differently.

want_pmd_share() is currently just a check for CONFIG_ARCH_WANT_HUGE_PMD_SHARE.
How about leaving that mostly as-is and adding the new vma checks to
vma_shareable()?  vma_shareable() would then be something like:

	if (!(vma->vm_flags & VM_MAYSHARE))
		return false;
#ifdef CONFIG_USERFAULTFD
	if (uffd_disable_huge_pmd_share(vma))
		return false;
#endif
#ifdef /* XXX */
	/* add other checks for things like uffd wp and soft dirty here */
#endif /* XXX */

	if (range_in_vma(vma, base, end))
		return true;
	return false;
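
Filled in for concreteness, the whole routine might look something like
this (untested; base/end computed as in the current vma_shareable()):

static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
{
	unsigned long base = addr & PUD_MASK;
	unsigned long end = base + PUD_SIZE;

	if (!(vma->vm_flags & VM_MAYSHARE))
		return false;
#ifdef CONFIG_USERFAULTFD
	if (uffd_disable_huge_pmd_share(vma))
		return false;
#endif
	/* Other checks, e.g. for uffd wp and soft dirty, would go here. */

	return range_in_vma(vma, base, end);
}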

Of course, this would require that we leave the call to vma_shareable() at
the beginning of huge_pmd_share.  It also means that we are always making a
function call into huge_pmd_share to determine if sharing is possible.
That is no different than today.  If we do not want to make that extra
function call, then I would suggest putting all that code in want_pmd_share.
It just seems that all the vma checks for sharing should be in one place
if possible.
Peter Xu Feb. 12, 2021, 8:40 p.m. UTC | #2
On Thu, Feb 11, 2021 at 04:19:55PM -0800, Mike Kravetz wrote:
> want_pmd_share() is currently just a check for CONFIG_ARCH_WANT_HUGE_PMD_SHARE.
> How about leaving that mostly as-is and adding the new vma checks to
> vma_shareable()?  vma_shareable() would then be something like:
> 
> 	if (!(vma->vm_flags & VM_MAYSHARE))
> 		return false;
> #ifdef CONFIG_USERFAULTFD
> 	if (uffd_disable_huge_pmd_share(vma))
> 		return false;
> #endif
> #ifdef /* XXX */
> 	/* add other checks for things like uffd wp and soft dirty here */
> #endif /* XXX */
> 
> 	if (range_in_vma(vma, base, end))
> 		return true;
> 	return false;
> 
> Of course, this would require that we leave the call to vma_shareable() at
> the beginning of huge_pmd_share.  It also means that we are always making a
> function call into huge_pmd_share to determine if sharing is possible.
> That is no different than today.  If we do not want to make that extra
> function call, then I would suggest putting all that code in want_pmd_share.
> It just seems that all the vma checks for sharing should be in one place
> if possible.

I don't worry much about that, since we've already got huge_pte_alloc(),
which takes care of the huge pmd sharing case; I don't expect most hugetlb
developers to use want_pmd_share() directly at all, because huge_pte_alloc()
will be the one that gets called frequently.

But yeah, we can definitely put the check logic into huge_pmd_share() too.
Looking at the code above, a helper like want_pmd_share() (or some other
name) still looks worthwhile.  Then... instead of making this complicated,
how about I mostly keep this patch but move the want_pmd_share() call into
huge_pmd_share() instead?
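
Something like this untested sketch, keeping the existing sharing logic
as-is and only adding the early bail-out:

pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
		      unsigned long addr, pud_t *pud)
{
	/*
	 * Fall back to a private pmd allocation when sharing is not
	 * wanted, so callers no longer need to check by themselves.
	 */
	if (!want_pmd_share(vma, addr))
		return (pte_t *)pmd_alloc(mm, pud, addr);

	/* ... existing sharing logic unchanged ... */
}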

Btw, Axel, it seems there will still be some respins of the pmd sharing
patches.  Since it turns out they'll be shared by multiple tasks now, do
you mind if I pick those out and send them separately?  Then we can
consolidate this part and move on with the rest of the tasks we've got on
hand.

Thanks,
Axel Rasmussen Feb. 12, 2021, 8:47 p.m. UTC | #3
On Fri, Feb 12, 2021 at 12:40 PM Peter Xu <peterx@redhat.com> wrote:
>
> On Thu, Feb 11, 2021 at 04:19:55PM -0800, Mike Kravetz wrote:
> > want_pmd_share() is currently just a check for CONFIG_ARCH_WANT_HUGE_PMD_SHARE.
> > How about leaving that mostly as-is and adding the new vma checks to
> > vma_shareable()?  vma_shareable() would then be something like:
> >
> >       if (!(vma->vm_flags & VM_MAYSHARE))
> >               return false;
> > #ifdef CONFIG_USERFAULTFD
> >       if (uffd_disable_huge_pmd_share(vma))
> >               return false;
> > #endif
> > #ifdef /* XXX */
> >       /* add other checks for things like uffd wp and soft dirty here */
> > #endif /* XXX */
> >
> >       if (range_in_vma(vma, base, end))
> >               return true;
> >       return false;
> >
> > Of course, this would require that we leave the call to vma_shareable() at
> > the beginning of huge_pmd_share.  It also means that we are always making a
> > function call into huge_pmd_share to determine if sharing is possible.
> > That is no different than today.  If we do not want to make that extra
> > function call, then I would suggest putting all that code in want_pmd_share.
> > It just seems that all the vma checks for sharing should be in one place
> > if possible.
>
> I don't worry much about that, since we've already got huge_pte_alloc(),
> which takes care of the huge pmd sharing case; I don't expect most hugetlb
> developers to use want_pmd_share() directly at all, because huge_pte_alloc()
> will be the one that gets called frequently.
>
> But yeah, we can definitely put the check logic into huge_pmd_share() too.
> Looking at the code above, a helper like want_pmd_share() (or some other
> name) still looks worthwhile.  Then... instead of making this complicated,
> how about I mostly keep this patch but move the want_pmd_share() call into
> huge_pmd_share() instead?
>
> Btw, Axel, it seems there will still be some respins of the pmd sharing
> patches.  Since it turns out they'll be shared by multiple tasks now, do
> you mind if I pick those out and send them separately?  Then we can
> consolidate this part and move on with the rest of the tasks we've got on
> hand.

Sounds good to me. :) Thanks Peter + Mike for working on this!

>
> Thanks,
>
> --
> Peter Xu
>
Mike Kravetz Feb. 12, 2021, 9:27 p.m. UTC | #4
On 2/12/21 12:47 PM, Axel Rasmussen wrote:
> On Fri, Feb 12, 2021 at 12:40 PM Peter Xu <peterx@redhat.com> wrote:
>>
>> On Thu, Feb 11, 2021 at 04:19:55PM -0800, Mike Kravetz wrote:
>>> want_pmd_share() is currently just a check for CONFIG_ARCH_WANT_HUGE_PMD_SHARE.
>>> How about leaving that mostly as-is and adding the new vma checks to
>>> vma_shareable()?  vma_shareable() would then be something like:
>>>
>>>       if (!(vma->vm_flags & VM_MAYSHARE))
>>>               return false;
>>> #ifdef CONFIG_USERFAULTFD
>>>       if (uffd_disable_huge_pmd_share(vma))
>>>               return false;
>>> #endif
>>> #ifdef /* XXX */
>>>       /* add other checks for things like uffd wp and soft dirty here */
>>> #endif /* XXX */
>>>
>>>       if (range_in_vma(vma, base, end))
>>>               return true;
>>>       return false;
>>>
>>> Of course, this would require that we leave the call to vma_shareable() at
>>> the beginning of huge_pmd_share.  It also means that we are always making a
>>> function call into huge_pmd_share to determine if sharing is possible.
>>> That is no different than today.  If we do not want to make that extra
>>> function call, then I would suggest putting all that code in want_pmd_share.
>>> It just seems that all the vma checks for sharing should be in one place
>>> if possible.
>>
>> I don't worry much about that, since we've already got huge_pte_alloc(),
>> which takes care of the huge pmd sharing case; I don't expect most hugetlb
>> developers to use want_pmd_share() directly at all, because huge_pte_alloc()
>> will be the one that gets called frequently.
>>
>> But yeah, we can definitely put the check logic into huge_pmd_share() too.
>> Looking at the code above, a helper like want_pmd_share() (or some other
>> name) still looks worthwhile.  Then... instead of making this complicated,
>> how about I mostly keep this patch but move the want_pmd_share() call into
>> huge_pmd_share() instead?

When looking at this again, all I was suggesting was a single routine to
check for the possibility of pmd sharing.  That is what the version of
want_pmd_share in this patch does.

I have some patches for future optimizations that only take i_mmap_rwsem in
the fault path if sharing is possible.  That check happens before
huge_pte_alloc().  want_pmd_share() as defined in this patch would work for
that.
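
Roughly along these lines (hypothetical sketch, not the actual patches;
haddr is the huge page aligned fault address):

	if (want_pmd_share(vma, haddr)) {
		/* Sharing is possible: serialize against it. */
		i_mmap_lock_read(mapping);
		ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
		i_mmap_unlock_read(mapping);
	} else {
		ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
	}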

Sorry for the noise.

Patch

diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 6e3bcffe2837..58987a98e179 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -284,8 +284,7 @@  pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
 		 */
 		ptep = pte_alloc_map(mm, pmdp, addr);
 	} else if (sz == PMD_SIZE) {
-		if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
-		    pud_none(READ_ONCE(*pudp)))
+		if (want_pmd_share(vma, addr) && pud_none(READ_ONCE(*pudp)))
 			ptep = huge_pmd_share(mm, vma, addr, pudp);
 		else
 			ptep = (pte_t *)pmd_alloc(mm, pudp, addr);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index ca6e5ba56f73..d971e7efd17d 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -1030,4 +1030,6 @@  static inline __init void hugetlb_cma_check(void)
 }
 #endif
 
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr);
+
 #endif /* _LINUX_HUGETLB_H */
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index a8e5f3ea9bb2..c63ccdae3eab 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -52,6 +52,15 @@  static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
 	return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx;
 }
 
+/*
+ * Never enable huge pmd sharing on uffd-wp registered vmas, because uffd-wp
+ * protect information is per pgtable entry.
+ */
+static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma)
+{
+	return vma->vm_flags & VM_UFFD_WP;
+}
+
 static inline bool userfaultfd_missing(struct vm_area_struct *vma)
 {
 	return vma->vm_flags & VM_UFFD_MISSING;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 32d4d2e277ad..5710286e1984 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5245,6 +5245,18 @@  static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
 	return false;
 }
 
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
+{
+#ifndef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+	return false;
+#endif
+#ifdef CONFIG_USERFAULTFD
+	if (uffd_disable_huge_pmd_share(vma))
+		return false;
+#endif
+	return vma_shareable(vma, addr);
+}
+
 /*
  * Determine if start,end range within vma could be mapped by shared pmd.
  * If yes, adjust start and end to cover range associated with possible
@@ -5301,9 +5313,6 @@  pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t *pte;
 	spinlock_t *ptl;
 
-	if (!vma_shareable(vma, addr))
-		return (pte_t *)pmd_alloc(mm, pud, addr);
-
 	i_mmap_assert_locked(mapping);
 	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
 		if (svma == vma)
@@ -5367,7 +5376,7 @@  int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 	*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
 	return 1;
 }
-#define want_pmd_share()	(1)
+
 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 		      unsigned long addr, pud_t *pud)
@@ -5385,7 +5394,6 @@  void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
 				unsigned long *start, unsigned long *end)
 {
 }
-#define want_pmd_share()	(0)
 #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
 
 #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
@@ -5407,7 +5415,7 @@  pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
 			pte = (pte_t *)pud;
 		} else {
 			BUG_ON(sz != PMD_SIZE);
-			if (want_pmd_share() && pud_none(*pud))
+			if (want_pmd_share(vma, addr) && pud_none(*pud))
 				pte = huge_pmd_share(mm, vma, addr, pud);
 			else
 				pte = (pte_t *)pmd_alloc(mm, pud, addr);