diff mbox series

[v5,04/10] hugetlb/userfaultfd: Unshare all pmds for hugetlbfs when register wp

Message ID 20210210212200.1097784-5-axelrasmussen@google.com (mailing list archive)
State New
Headers show
Series userfaultfd: add minor fault handling | expand

Commit Message

Axel Rasmussen Feb. 10, 2021, 9:21 p.m. UTC
From: Peter Xu <peterx@redhat.com>

Huge pmd sharing for hugetlbfs is racy with userfaultfd-wp because
userfaultfd-wp is always based on pgtable entries, so they cannot be shared.

Walk the hugetlb range and unshare all such mappings if there is, right before
UFFDIO_REGISTER will succeed and return to userspace.

This will pair with want_pmd_share() in hugetlb code so that huge pmd sharing
is completely disabled for userfaultfd-wp registered range.

Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
---
 fs/userfaultfd.c             | 48 ++++++++++++++++++++++++++++++++++++
 include/linux/mmu_notifier.h |  1 +
 2 files changed, 49 insertions(+)

Comments

Mike Kravetz Feb. 12, 2021, 6:11 p.m. UTC | #1
On 2/10/21 1:21 PM, Axel Rasmussen wrote:
> From: Peter Xu <peterx@redhat.com>
> 
> Huge pmd sharing for hugetlbfs is racy with userfaultfd-wp because
> userfaultfd-wp is always based on pgtable entries, so they cannot be shared.
> 
> Walk the hugetlb range and unshare all such mappings if there is, right before
> UFFDIO_REGISTER will succeed and return to userspace.
> 
> This will pair with want_pmd_share() in hugetlb code so that huge pmd sharing
> is completely disabled for userfaultfd-wp registered range.
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>
> Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
> ---
>  fs/userfaultfd.c             | 48 ++++++++++++++++++++++++++++++++++++
>  include/linux/mmu_notifier.h |  1 +
>  2 files changed, 49 insertions(+)
> 
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 0be8cdd4425a..1f4a34b1a1e7 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -15,6 +15,7 @@
>  #include <linux/sched/signal.h>
>  #include <linux/sched/mm.h>
>  #include <linux/mm.h>
> +#include <linux/mmu_notifier.h>
>  #include <linux/poll.h>
>  #include <linux/slab.h>
>  #include <linux/seq_file.h>
> @@ -1191,6 +1192,50 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf,
>  	}
>  }
>  
> +/*
> + * This function will unconditionally remove all the shared pmd pgtable entries
> + * within the specific vma for a hugetlbfs memory range.
> + */
> +static void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
> +{
> +#ifdef CONFIG_HUGETLB_PAGE
> +	struct hstate *h = hstate_vma(vma);
> +	unsigned long sz = huge_page_size(h);
> +	struct mm_struct *mm = vma->vm_mm;
> +	struct mmu_notifier_range range;
> +	unsigned long address;
> +	spinlock_t *ptl;
> +	pte_t *ptep;
> +
> +	if (!(vma->vm_flags & VM_MAYSHARE))
> +		return;
> +
> +	/*
> +	 * No need to call adjust_range_if_pmd_sharing_possible(), because
> +	 * we're going to operate on the whole vma
> +	 */

This code will certainly work as intended.  However, I wonder if we should
try to optimize and only flush and call huge_pmd_unshare for addresses where
sharing is possible.  Consider this worst case example:

vm_start = 8G + 2M
vm_end   = 11G - 2M
The vma is 'almost' 3G in size, yet only the range 9G to 10G is possibly
shared.  This routine will potentially call lock/unlock ptl and call
huge_pmd_share for every huge page in the range.  Ideally, we should only
make one call to huge_pmd_share with address 9G.  If the unshare is
successful or not, we are done.  The subtle manipulation of &address in
huge_pmd_unshare will result in only one call if the unshare is successful,
but if unsuccessful we will unnecessarily call huge_pmd_unshare for each
address in the range.

Maybe we start by rounding up vm_start by PUD_SIZE and rounding down
vm_end by PUD_SIZE.  

> +	mmu_notifier_range_init(&range, MMU_NOTIFY_HUGETLB_UNSHARE,
> +				0, vma, mm, vma->vm_start, vma->vm_end);
> +	mmu_notifier_invalidate_range_start(&range);
> +	i_mmap_lock_write(vma->vm_file->f_mapping);
> +	for (address = vma->vm_start; address < vma->vm_end; address += sz) {

Then, change the loop increment to PUD_SIZE.  And, also ignore the &address
manipulation done by huge_pmd_unshare.

> +		ptep = huge_pte_offset(mm, address, sz);
> +		if (!ptep)
> +			continue;
> +		ptl = huge_pte_lock(h, mm, ptep);
> +		huge_pmd_unshare(mm, vma, &address, ptep);
> +		spin_unlock(ptl);
> +	}
> +	flush_hugetlb_tlb_range(vma, vma->vm_start, vma->vm_end);
> +	i_mmap_unlock_write(vma->vm_file->f_mapping);
> +	/*
> +	 * No need to call mmu_notifier_invalidate_range(), see
> +	 * Documentation/vm/mmu_notifier.rst.
> +	 */
> +	mmu_notifier_invalidate_range_end(&range);
> +#endif
> +}
> +
>  static void __wake_userfault(struct userfaultfd_ctx *ctx,
>  			     struct userfaultfd_wake_range *range)
>  {
> @@ -1449,6 +1494,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
>  		vma->vm_flags = new_flags;
>  		vma->vm_userfaultfd_ctx.ctx = ctx;
>  
> +		if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
> +			hugetlb_unshare_all_pmds(vma);
> +
>  	skip:
>  		prev = vma;
>  		start = vma->vm_end;
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> index b8200782dede..ff50c8528113 100644
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -51,6 +51,7 @@ enum mmu_notifier_event {
>  	MMU_NOTIFY_SOFT_DIRTY,
>  	MMU_NOTIFY_RELEASE,
>  	MMU_NOTIFY_MIGRATE,
> +	MMU_NOTIFY_HUGETLB_UNSHARE,

I don't claim to know much about mmu notifiers.  Currently, we use other
event notifiers such as MMU_NOTIFY_CLEAR.  I guess we do 'clear' page table
entries if we unshare.  More than happy to have a MMU_NOTIFY_HUGETLB_UNSHARE
event, but will consumers of the notifications know what this new event type
means?  And, if we introduce this should we use this other places where
huge_pmd_unshare is called?
--
Mike Kravetz

>  };
>  
>  #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
>
Peter Xu Feb. 12, 2021, 9:18 p.m. UTC | #2
On Fri, Feb 12, 2021 at 10:11:39AM -0800, Mike Kravetz wrote:
> On 2/10/21 1:21 PM, Axel Rasmussen wrote:
> > From: Peter Xu <peterx@redhat.com>
> > 
> > Huge pmd sharing for hugetlbfs is racy with userfaultfd-wp because
> > userfaultfd-wp is always based on pgtable entries, so they cannot be shared.
> > 
> > Walk the hugetlb range and unshare all such mappings if there is, right before
> > UFFDIO_REGISTER will succeed and return to userspace.
> > 
> > This will pair with want_pmd_share() in hugetlb code so that huge pmd sharing
> > is completely disabled for userfaultfd-wp registered range.
> > 
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
> > ---
> >  fs/userfaultfd.c             | 48 ++++++++++++++++++++++++++++++++++++
> >  include/linux/mmu_notifier.h |  1 +
> >  2 files changed, 49 insertions(+)
> > 
> > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > index 0be8cdd4425a..1f4a34b1a1e7 100644
> > --- a/fs/userfaultfd.c
> > +++ b/fs/userfaultfd.c
> > @@ -15,6 +15,7 @@
> >  #include <linux/sched/signal.h>
> >  #include <linux/sched/mm.h>
> >  #include <linux/mm.h>
> > +#include <linux/mmu_notifier.h>
> >  #include <linux/poll.h>
> >  #include <linux/slab.h>
> >  #include <linux/seq_file.h>
> > @@ -1191,6 +1192,50 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf,
> >  	}
> >  }
> >  
> > +/*
> > + * This function will unconditionally remove all the shared pmd pgtable entries
> > + * within the specific vma for a hugetlbfs memory range.
> > + */
> > +static void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
> > +{
> > +#ifdef CONFIG_HUGETLB_PAGE
> > +	struct hstate *h = hstate_vma(vma);
> > +	unsigned long sz = huge_page_size(h);
> > +	struct mm_struct *mm = vma->vm_mm;
> > +	struct mmu_notifier_range range;
> > +	unsigned long address;
> > +	spinlock_t *ptl;
> > +	pte_t *ptep;
> > +
> > +	if (!(vma->vm_flags & VM_MAYSHARE))
> > +		return;
> > +
> > +	/*
> > +	 * No need to call adjust_range_if_pmd_sharing_possible(), because
> > +	 * we're going to operate on the whole vma
> > +	 */
> 
> This code will certainly work as intended.  However, I wonder if we should
> try to optimize and only flush and call huge_pmd_unshare for addresses where
> sharing is possible.  Consider this worst case example:
> 
> vm_start = 8G + 2M
> vm_end   = 11G - 2M
> The vma is 'almost' 3G in size, yet only the range 9G to 10G is possibly
> shared.  This routine will potentially call lock/unlock ptl and call
> huge_pmd_share for every huge page in the range.  Ideally, we should only
> make one call to huge_pmd_share with address 9G.  If the unshare is
> successful or not, we are done.  The subtle manipulation of &address in
> huge_pmd_unshare will result in only one call if the unshare is successful,
> but if unsuccessful we will unnecessarily call huge_pmd_unshare for each
> address in the range.
> 
> Maybe we start by rounding up vm_start by PUD_SIZE and rounding down
> vm_end by PUD_SIZE.  

I didn't think that lot since it's slow path, but yeah if that's faster and
without extra logic, then why not. :)

> 
> > +	mmu_notifier_range_init(&range, MMU_NOTIFY_HUGETLB_UNSHARE,
> > +				0, vma, mm, vma->vm_start, vma->vm_end);
> > +	mmu_notifier_invalidate_range_start(&range);
> > +	i_mmap_lock_write(vma->vm_file->f_mapping);
> > +	for (address = vma->vm_start; address < vma->vm_end; address += sz) {
> 
> Then, change the loop increment to PUD_SIZE.  And, also ignore the &address
> manipulation done by huge_pmd_unshare.

Will do!

> 
> > +		ptep = huge_pte_offset(mm, address, sz);
> > +		if (!ptep)
> > +			continue;
> > +		ptl = huge_pte_lock(h, mm, ptep);
> > +		huge_pmd_unshare(mm, vma, &address, ptep);
> > +		spin_unlock(ptl);
> > +	}
> > +	flush_hugetlb_tlb_range(vma, vma->vm_start, vma->vm_end);
> > +	i_mmap_unlock_write(vma->vm_file->f_mapping);
> > +	/*
> > +	 * No need to call mmu_notifier_invalidate_range(), see
> > +	 * Documentation/vm/mmu_notifier.rst.
> > +	 */
> > +	mmu_notifier_invalidate_range_end(&range);
> > +#endif
> > +}
> > +
> >  static void __wake_userfault(struct userfaultfd_ctx *ctx,
> >  			     struct userfaultfd_wake_range *range)
> >  {
> > @@ -1449,6 +1494,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> >  		vma->vm_flags = new_flags;
> >  		vma->vm_userfaultfd_ctx.ctx = ctx;
> >  
> > +		if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
> > +			hugetlb_unshare_all_pmds(vma);
> > +
> >  	skip:
> >  		prev = vma;
> >  		start = vma->vm_end;
> > diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> > index b8200782dede..ff50c8528113 100644
> > --- a/include/linux/mmu_notifier.h
> > +++ b/include/linux/mmu_notifier.h
> > @@ -51,6 +51,7 @@ enum mmu_notifier_event {
> >  	MMU_NOTIFY_SOFT_DIRTY,
> >  	MMU_NOTIFY_RELEASE,
> >  	MMU_NOTIFY_MIGRATE,
> > +	MMU_NOTIFY_HUGETLB_UNSHARE,
> 
> I don't claim to know much about mmu notifiers.  Currently, we use other
> event notifiers such as MMU_NOTIFY_CLEAR.  I guess we do 'clear' page table
> entries if we unshare.  More than happy to have a MMU_NOTIFY_HUGETLB_UNSHARE
> event, but will consumers of the notifications know what this new event type
> means?  And, if we introduce this should we use this other places where
> huge_pmd_unshare is called?

Yeah AFAICT that is a new feature to mmu notifiers and it's not really used a
lot by consumers yet.  Hmm... is there really any consumer at all? I simply
grepped MMU_NOTIFY_UNMAP and see no hook took special care of that.  So it's
some extra information that the upper layer would like to deliever to the
notifiers, it's just not vastly used so far.

So far I didn't worry too much on that either.  MMU_NOTIFY_HUGETLB_UNSHARE is
introduced here simply because I tried to make it explicit, then it's easy to
be overwritten one day if we think huge pmd unshare is not worth a standalone
flag but reuse some other common one.  But I think at least I owe one
documentation of that new enum. :)

I'm not extremely willing to touch the rest callers of huge pmd unshare yet,
unless I've a solid reason.  E.g., one day maybe one mmu notifier hook would
start to monitor some events, then that's a better place imho to change them.
Otherwise any of my future change could be vague, imho.

For this patch - how about I simply go back to use MMU_NOTIFIER_CLEAR instead?

Thanks,
Mike Kravetz Feb. 12, 2021, 9:34 p.m. UTC | #3
On 2/12/21 1:18 PM, Peter Xu wrote:
> On Fri, Feb 12, 2021 at 10:11:39AM -0800, Mike Kravetz wrote:
>> On 2/10/21 1:21 PM, Axel Rasmussen wrote:
>>> From: Peter Xu <peterx@redhat.com>
>>> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
>>> index b8200782dede..ff50c8528113 100644
>>> --- a/include/linux/mmu_notifier.h
>>> +++ b/include/linux/mmu_notifier.h
>>> @@ -51,6 +51,7 @@ enum mmu_notifier_event {
>>>  	MMU_NOTIFY_SOFT_DIRTY,
>>>  	MMU_NOTIFY_RELEASE,
>>>  	MMU_NOTIFY_MIGRATE,
>>> +	MMU_NOTIFY_HUGETLB_UNSHARE,
>>
>> I don't claim to know much about mmu notifiers.  Currently, we use other
>> event notifiers such as MMU_NOTIFY_CLEAR.  I guess we do 'clear' page table
>> entries if we unshare.  More than happy to have a MMU_NOTIFY_HUGETLB_UNSHARE
>> event, but will consumers of the notifications know what this new event type
>> means?  And, if we introduce this should we use this other places where
>> huge_pmd_unshare is called?
> 
> Yeah AFAICT that is a new feature to mmu notifiers and it's not really used a
> lot by consumers yet.  Hmm... is there really any consumer at all? I simply
> grepped MMU_NOTIFY_UNMAP and see no hook took special care of that.  So it's
> some extra information that the upper layer would like to deliever to the
> notifiers, it's just not vastly used so far.
> 
> So far I didn't worry too much on that either.  MMU_NOTIFY_HUGETLB_UNSHARE is
> introduced here simply because I tried to make it explicit, then it's easy to
> be overwritten one day if we think huge pmd unshare is not worth a standalone
> flag but reuse some other common one.  But I think at least I owe one
> documentation of that new enum. :)
> 
> I'm not extremely willing to touch the rest callers of huge pmd unshare yet,
> unless I've a solid reason.  E.g., one day maybe one mmu notifier hook would
> start to monitor some events, then that's a better place imho to change them.
> Otherwise any of my future change could be vague, imho.
> 
> For this patch - how about I simply go back to use MMU_NOTIFIER_CLEAR instead?

I'm good with the new MMU_NOTIFY_HUGETLB_UNSHARE and agree with your reasoning
for adding it.  I really did not know enough about usage which caused me to
question.
Peter Xu Feb. 12, 2021, 10:14 p.m. UTC | #4
On Fri, Feb 12, 2021 at 01:34:03PM -0800, Mike Kravetz wrote:
> I'm good with the new MMU_NOTIFY_HUGETLB_UNSHARE and agree with your reasoning
> for adding it.  I really did not know enough about usage which caused me to
> question.

It actually is a good question..  Because after a 2nd thought I cannot think of
any mmu notifier usage to explicitly listen to huge pmd unshare event - it
could be just a too internal impl detail of hugetlbfs.  I think any new flag
should justify itself when introduced.  Before I could justify it, I think
MMU_NOTIFIER_CLEAR is indeed more suitable.

Thanks,
diff mbox series

Patch

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 0be8cdd4425a..1f4a34b1a1e7 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -15,6 +15,7 @@ 
 #include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
 #include <linux/mm.h>
+#include <linux/mmu_notifier.h>
 #include <linux/poll.h>
 #include <linux/slab.h>
 #include <linux/seq_file.h>
@@ -1191,6 +1192,50 @@  static ssize_t userfaultfd_read(struct file *file, char __user *buf,
 	}
 }
 
+/*
+ * This function will unconditionally remove all the shared pmd pgtable entries
+ * within the specific vma for a hugetlbfs memory range.
+ */
+static void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+	struct hstate *h = hstate_vma(vma);
+	unsigned long sz = huge_page_size(h);
+	struct mm_struct *mm = vma->vm_mm;
+	struct mmu_notifier_range range;
+	unsigned long address;
+	spinlock_t *ptl;
+	pte_t *ptep;
+
+	if (!(vma->vm_flags & VM_MAYSHARE))
+		return;
+
+	/*
+	 * No need to call adjust_range_if_pmd_sharing_possible(), because
+	 * we're going to operate on the whole vma
+	 */
+	mmu_notifier_range_init(&range, MMU_NOTIFY_HUGETLB_UNSHARE,
+				0, vma, mm, vma->vm_start, vma->vm_end);
+	mmu_notifier_invalidate_range_start(&range);
+	i_mmap_lock_write(vma->vm_file->f_mapping);
+	for (address = vma->vm_start; address < vma->vm_end; address += sz) {
+		ptep = huge_pte_offset(mm, address, sz);
+		if (!ptep)
+			continue;
+		ptl = huge_pte_lock(h, mm, ptep);
+		huge_pmd_unshare(mm, vma, &address, ptep);
+		spin_unlock(ptl);
+	}
+	flush_hugetlb_tlb_range(vma, vma->vm_start, vma->vm_end);
+	i_mmap_unlock_write(vma->vm_file->f_mapping);
+	/*
+	 * No need to call mmu_notifier_invalidate_range(), see
+	 * Documentation/vm/mmu_notifier.rst.
+	 */
+	mmu_notifier_invalidate_range_end(&range);
+#endif
+}
+
 static void __wake_userfault(struct userfaultfd_ctx *ctx,
 			     struct userfaultfd_wake_range *range)
 {
@@ -1449,6 +1494,9 @@  static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 		vma->vm_flags = new_flags;
 		vma->vm_userfaultfd_ctx.ctx = ctx;
 
+		if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
+			hugetlb_unshare_all_pmds(vma);
+
 	skip:
 		prev = vma;
 		start = vma->vm_end;
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index b8200782dede..ff50c8528113 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -51,6 +51,7 @@  enum mmu_notifier_event {
 	MMU_NOTIFY_SOFT_DIRTY,
 	MMU_NOTIFY_RELEASE,
 	MMU_NOTIFY_MIGRATE,
+	MMU_NOTIFY_HUGETLB_UNSHARE,
 };
 
 #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)