diff mbox series

[v1,1/2] mm/userfaultfd: don't place zeropages when zeropages are disallowed

Message ID 20240321215954.177730-2-david@redhat.com (mailing list archive)
State New, archived
Headers show
Series s390/mm: shared zeropage + KVM fix and optimization | expand

Commit Message

David Hildenbrand March 21, 2024, 9:59 p.m. UTC
s390x must disable shared zeropages for processes running VMs, because
the VMs could end up making use of "storage keys" or protected
virtualization, which are incompatible with shared zeropages.

Yet, with userfaultfd it is possible to insert shared zeropages into
such processes. Let's fall back to simply allocating a fresh zeroed
anonymous folio and inserting that instead.

mm_forbids_zeropage() was introduced in commit 593befa6ab74 ("mm: introduce
mm_forbids_zeropage function"), briefly before userfaultfd went
upstream.

Note that we don't want to fail the UFFDIO_ZEROPAGE request like we do
for hugetlb; it would be rather unexpected. Further, we also
cannot really indicate "not supported" to user space ahead of time: it
could be that the MM disallows zeropages after userfaultfd was already
registered.

Fixes: c1a4de99fada ("userfaultfd: mcopy_atomic|mfill_zeropage: UFFDIO_COPY|UFFDIO_ZEROPAGE preparation")
Signed-off-by: David Hildenbrand <david@redhat.com>
---
 mm/userfaultfd.c | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

Comments

Peter Xu March 21, 2024, 10:20 p.m. UTC | #1
On Thu, Mar 21, 2024 at 10:59:53PM +0100, David Hildenbrand wrote:
> s390x must disable shared zeropages for processes running VMs, because
> the VMs could end up making use of "storage keys" or protected
> virtualization, which are incompatible with shared zeropages.
> 
> Yet, with userfaultfd it is possible to insert shared zeropages into
> such processes. Let's fallback to simply allocating a fresh zeroed
> anonymous folio and insert that instead.
> 
> mm_forbids_zeropage() was introduced in commit 593befa6ab74 ("mm: introduce
> mm_forbids_zeropage function"), briefly before userfaultfd went
> upstream.
> 
> Note that we don't want to fail the UFFDIO_ZEROPAGE request like we do
> for hugetlb, it would be rather unexpected. Further, we also
> cannot really indicated "not supported" to user space ahead of time: it
> could be that the MM disallows zeropages after userfaultfd was already
> registered.
> 
> Fixes: c1a4de99fada ("userfaultfd: mcopy_atomic|mfill_zeropage: UFFDIO_COPY|UFFDIO_ZEROPAGE preparation")
> Signed-off-by: David Hildenbrand <david@redhat.com>

Reviewed-by: Peter Xu <peterx@redhat.com>

Still, a few comments below.

> ---
>  mm/userfaultfd.c | 35 +++++++++++++++++++++++++++++++++++
>  1 file changed, 35 insertions(+)
> 
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index 712160cd41eca..1d1061ccd1dea 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -316,6 +316,38 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
>  	goto out;
>  }
>  
> +static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd,
> +		 struct vm_area_struct *dst_vma, unsigned long dst_addr)
> +{
> +	struct folio *folio;
> +	int ret;

nitpick: we can set -ENOMEM here, then

> +
> +	folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr);
> +	if (!folio)
> +		return -ENOMEM;

return ret;

> +
> +	ret = -ENOMEM;

drop.

> +	if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
> +		goto out_put;
> +
> +	/*
> +	 * The memory barrier inside __folio_mark_uptodate makes sure that
> +	 * preceding stores to the page contents become visible before
> +	 * the set_pte_at() write.
> +	 */

This comment doesn't apply.  We can drop it.

Thanks,

> +	__folio_mark_uptodate(folio);
> +
> +	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
> +				       &folio->page, true, 0);
> +	if (ret)
> +		goto out_put;
> +
> +	return 0;
> +out_put:
> +	folio_put(folio);
> +	return ret;
> +}
> +
>  static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
>  				     struct vm_area_struct *dst_vma,
>  				     unsigned long dst_addr)
> @@ -324,6 +356,9 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
>  	spinlock_t *ptl;
>  	int ret;
>  
> +	if (mm_forbids_zeropage(dst_vma->mm))
> +		return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr);
> +
>  	_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
>  					 dst_vma->vm_page_prot));
>  	ret = -EAGAIN;
> -- 
> 2.43.2
>
David Hildenbrand March 21, 2024, 10:29 p.m. UTC | #2
On 21.03.24 23:20, Peter Xu wrote:
> On Thu, Mar 21, 2024 at 10:59:53PM +0100, David Hildenbrand wrote:
>> s390x must disable shared zeropages for processes running VMs, because
>> the VMs could end up making use of "storage keys" or protected
>> virtualization, which are incompatible with shared zeropages.
>>
>> Yet, with userfaultfd it is possible to insert shared zeropages into
>> such processes. Let's fallback to simply allocating a fresh zeroed
>> anonymous folio and insert that instead.
>>
>> mm_forbids_zeropage() was introduced in commit 593befa6ab74 ("mm: introduce
>> mm_forbids_zeropage function"), briefly before userfaultfd went
>> upstream.
>>
>> Note that we don't want to fail the UFFDIO_ZEROPAGE request like we do
>> for hugetlb, it would be rather unexpected. Further, we also
>> cannot really indicated "not supported" to user space ahead of time: it
>> could be that the MM disallows zeropages after userfaultfd was already
>> registered.
>>
>> Fixes: c1a4de99fada ("userfaultfd: mcopy_atomic|mfill_zeropage: UFFDIO_COPY|UFFDIO_ZEROPAGE preparation")
>> Signed-off-by: David Hildenbrand <david@redhat.com>
> 
> Reviewed-by: Peter Xu <peterx@redhat.com>
> 
> Still, a few comments below.
> 
>> ---
>>   mm/userfaultfd.c | 35 +++++++++++++++++++++++++++++++++++
>>   1 file changed, 35 insertions(+)
>>
>> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
>> index 712160cd41eca..1d1061ccd1dea 100644
>> --- a/mm/userfaultfd.c
>> +++ b/mm/userfaultfd.c
>> @@ -316,6 +316,38 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
>>   	goto out;
>>   }
>>   
>> +static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd,
>> +		 struct vm_area_struct *dst_vma, unsigned long dst_addr)
>> +{
>> +	struct folio *folio;
>> +	int ret;
> 
> nitpick: we can set -ENOMEM here, then
> 
>> +
>> +	folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr);
>> +	if (!folio)
>> +		return -ENOMEM;
> 
> return ret;
> 
>> +
>> +	ret = -ENOMEM;
> 
> drop.

Sure!

> 
>> +	if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
>> +		goto out_put;
>> +
>> +	/*
>> +	 * The memory barrier inside __folio_mark_uptodate makes sure that
>> +	 * preceding stores to the page contents become visible before
>> +	 * the set_pte_at() write.
>> +	 */
> 
> This comment doesn't apply.  We can drop it.
> 

I thought the same until I spotted that comment (where uffd originally 
copied this from I strongly assume) in do_anonymous_page().

"Preceding stores" here are: zeroing out the memory.


Thanks for the fast review!
Peter Xu March 21, 2024, 10:46 p.m. UTC | #3
On Thu, Mar 21, 2024 at 11:29:45PM +0100, David Hildenbrand wrote:
> On 21.03.24 23:20, Peter Xu wrote:
> > On Thu, Mar 21, 2024 at 10:59:53PM +0100, David Hildenbrand wrote:
> > > s390x must disable shared zeropages for processes running VMs, because
> > > the VMs could end up making use of "storage keys" or protected
> > > virtualization, which are incompatible with shared zeropages.
> > > 
> > > Yet, with userfaultfd it is possible to insert shared zeropages into
> > > such processes. Let's fallback to simply allocating a fresh zeroed
> > > anonymous folio and insert that instead.
> > > 
> > > mm_forbids_zeropage() was introduced in commit 593befa6ab74 ("mm: introduce
> > > mm_forbids_zeropage function"), briefly before userfaultfd went
> > > upstream.
> > > 
> > > Note that we don't want to fail the UFFDIO_ZEROPAGE request like we do
> > > for hugetlb, it would be rather unexpected. Further, we also
> > > cannot really indicated "not supported" to user space ahead of time: it
> > > could be that the MM disallows zeropages after userfaultfd was already
> > > registered.
> > > 
> > > Fixes: c1a4de99fada ("userfaultfd: mcopy_atomic|mfill_zeropage: UFFDIO_COPY|UFFDIO_ZEROPAGE preparation")
> > > Signed-off-by: David Hildenbrand <david@redhat.com>
> > 
> > Reviewed-by: Peter Xu <peterx@redhat.com>
> > 
> > Still, a few comments below.
> > 
> > > ---
> > >   mm/userfaultfd.c | 35 +++++++++++++++++++++++++++++++++++
> > >   1 file changed, 35 insertions(+)
> > > 
> > > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> > > index 712160cd41eca..1d1061ccd1dea 100644
> > > --- a/mm/userfaultfd.c
> > > +++ b/mm/userfaultfd.c
> > > @@ -316,6 +316,38 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
> > >   	goto out;
> > >   }
> > > +static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd,
> > > +		 struct vm_area_struct *dst_vma, unsigned long dst_addr)
> > > +{
> > > +	struct folio *folio;
> > > +	int ret;
> > 
> > nitpick: we can set -ENOMEM here, then
> > 
> > > +
> > > +	folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr);
> > > +	if (!folio)
> > > +		return -ENOMEM;
> > 
> > return ret;
> > 
> > > +
> > > +	ret = -ENOMEM;
> > 
> > drop.
> 
> Sure!
> 
> > 
> > > +	if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
> > > +		goto out_put;
> > > +
> > > +	/*
> > > +	 * The memory barrier inside __folio_mark_uptodate makes sure that
> > > +	 * preceding stores to the page contents become visible before
> > > +	 * the set_pte_at() write.
> > > +	 */
> > 
> > This comment doesn't apply.  We can drop it.
> > 
> 
> I thought the same until I spotted that comment (where uffd originally
> copied this from I strongly assume) in do_anonymous_page().
> 
> "Preceding stores" here are: zeroing out the memory.

Ah.. that's okay then.

Considering that userfaultfd used to be pretty cautious about such ordering,
since its specialty involves many user updates to the page, would you mind if
we spelled those details out?

	/*
	 * __folio_mark_uptodate contains the memory barrier to make sure
	 * the page updates to the zero page will be visible before
	 * installing the pgtable entries.  See do_anonymous_page().
	 */

Or anything better than my wordings.

Thanks!
David Hildenbrand March 22, 2024, 8:13 a.m. UTC | #4
On 21.03.24 23:46, Peter Xu wrote:
> On Thu, Mar 21, 2024 at 11:29:45PM +0100, David Hildenbrand wrote:
>> On 21.03.24 23:20, Peter Xu wrote:
>>> On Thu, Mar 21, 2024 at 10:59:53PM +0100, David Hildenbrand wrote:
>>>> s390x must disable shared zeropages for processes running VMs, because
>>>> the VMs could end up making use of "storage keys" or protected
>>>> virtualization, which are incompatible with shared zeropages.
>>>>
>>>> Yet, with userfaultfd it is possible to insert shared zeropages into
>>>> such processes. Let's fallback to simply allocating a fresh zeroed
>>>> anonymous folio and insert that instead.
>>>>
>>>> mm_forbids_zeropage() was introduced in commit 593befa6ab74 ("mm: introduce
>>>> mm_forbids_zeropage function"), briefly before userfaultfd went
>>>> upstream.
>>>>
>>>> Note that we don't want to fail the UFFDIO_ZEROPAGE request like we do
>>>> for hugetlb, it would be rather unexpected. Further, we also
>>>> cannot really indicated "not supported" to user space ahead of time: it
>>>> could be that the MM disallows zeropages after userfaultfd was already
>>>> registered.
>>>>
>>>> Fixes: c1a4de99fada ("userfaultfd: mcopy_atomic|mfill_zeropage: UFFDIO_COPY|UFFDIO_ZEROPAGE preparation")
>>>> Signed-off-by: David Hildenbrand <david@redhat.com>
>>>
>>> Reviewed-by: Peter Xu <peterx@redhat.com>
>>>
>>> Still, a few comments below.
>>>
>>>> ---
>>>>    mm/userfaultfd.c | 35 +++++++++++++++++++++++++++++++++++
>>>>    1 file changed, 35 insertions(+)
>>>>
>>>> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
>>>> index 712160cd41eca..1d1061ccd1dea 100644
>>>> --- a/mm/userfaultfd.c
>>>> +++ b/mm/userfaultfd.c
>>>> @@ -316,6 +316,38 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
>>>>    	goto out;
>>>>    }
>>>> +static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd,
>>>> +		 struct vm_area_struct *dst_vma, unsigned long dst_addr)
>>>> +{
>>>> +	struct folio *folio;
>>>> +	int ret;
>>>
>>> nitpick: we can set -ENOMEM here, then
>>>
>>>> +
>>>> +	folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr);
>>>> +	if (!folio)
>>>> +		return -ENOMEM;
>>>
>>> return ret;
>>>
>>>> +
>>>> +	ret = -ENOMEM;
>>>
>>> drop.
>>
>> Sure!
>>
>>>
>>>> +	if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
>>>> +		goto out_put;
>>>> +
>>>> +	/*
>>>> +	 * The memory barrier inside __folio_mark_uptodate makes sure that
>>>> +	 * preceding stores to the page contents become visible before
>>>> +	 * the set_pte_at() write.
>>>> +	 */
>>>
>>> This comment doesn't apply.  We can drop it.
>>>
>>
>> I thought the same until I spotted that comment (where uffd originally
>> copied this from I strongly assume) in do_anonymous_page().
>>
>> "Preceding stores" here are: zeroing out the memory.
> 
> Ah.. that's okay then.
> 
> Considering that userfault used to be pretty cautious on such ordering, as
> its specialty to involve many user updates on the page, would you mind we
> mention those details out?
> 
> 	/*
> 	 * __folio_mark_uptodate contains the memory barrier to make sure
>           * the page updates to the zero page will be visible before
> 	 * installing the pgtable entries.  See do_anonymous_page().
> 	 */
> 
> Or anything better than my wordings.

Sure, I'd slightly reword it. The following on top:

diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 1d1061ccd1dea..9d385696fb891 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -320,20 +320,19 @@ static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd,
  		 struct vm_area_struct *dst_vma, unsigned long dst_addr)
  {
  	struct folio *folio;
-	int ret;
+	int ret = -ENOMEM;
  
  	folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr);
  	if (!folio)
-		return -ENOMEM;
+		return ret;
  
-	ret = -ENOMEM;
  	if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
  		goto out_put;
  
  	/*
  	 * The memory barrier inside __folio_mark_uptodate makes sure that
-	 * preceding stores to the page contents become visible before
-	 * the set_pte_at() write.
+	 * zeroing out the folio becomes visible before mapping the page
+	 * using set_pte_at(). See do_anonymous_page().
  	 */
  	__folio_mark_uptodate(folio);
  

Thanks!
diff mbox series

Patch

diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 712160cd41eca..1d1061ccd1dea 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -316,6 +316,38 @@  static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
 	goto out;
 }
 
+static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd,
+		 struct vm_area_struct *dst_vma, unsigned long dst_addr)
+{
+	struct folio *folio;
+	int ret;
+
+	folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr);
+	if (!folio)
+		return -ENOMEM;
+
+	ret = -ENOMEM;
+	if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
+		goto out_put;
+
+	/*
+	 * The memory barrier inside __folio_mark_uptodate makes sure that
+	 * preceding stores to the page contents become visible before
+	 * the set_pte_at() write.
+	 */
+	__folio_mark_uptodate(folio);
+
+	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
+				       &folio->page, true, 0);
+	if (ret)
+		goto out_put;
+
+	return 0;
+out_put:
+	folio_put(folio);
+	return ret;
+}
+
 static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
 				     struct vm_area_struct *dst_vma,
 				     unsigned long dst_addr)
@@ -324,6 +356,9 @@  static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
 	spinlock_t *ptl;
 	int ret;
 
+	if (mm_forbids_zeropage(dst_vma->vm_mm))
+		return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr);
+
 	_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
 					 dst_vma->vm_page_prot));
 	ret = -EAGAIN;