
[v1,2/5] s390/uv: convert gmap_make_secure() to work on folios

Message ID: 20240404163642.1125529-3-david@redhat.com
State: New
Series: s390: page_mapcount(), page_has_private() and PG_arch_1

Commit Message

David Hildenbrand April 4, 2024, 4:36 p.m. UTC
We have various goals that require gmap_make_secure() to only work on
folios. We want to limit the use of page_mapcount() to the places where it
is absolutely necessary, we want to avoid using page flags of tail
pages, and we want to remove page_has_private().

So, let's convert gmap_make_secure() to folios. While s390x makes sure
to never have PMD-mapped THP in processes that use KVM -- by remapping
them using PTEs in thp_split_walk_pmd_entry()->split_huge_pmd() -- we might
still find PTE-mapped THPs and could end up working on tail pages of
such large folios for now.

To handle that cleanly, let's simply split any PTE-mapped large folio,
so we can be sure that we are always working with small folios and never
on tail pages.

There is no real change: splitting will similarly fail on unexpected folio
references, just like it would already when we try to freeze the folio
refcount.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 arch/s390/include/asm/page.h |  1 +
 arch/s390/kernel/uv.c        | 66 ++++++++++++++++++++++--------------
 2 files changed, 42 insertions(+), 25 deletions(-)

Comments

Matthew Wilcox April 5, 2024, 3:29 a.m. UTC | #1
On Thu, Apr 04, 2024 at 06:36:39PM +0200, David Hildenbrand wrote:
> +		/* We might get PTE-mapped large folios; split them first. */
> +		if (folio_test_large(folio)) {
> +			rc = -E2BIG;

We agree up to this point.  I just turned this into -EINVAL.

>  
> +	if (rc == -E2BIG) {
> +		/*
> +		 * Splitting might fail with -EBUSY due to unexpected folio
> +		 * references, just like make_folio_secure(). So handle it
> +		 * ahead of time without the PTL being held.
> +		 */
> +		folio_lock(folio);
> +		rc = split_folio(folio);
> +		folio_unlock(folio);
> +		folio_put(folio);
> +	}

Ummm ... if split_folio() succeeds, aren't we going to return 0 from
this function, which will be interpreted as make_folio_secure() having
succeeded?
Claudio Imbrenda April 10, 2024, 5:31 p.m. UTC | #2
On Thu,  4 Apr 2024 18:36:39 +0200
David Hildenbrand <david@redhat.com> wrote:

> We have various goals that require gmap_make_secure() to only work on
> folios. We want to limit the use of page_mapcount() to the places where it
> is absolutely necessary, we want to avoid using page flags of tail
> pages, and we want to remove page_has_private().
> 
> So, let's convert gmap_make_secure() to folios. While s390x makes sure
> to never have PMD-mapped THP in processes that use KVM -- by remapping
> them using PTEs in thp_split_walk_pmd_entry()->split_huge_pmd() -- we might
> still find PTE-mapped THPs and could end up working on tail pages of
> such large folios for now.
> 
> To handle that cleanly, let's simply split any PTE-mapped large folio,
> so we can be sure that we are always working with small folios and never
> on tail pages.
> 
> There is no real change: splitting will similarly fail on unexpected folio
> references, just like it would already when we try to freeze the folio
> refcount.
> 
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  arch/s390/include/asm/page.h |  1 +
>  arch/s390/kernel/uv.c        | 66 ++++++++++++++++++++++--------------
>  2 files changed, 42 insertions(+), 25 deletions(-)
> 
> diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
> index 9381879f7ecf..54d015bcd8e3 100644
> --- a/arch/s390/include/asm/page.h
> +++ b/arch/s390/include/asm/page.h
> @@ -215,6 +215,7 @@ static inline unsigned long __phys_addr(unsigned long x, bool is_31bit)
>  
>  #define phys_to_page(phys)	pfn_to_page(phys_to_pfn(phys))
>  #define page_to_phys(page)	pfn_to_phys(page_to_pfn(page))
> +#define folio_to_phys(folio)	pfn_to_phys(folio_pfn(folio))
>  
>  static inline void *pfn_to_virt(unsigned long pfn)
>  {
> diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
> index 7401838b960b..adcbd4b13035 100644
> --- a/arch/s390/kernel/uv.c
> +++ b/arch/s390/kernel/uv.c
> @@ -181,36 +181,36 @@ int uv_convert_owned_from_secure(unsigned long paddr)
>  }
>  
>  /*
> - * Calculate the expected ref_count for a page that would otherwise have no
> + * Calculate the expected ref_count for a folio that would otherwise have no
>   * further pins. This was cribbed from similar functions in other places in
>   * the kernel, but with some slight modifications. We know that a secure
> - * page can not be a huge page for example.
> + * folio can only be a small folio for example.
>   */
> -static int expected_page_refs(struct page *page)
> +static int expected_folio_refs(struct folio *folio)
>  {
>  	int res;
>  
> -	res = page_mapcount(page);
> -	if (PageSwapCache(page)) {
> +	res = folio_mapcount(folio);
> +	if (folio_test_swapcache(folio)) {
>  		res++;
> -	} else if (page_mapping(page)) {
> +	} else if (folio_mapping(folio)) {
>  		res++;
> -		if (page_has_private(page))
> +		if (folio_has_private(folio))
>  			res++;
>  	}
>  	return res;
>  }
>  
> -static int make_page_secure(struct page *page, struct uv_cb_header *uvcb)
> +static int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb)
>  {
>  	int expected, cc = 0;
>  
> -	if (PageWriteback(page))
> +	if (folio_test_writeback(folio))
>  		return -EAGAIN;
> -	expected = expected_page_refs(page);
> -	if (!page_ref_freeze(page, expected))
> +	expected = expected_folio_refs(folio);
> +	if (!folio_ref_freeze(folio, expected))
>  		return -EBUSY;
> -	set_bit(PG_arch_1, &page->flags);
> +	set_bit(PG_arch_1, &folio->flags);
>  	/*
>  	 * If the UVC does not succeed or fail immediately, we don't want to
>  	 * loop for long, or we might get stall notifications.
> @@ -220,9 +220,9 @@ static int make_page_secure(struct page *page, struct uv_cb_header *uvcb)
>  	 * -EAGAIN and we let the callers deal with it.
>  	 */
>  	cc = __uv_call(0, (u64)uvcb);
> -	page_ref_unfreeze(page, expected);
> +	folio_ref_unfreeze(folio, expected);
>  	/*
> -	 * Return -ENXIO if the page was not mapped, -EINVAL for other errors.
> +	 * Return -ENXIO if the folio was not mapped, -EINVAL for other errors.
>  	 * If busy or partially completed, return -EAGAIN.
>  	 */
>  	if (cc == UVC_CC_OK)
> @@ -277,7 +277,7 @@ int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
>  	bool local_drain = false;
>  	spinlock_t *ptelock;
>  	unsigned long uaddr;
> -	struct page *page;
> +	struct folio *folio;
>  	pte_t *ptep;
>  	int rc;
>  
> @@ -306,33 +306,49 @@ int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
>  	if (!ptep)
>  		goto out;
>  	if (pte_present(*ptep) && !(pte_val(*ptep) & _PAGE_INVALID) && pte_write(*ptep)) {
> -		page = pte_page(*ptep);
> +		folio = page_folio(pte_page(*ptep));
>  		rc = -EAGAIN;
> -		if (trylock_page(page)) {
> +
> +		/* We might get PTE-mapped large folios; split them first. */
> +		if (folio_test_large(folio)) {
> +			rc = -E2BIG;
> +		} else if (folio_trylock(folio)) {
>  			if (should_export_before_import(uvcb, gmap->mm))
> -				uv_convert_from_secure(page_to_phys(page));
> -			rc = make_page_secure(page, uvcb);
> -			unlock_page(page);
> +				uv_convert_from_secure(folio_to_phys(folio));
> +			rc = make_folio_secure(folio, uvcb);
> +			folio_unlock(folio);
>  		}
>  
>  		/*
> -		 * Once we drop the PTL, the page may get unmapped and
> +		 * Once we drop the PTL, the folio may get unmapped and
>  		 * freed immediately. We need a temporary reference.
>  		 */
> -		if (rc == -EAGAIN)
> -			get_page(page);
> +		if (rc == -EAGAIN || rc == -E2BIG)
> +			folio_get(folio);
>  	}
>  	pte_unmap_unlock(ptep, ptelock);
>  out:
>  	mmap_read_unlock(gmap->mm);
>  
> +	if (rc == -E2BIG) {
> +		/*
> +		 * Splitting might fail with -EBUSY due to unexpected folio
> +		 * references, just like make_folio_secure(). So handle it
> +		 * ahead of time without the PTL being held.
> +		 */
> +		folio_lock(folio);
> +		rc = split_folio(folio);

if split_folio returns -EAGAIN...

> +		folio_unlock(folio);
> +		folio_put(folio);
> +	}
> +
>  	if (rc == -EAGAIN) {

... we will not skip this ...

>  		/*
>  		 * If we are here because the UVC returned busy or partial
>  		 * completion, this is just a useless check, but it is safe.
>  		 */
> -		wait_on_page_writeback(page);
> -		put_page(page);
> +		folio_wait_writeback(folio);
> +		folio_put(folio);

... and we will do one folio_put() too many

>  	} else if (rc == -EBUSY) {
>  		/*
>  		 * If we have tried a local drain and the page refcount

are we sure that split_folio() can never return -EAGAIN now and in the
future too?

maybe just change it to  } else if (...   ?
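
For illustration, a minimal sketch of the chaining suggested here (only
the error handling after mmap_read_unlock() is shown, everything else as
in the patch):

	if (rc == -E2BIG) {
		/*
		 * Split the PTE-mapped large folio without the PTL held.
		 * If split_folio() fails with -EAGAIN, that rc simply
		 * propagates to the caller; the chained branch below is
		 * not entered, so there is no second folio_put().
		 */
		folio_lock(folio);
		rc = split_folio(folio);
		folio_unlock(folio);
		folio_put(folio);
	} else if (rc == -EAGAIN) {
		/* Only -EAGAIN from the make_folio_secure() path gets here. */
		folio_wait_writeback(folio);
		folio_put(folio);
	} else if (rc == -EBUSY) {
		/* -EBUSY handling (LRU drain and retry) unchanged */
	}

This only avoids the double folio_put(); the rc == 0 case raised in #1
would still need the "goto again" handling discussed below.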
Claudio Imbrenda April 10, 2024, 5:32 p.m. UTC | #3
On Fri, 5 Apr 2024 09:09:30 +0200
David Hildenbrand <david@redhat.com> wrote:

> On 05.04.24 05:29, Matthew Wilcox wrote:
> > On Thu, Apr 04, 2024 at 06:36:39PM +0200, David Hildenbrand wrote:  
> >> +		/* We might get PTE-mapped large folios; split them first. */
> >> +		if (folio_test_large(folio)) {
> >> +			rc = -E2BIG;  
> > 
> > We agree to this point.  I just turned this into -EINVAL.
> >   
> >>   
> >> +	if (rc == -E2BIG) {
> >> +		/*
> >> +		 * Splitting might fail with -EBUSY due to unexpected folio
> >> +		 * references, just like make_folio_secure(). So handle it
> >> +		 * ahead of time without the PTL being held.
> >> +		 */
> >> +		folio_lock(folio);
> >> +		rc = split_folio(folio);
> >> +		folio_unlock(folio);
> >> +		folio_put(folio);
> >> +	}  
> > 
> > Ummm ... if split_folio() succeeds, aren't we going to return 0 from
> > this function, which will be interpreted as make_folio_secure() having
> > succeeded?  
> 
> I assume the code would have to handle that, because it must deal with 
> possible races that would try to convert the folio page.
> 
> But the right thing to do is
> 
> if (!rc)
> 	goto again;
> 
> after the put.

yes please
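
Concretely, that would sit at the end of the -E2BIG block, roughly like
this (a sketch, relying on the existing again: label at the top of
gmap_make_secure()):

	if (rc == -E2BIG) {
		folio_lock(folio);
		rc = split_folio(folio);
		folio_unlock(folio);
		folio_put(folio);
		/* The split worked; retry, now backed by small folios. */
		if (!rc)
			goto again;
	}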
David Hildenbrand April 11, 2024, 9:09 a.m. UTC | #4
[...]

>> +	if (rc == -E2BIG) {
>> +		/*
>> +		 * Splitting might fail with -EBUSY due to unexpected folio
>> +		 * references, just like make_folio_secure(). So handle it
>> +		 * ahead of time without the PTL being held.
>> +		 */
>> +		folio_lock(folio);
>> +		rc = split_folio(folio);
> 
> if split_folio returns -EAGAIN...
> 
>> +		folio_unlock(folio);
>> +		folio_put(folio);
>> +	}
>> +
>>   	if (rc == -EAGAIN) {
> 
> ... we will not skip this ...
> 
>>   		/*
>>   		 * If we are here because the UVC returned busy or partial
>>   		 * completion, this is just a useless check, but it is safe.
>>   		 */
>> -		wait_on_page_writeback(page);
>> -		put_page(page);
>> +		folio_wait_writeback(folio);
>> +		folio_put(folio);
> 
> ... and we will do one folio_put() too many
> 
>>   	} else if (rc == -EBUSY) {
>>   		/*
>>   		 * If we have tried a local drain and the page refcount
> 
> are we sure that split_folio() can never return -EAGAIN now and in the
> future too?

Yes, good point: in contrast to the documentation, that can actually 
happen! The documentation is even wrong: split_folio() returns -EAGAIN 
if there are unexpected folio references (e.g., pinned). Thanks for 
raising that.

> 
> maybe just change it to  } else if (...   ?


I think I'll make it all clearer by handling split_folio() return values 
separately.
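
For example, something along these lines, as a sketch only (the actual
v2 may differ; it reuses the again: label at the top of the function and
lets a split_folio() failure propagate, matching the "we let the callers
deal with it" approach used for -EAGAIN elsewhere in this file):

	if (rc == -E2BIG) {
		/*
		 * Splitting might fail due to unexpected folio references,
		 * so handle it ahead of time without the PTL being held.
		 */
		folio_lock(folio);
		rc = split_folio(folio);
		folio_unlock(folio);
		folio_put(folio);

		switch (rc) {
		case 0:
			/* Splitting succeeded; retry, now on small folios. */
			goto again;
		case -EAGAIN:
			/* Unexpected folio references (e.g., pinned). */
			return -EAGAIN;
		default:
			return rc;
		}
	}

Handling the return values explicitly avoids both problems above: a
successful split no longer falls through as a bogus "success" of
make_folio_secure(), and a split_folio() -EAGAIN can no longer trigger a
second folio_put() via the writeback path.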

Patch

diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index 9381879f7ecf..54d015bcd8e3 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -215,6 +215,7 @@  static inline unsigned long __phys_addr(unsigned long x, bool is_31bit)
 
 #define phys_to_page(phys)	pfn_to_page(phys_to_pfn(phys))
 #define page_to_phys(page)	pfn_to_phys(page_to_pfn(page))
+#define folio_to_phys(folio)	pfn_to_phys(folio_pfn(folio))
 
 static inline void *pfn_to_virt(unsigned long pfn)
 {
diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
index 7401838b960b..adcbd4b13035 100644
--- a/arch/s390/kernel/uv.c
+++ b/arch/s390/kernel/uv.c
@@ -181,36 +181,36 @@  int uv_convert_owned_from_secure(unsigned long paddr)
 }
 
 /*
- * Calculate the expected ref_count for a page that would otherwise have no
+ * Calculate the expected ref_count for a folio that would otherwise have no
  * further pins. This was cribbed from similar functions in other places in
  * the kernel, but with some slight modifications. We know that a secure
- * page can not be a huge page for example.
+ * folio can only be a small folio for example.
  */
-static int expected_page_refs(struct page *page)
+static int expected_folio_refs(struct folio *folio)
 {
 	int res;
 
-	res = page_mapcount(page);
-	if (PageSwapCache(page)) {
+	res = folio_mapcount(folio);
+	if (folio_test_swapcache(folio)) {
 		res++;
-	} else if (page_mapping(page)) {
+	} else if (folio_mapping(folio)) {
 		res++;
-		if (page_has_private(page))
+		if (folio_has_private(folio))
 			res++;
 	}
 	return res;
 }
 
-static int make_page_secure(struct page *page, struct uv_cb_header *uvcb)
+static int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb)
 {
 	int expected, cc = 0;
 
-	if (PageWriteback(page))
+	if (folio_test_writeback(folio))
 		return -EAGAIN;
-	expected = expected_page_refs(page);
-	if (!page_ref_freeze(page, expected))
+	expected = expected_folio_refs(folio);
+	if (!folio_ref_freeze(folio, expected))
 		return -EBUSY;
-	set_bit(PG_arch_1, &page->flags);
+	set_bit(PG_arch_1, &folio->flags);
 	/*
 	 * If the UVC does not succeed or fail immediately, we don't want to
 	 * loop for long, or we might get stall notifications.
@@ -220,9 +220,9 @@  static int make_page_secure(struct page *page, struct uv_cb_header *uvcb)
 	 * -EAGAIN and we let the callers deal with it.
 	 */
 	cc = __uv_call(0, (u64)uvcb);
-	page_ref_unfreeze(page, expected);
+	folio_ref_unfreeze(folio, expected);
 	/*
-	 * Return -ENXIO if the page was not mapped, -EINVAL for other errors.
+	 * Return -ENXIO if the folio was not mapped, -EINVAL for other errors.
 	 * If busy or partially completed, return -EAGAIN.
 	 */
 	if (cc == UVC_CC_OK)
@@ -277,7 +277,7 @@  int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
 	bool local_drain = false;
 	spinlock_t *ptelock;
 	unsigned long uaddr;
-	struct page *page;
+	struct folio *folio;
 	pte_t *ptep;
 	int rc;
 
@@ -306,33 +306,49 @@  int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
 	if (!ptep)
 		goto out;
 	if (pte_present(*ptep) && !(pte_val(*ptep) & _PAGE_INVALID) && pte_write(*ptep)) {
-		page = pte_page(*ptep);
+		folio = page_folio(pte_page(*ptep));
 		rc = -EAGAIN;
-		if (trylock_page(page)) {
+
+		/* We might get PTE-mapped large folios; split them first. */
+		if (folio_test_large(folio)) {
+			rc = -E2BIG;
+		} else if (folio_trylock(folio)) {
 			if (should_export_before_import(uvcb, gmap->mm))
-				uv_convert_from_secure(page_to_phys(page));
-			rc = make_page_secure(page, uvcb);
-			unlock_page(page);
+				uv_convert_from_secure(folio_to_phys(folio));
+			rc = make_folio_secure(folio, uvcb);
+			folio_unlock(folio);
 		}
 
 		/*
-		 * Once we drop the PTL, the page may get unmapped and
+		 * Once we drop the PTL, the folio may get unmapped and
 		 * freed immediately. We need a temporary reference.
 		 */
-		if (rc == -EAGAIN)
-			get_page(page);
+		if (rc == -EAGAIN || rc == -E2BIG)
+			folio_get(folio);
 	}
 	pte_unmap_unlock(ptep, ptelock);
 out:
 	mmap_read_unlock(gmap->mm);
 
+	if (rc == -E2BIG) {
+		/*
+		 * Splitting might fail with -EBUSY due to unexpected folio
+		 * references, just like make_folio_secure(). So handle it
+		 * ahead of time without the PTL being held.
+		 */
+		folio_lock(folio);
+		rc = split_folio(folio);
+		folio_unlock(folio);
+		folio_put(folio);
+	}
+
 	if (rc == -EAGAIN) {
 		/*
 		 * If we are here because the UVC returned busy or partial
 		 * completion, this is just a useless check, but it is safe.
 		 */
-		wait_on_page_writeback(page);
-		put_page(page);
+		folio_wait_writeback(folio);
+		folio_put(folio);
 	} else if (rc == -EBUSY) {
 		/*
 		 * If we have tried a local drain and the page refcount