
[GIT,PULL,v2,09/20] KVM: s390: move pv gmap functions into kvm

Message ID 20250131112510.48531-10-imbrenda@linux.ibm.com (mailing list archive)
State New, archived
Series [GIT,PULL,v2,01/20] KVM: s390: vsie: fix some corner-cases when grabbing vsie pages

Commit Message

Claudio Imbrenda Jan. 31, 2025, 11:24 a.m. UTC
Move gmap related functions from kernel/uv into kvm.

Create a new file to collect gmap-related functions.

Reviewed-by: Janosch Frank <frankja@linux.ibm.com>
Reviewed-by: Christoph Schlameuss <schlameuss@linux.ibm.com>
[fixed unpack_one(), thanks mhartmay@linux.ibm.com]
Link: https://lore.kernel.org/r/20250123144627.312456-6-imbrenda@linux.ibm.com
Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Message-ID: <20250123144627.312456-6-imbrenda@linux.ibm.com>
---
 arch/s390/include/asm/gmap.h |   1 +
 arch/s390/include/asm/uv.h   |   6 +-
 arch/s390/kernel/uv.c        | 292 ++++-------------------------------
 arch/s390/kvm/Makefile       |   2 +-
 arch/s390/kvm/gmap.c         | 212 +++++++++++++++++++++++++
 arch/s390/kvm/gmap.h         |  17 ++
 arch/s390/kvm/intercept.c    |   3 +-
 arch/s390/kvm/kvm-s390.c     |   1 +
 arch/s390/kvm/pv.c           |  21 +++
 arch/s390/mm/gmap.c          |  28 ++++
 10 files changed, 315 insertions(+), 268 deletions(-)
 create mode 100644 arch/s390/kvm/gmap.c
 create mode 100644 arch/s390/kvm/gmap.h

Comments

David Hildenbrand Feb. 12, 2025, 4:55 p.m. UTC | #1
On 31.01.25 12:24, Claudio Imbrenda wrote:
> Move gmap related functions from kernel/uv into kvm.
> 
> Create a new file to collect gmap-related functions.
> 
> Reviewed-by: Janosch Frank <frankja@linux.ibm.com>
> Reviewed-by: Christoph Schlameuss <schlameuss@linux.ibm.com>
> [fixed unpack_one(), thanks mhartmay@linux.ibm.com]
> Link: https://lore.kernel.org/r/20250123144627.312456-6-imbrenda@linux.ibm.com
> Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
> Message-ID: <20250123144627.312456-6-imbrenda@linux.ibm.com>
> ---

This patch breaks large folio splitting because you end up un-refing
the wrong folios after a split; I tried to make it work but, whether
because of other changes in this patch or in other patches, I
cannot get it to work and have to give up for today.

Running a simple VM backed by memory-backend-memfd
(which now uses large folios) no longer works (random refcount underflows /
freeing of wrong pages).



The following should be required (but, according to my testing, is insufficient):

 From 71fafff5183c637f20830f6f346e8c9f3eafeb59 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 12 Feb 2025 16:00:32 +0100
Subject: [PATCH] KVM: s390: fix splitting of large folios

If we end up splitting the large folio, doing a put_page() will drop the
wrong reference (unless it was the head page), because we are holding a
reference to the old (large) folio. Similarly, doing another
page_folio() after the split is wrong.

The result is that we end up freeing a page that is still mapped+used.

To fix it, let's pass the page and call split_huge_page() instead.

As an alternative, we could convert all code to use folios, and to
look up the page again from the page table after our split; however, in
the context of non-uniform folio splits [1], it makes sense to pass the
page where we really want to split.

[1] https://lkml.kernel.org/r/20250211155034.268962-1-ziy@nvidia.com

Fixes: 5cbe24350b7d ("KVM: s390: move pv gmap functions into kvm")
Signed-off-by: David Hildenbrand <david@redhat.com>
---
  arch/s390/include/asm/gmap.h |  3 ++-
  arch/s390/kvm/gmap.c         |  4 ++--
  arch/s390/mm/gmap.c          | 13 +++++++++++--
  3 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index 4e73ef46d4b2a..0efa087778135 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -139,7 +139,8 @@ int s390_replace_asce(struct gmap *gmap);
  void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns);
  int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
  			    unsigned long end, bool interruptible);
-int kvm_s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio, bool split);
+int kvm_s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio,
+		struct page *page, bool split);
  unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level);
  
  /**
diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c
index 02adf151d4de4..c2523c63afea3 100644
--- a/arch/s390/kvm/gmap.c
+++ b/arch/s390/kvm/gmap.c
@@ -72,7 +72,7 @@ static int __gmap_make_secure(struct gmap *gmap, struct page *page, void *uvcb)
  		return -EFAULT;
  	if (folio_test_large(folio)) {
  		mmap_read_unlock(gmap->mm);
-		rc = kvm_s390_wiggle_split_folio(gmap->mm, folio, true);
+		rc = kvm_s390_wiggle_split_folio(gmap->mm, folio, page, true);
  		mmap_read_lock(gmap->mm);
  		if (rc)
  			return rc;
@@ -100,7 +100,7 @@ static int __gmap_make_secure(struct gmap *gmap, struct page *page, void *uvcb)
  	/* The folio has too many references, try to shake some off */
  	if (rc == -EBUSY) {
  		mmap_read_unlock(gmap->mm);
-		kvm_s390_wiggle_split_folio(gmap->mm, folio, false);
+		kvm_s390_wiggle_split_folio(gmap->mm, folio, page, false);
  		mmap_read_lock(gmap->mm);
  		return -EAGAIN;
  	}
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 94d9277858009..3180ad90a255a 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2631,12 +2631,18 @@ EXPORT_SYMBOL_GPL(s390_replace_asce);
   * kvm_s390_wiggle_split_folio() - try to drain extra references to a folio and optionally split
   * @mm:    the mm containing the folio to work on
   * @folio: the folio
+ * @page:  the folio page where to split the folio
   * @split: whether to split a large folio
   *
+ * If a split of a large folio was requested, the original provided folio must
+ * no longer be used if this function returns 0. The new folio must be looked
+ * up using page_folio(), to which we will then hold a reference.
+ *
   * Context: Must be called while holding an extra reference to the folio;
   *          the mm lock should not be held.
   */
-int kvm_s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio, bool split)
+int kvm_s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio,
+		struct page *page, bool split)
  {
  	int rc;
  
@@ -2645,7 +2651,10 @@ int kvm_s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio, bool
  	lru_add_drain_all();
  	if (split) {
  		folio_lock(folio);
-		rc = split_folio(folio);
+		/* Careful: split_folio() would be wrong. */
+		rc = split_huge_page(page);
+		if (!rc)
+			folio = page_folio(page);
  		folio_unlock(folio);
  
  		if (rc != -EBUSY)
Claudio Imbrenda Feb. 12, 2025, 5:45 p.m. UTC | #2
On Wed, 12 Feb 2025 17:55:18 +0100
David Hildenbrand <david@redhat.com> wrote:

> On 31.01.25 12:24, Claudio Imbrenda wrote:
> > Move gmap related functions from kernel/uv into kvm.
> > 
> > Create a new file to collect gmap-related functions.
> > 
> > Reviewed-by: Janosch Frank <frankja@linux.ibm.com>
> > Reviewed-by: Christoph Schlameuss <schlameuss@linux.ibm.com>
> > [fixed unpack_one(), thanks mhartmay@linux.ibm.com]
> > Link: https://lore.kernel.org/r/20250123144627.312456-6-imbrenda@linux.ibm.com
> > Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
> > Message-ID: <20250123144627.312456-6-imbrenda@linux.ibm.com>
> > ---  
> 
> This patch breaks large folio splitting because you end up un-refing
> the wrong folios after a split; I tried to make it work, but either
> because of other changes in this patch (or in others), I
> cannot get it to work and have to give up for today.

yes, I had also noticed that and I already have a fix ready. In fact my
fix was exactly like yours, except that I no longer pass the struct folio
to kvm_s390_wiggle_split_folio(); instead I only pass a
page and use page_folio() at the beginning, and I use
split_huge_page_to_list_to_order() directly instead of split_folio().
 
unfortunately the fix does not fix the issue I'm seeing....

but putting printks everywhere seems to solve the issue, so it seems to
be a race somewhere
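
For reference, a rough sketch of what that variant might look like (hypothetical:
it only combines the description above with the body of David's fix from the
previous message, and is not the patch Claudio actually has queued):

int kvm_s390_wiggle_split_folio(struct mm_struct *mm, struct page *page, bool split)
{
	/* Derive the folio from the page right at the beginning. */
	struct folio *folio = page_folio(page);
	int rc;

	lockdep_assert_not_held(&mm->mmap_lock);
	folio_wait_writeback(folio);
	lru_add_drain_all();
	if (split) {
		folio_lock(folio);
		/* Same as split_huge_page(page), just called directly. */
		rc = split_huge_page_to_list_to_order(page, NULL, 0);
		/*
		 * After a successful split, the caller's lock and reference
		 * belong to the folio containing @page, hence re-derive it
		 * before unlocking (as in David's fix).
		 */
		if (!rc)
			folio = page_folio(page);
		folio_unlock(folio);

		if (rc != -EBUSY)
			return rc;
	}
	return -EAGAIN;
}

Callers such as __gmap_make_secure() would then only pass the page, e.g.
kvm_s390_wiggle_split_folio(gmap->mm, page, true).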

> 
> Running a simple VM backed by memory-backend-memfd
> (which now uses large folios) no longer works (random refcount underflows /
> freeing of wrong pages).
> 
> 
> 
> The following should be required (but according to my testing insufficient):
> 
>  From 71fafff5183c637f20830f6f346e8c9f3eafeb59 Mon Sep 17 00:00:00 2001
> From: David Hildenbrand <david@redhat.com>
> Date: Wed, 12 Feb 2025 16:00:32 +0100
> Subject: [PATCH] KVM: s390: fix splitting of large folios
> 
> If we end up splitting the large folio, doing a put_page() will drop the
> wrong reference (unless it was the head page), because we are holding a
> reference to the old (large) folio. Similarly, doing another
> page_folio() after the split is wrong.
> 
> The result is that we end up freeing a page that is still mapped+used.
> 
> To fix it, let's pass the page and call split_huge_page() instead.
> 
> As an alternative, we could convert all code to use folios, and to
> look up the page again from the page table after our split; however, in
> context of non-uniform folio splits [1], it make sense to pass the page
> where we really want to split.
> 
> [1] https://lkml.kernel.org/r/20250211155034.268962-1-ziy@nvidia.com
> 
> Fixes: 5cbe24350b7d ("KVM: s390: move pv gmap functions into kvm")
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>   arch/s390/include/asm/gmap.h |  3 ++-
>   arch/s390/kvm/gmap.c         |  4 ++--
>   arch/s390/mm/gmap.c          | 13 +++++++++++--
>   3 files changed, 15 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
> index 4e73ef46d4b2a..0efa087778135 100644
> --- a/arch/s390/include/asm/gmap.h
> +++ b/arch/s390/include/asm/gmap.h
> @@ -139,7 +139,8 @@ int s390_replace_asce(struct gmap *gmap);
>   void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns);
>   int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
>   			    unsigned long end, bool interruptible);
> -int kvm_s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio, bool split);
> +int kvm_s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio,
> +		struct page *page, bool split);
>   unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level);
>   
>   /**
> diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c
> index 02adf151d4de4..c2523c63afea3 100644
> --- a/arch/s390/kvm/gmap.c
> +++ b/arch/s390/kvm/gmap.c
> @@ -72,7 +72,7 @@ static int __gmap_make_secure(struct gmap *gmap, struct page *page, void *uvcb)
>   		return -EFAULT;
>   	if (folio_test_large(folio)) {
>   		mmap_read_unlock(gmap->mm);
> -		rc = kvm_s390_wiggle_split_folio(gmap->mm, folio, true);
> +		rc = kvm_s390_wiggle_split_folio(gmap->mm, folio, page, true);
>   		mmap_read_lock(gmap->mm);
>   		if (rc)
>   			return rc;
> @@ -100,7 +100,7 @@ static int __gmap_make_secure(struct gmap *gmap, struct page *page, void *uvcb)
>   	/* The folio has too many references, try to shake some off */
>   	if (rc == -EBUSY) {
>   		mmap_read_unlock(gmap->mm);
> -		kvm_s390_wiggle_split_folio(gmap->mm, folio, false);
> +		kvm_s390_wiggle_split_folio(gmap->mm, folio, page, false);
>   		mmap_read_lock(gmap->mm);
>   		return -EAGAIN;
>   	}
> diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
> index 94d9277858009..3180ad90a255a 100644
> --- a/arch/s390/mm/gmap.c
> +++ b/arch/s390/mm/gmap.c
> @@ -2631,12 +2631,18 @@ EXPORT_SYMBOL_GPL(s390_replace_asce);
>    * kvm_s390_wiggle_split_folio() - try to drain extra references to a folio and optionally split
>    * @mm:    the mm containing the folio to work on
>    * @folio: the folio
> + * @page:  the folio page where to split the folio
>    * @split: whether to split a large folio
>    *
> + * If a split of a large folio was requested, the original provided folio must
> + * no longer be used if this function returns 0. The new folio must be looked
> + * up using page_folio(), to which we will then hold a reference.
> + *
>    * Context: Must be called while holding an extra reference to the folio;
>    *          the mm lock should not be held.
>    */
> -int kvm_s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio, bool split)
> +int kvm_s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio,
> +		struct page *page, bool split)
>   {
>   	int rc;
>   
> @@ -2645,7 +2651,10 @@ int kvm_s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio, bool
>   	lru_add_drain_all();
>   	if (split) {
>   		folio_lock(folio);
> -		rc = split_folio(folio);
> +		/* Careful: split_folio() would be wrong. */
> +		rc = split_huge_page(page);
> +		if (!rc)
> +			folio = page_folio(page);
>   		folio_unlock(folio);
>   
>   		if (rc != -EBUSY)
David Hildenbrand Feb. 12, 2025, 6:14 p.m. UTC | #3
On 12.02.25 18:45, Claudio Imbrenda wrote:
> On Wed, 12 Feb 2025 17:55:18 +0100
> David Hildenbrand <david@redhat.com> wrote:
> 
>> On 31.01.25 12:24, Claudio Imbrenda wrote:
>>> Move gmap related functions from kernel/uv into kvm.
>>>
>>> Create a new file to collect gmap-related functions.
>>>
>>> Reviewed-by: Janosch Frank <frankja@linux.ibm.com>
>>> Reviewed-by: Christoph Schlameuss <schlameuss@linux.ibm.com>
>>> [fixed unpack_one(), thanks mhartmay@linux.ibm.com]
>>> Link: https://lore.kernel.org/r/20250123144627.312456-6-imbrenda@linux.ibm.com
>>> Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
>>> Message-ID: <20250123144627.312456-6-imbrenda@linux.ibm.com>
>>> ---
>>
>> This patch breaks large folio splitting because you end up un-refing
>> the wrong folios after a split; I tried to make it work, but either
>> because of other changes in this patch (or in others), I
>> cannot get it to work and have to give up for today.
> 
> yes, I had also noticed that and I already have a fix ready. In fact my
> fix was exactly like yours, except that I did not pass the struct folio
> anymore to kvm_s390_wiggle_split_folio(), but instead I only pass a
> page and use page_folio() at the beginning, and I use
> split_huge_page_to_list_to_order() directly instead of split_folio()
>   
> unfortunately the fix does not fix the issue I'm seeing....
> 
> but putting printks everywhere seems to solve the issue, so it seems to
> be a race somewhere

It also doesn't work with a single vCPU for me. The VM is stuck in

With two vCPUs (so one can report the lockup), I get:

[   62.645168] rcu: INFO: rcu_sched self-detected stall on CPU
[   62.645181] rcu:     0-....: (5999 ticks this GP) idle=0104/1/0x4000000000000002 softirq=2/2 fqs=2997
[   62.645186] rcu:     (t=6000 jiffies g=-1199 q=62 ncpus=2)
[   62.645191] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.14.0-427.33.1.el9_4.s390x #1
[   62.645194] Hardware name: IBM 3931 LA1 400 (KVM/Linux)
[   62.645195] Krnl PSW : 0704c00180000000 0000000024b3e776 (set_memory_decrypted+0x66/0xa0)
[   62.645206]            R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
[   62.645208] Krnl GPRS: 00000000ca004000 0000037f00000001 000000008092f000 0000000000000000
[   62.645210]            0000037fffb1bbc0 0000000000000001 0000000025e75208 000000008092f000
[   62.645211]            0000000080873808 0000037fffb1bcd8 0000000000001000 0000000025e75220
[   62.645213]            0000000080281500 00000000258aa480 0000000024c0b17a 0000037fffb1bb20
[   62.645220] Krnl Code: 0000000024b3e76a: a784000f            brc     8,0000000024b3e788
[   62.645220]            0000000024b3e76e: a7210fff            tmll    %r2,4095
[   62.645220]           #0000000024b3e772: a7740017            brc     7,0000000024b3e7a0
[   62.645220]           >0000000024b3e776: b9a40034            uvc     %r3,%r4,0
[   62.645220]            0000000024b3e77a: b2220010            ipm     %r1
[   62.645220]            0000000024b3e77e: 8810001c            srl     %r1,28
[   62.645220]            0000000024b3e782: ec12fffa017e        cij     %r1,1,2,0000000024b3e776
[   62.645220]            0000000024b3e788: a72b1000            aghi    %r2,4096
[   62.645232] Call Trace:
[   62.645234]  [<0000000024b3e776>] set_memory_decrypted+0x66/0xa0
[   62.645238]  [<0000000024c0b17a>] dma_direct_alloc+0x16a/0x2d0
[   62.645242]  [<0000000024c09b92>] dma_alloc_attrs+0x62/0x80
[   62.645243]  [<000000002546c950>] cio_gp_dma_create+0x60/0xa0
[   62.645248]  [<0000000025ebb712>] css_bus_init+0x102/0x1b8
[   62.645252]  [<0000000025ebb7ea>] channel_subsystem_init+0x22/0xf8
[   62.645254]  [<0000000024b149ac>] do_one_initcall+0x3c/0x200
[   62.645256]  [<0000000025e777be>] do_initcalls+0x11e/0x148
[   62.645260]  [<0000000025e77a34>] kernel_init_freeable+0x1cc/0x208
[   62.645262]  [<00000000254ad01e>] kernel_init+0x2e/0x170
[   62.645264]  [<0000000024b16fdc>] __ret_from_fork+0x3c/0x60
[   62.645266]  [<00000000254bb07a>] ret_from_fork+0xa/0x40


The removed PTE lock would only explain it if we had a concurrent GUP etc.
from QEMU I/O? Not sure.

To fix the wrong refcount freezing, doing exactly what folio splitting does
(migration PTEs, locking the pagecache etc., freezing->converting,
removing migration ptes) should work, but requires a bit of work.
David Hildenbrand Feb. 13, 2025, 10:02 a.m. UTC | #4
On 12.02.25 19:14, David Hildenbrand wrote:
> On 12.02.25 18:45, Claudio Imbrenda wrote:
>> On Wed, 12 Feb 2025 17:55:18 +0100
>> David Hildenbrand <david@redhat.com> wrote:
>>
>>> On 31.01.25 12:24, Claudio Imbrenda wrote:
>>>> Move gmap related functions from kernel/uv into kvm.
>>>>
>>>> Create a new file to collect gmap-related functions.
>>>>
>>>> Reviewed-by: Janosch Frank <frankja@linux.ibm.com>
>>>> Reviewed-by: Christoph Schlameuss <schlameuss@linux.ibm.com>
>>>> [fixed unpack_one(), thanks mhartmay@linux.ibm.com]
>>>> Link: https://lore.kernel.org/r/20250123144627.312456-6-imbrenda@linux.ibm.com
>>>> Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
>>>> Message-ID: <20250123144627.312456-6-imbrenda@linux.ibm.com>
>>>> ---
>>>
>>> This patch breaks large folio splitting because you end up un-refing
>>> the wrong folios after a split; I tried to make it work, but either
>>> because of other changes in this patch (or in others), I
>>> cannot get it to work and have to give up for today.
>>
>> yes, I had also noticed that and I already have a fix ready. In fact my
>> fix was exactly like yours, except that I did not pass the struct folio
>> anymore to kvm_s390_wiggle_split_folio(), but instead I only pass a
>> page and use page_folio() at the beginning, and I use
>> split_huge_page_to_list_to_order() directly instead of split_folio()
>>    
>> unfortunately the fix does not fix the issue I'm seeing....
>>
>> but putting printks everywhere seems to solve the issue, so it seems to
>> be a race somewhere
> 
> It also doesn't work with a single vCPU for me. The VM is stuck in
> 
> With a two vCPUs (so one can report the lockup), I get:
> 
> [   62.645168] rcu: INFO: rcu_sched self-detected stall on CPU
> [   62.645181] rcu:     0-....: (5999 ticks this GP) idle=0104/1/0x4000000000000002 softirq=2/2 fqs=2997
> [   62.645186] rcu:     (t=6000 jiffies g=-1199 q=62 ncpus=2)
> [   62.645191] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.14.0-427.33.1.el9_4.s390x #1
> [   62.645194] Hardware name: IBM 3931 LA1 400 (KVM/Linux)
> [   62.645195] Krnl PSW : 0704c00180000000 0000000024b3e776 (set_memory_decrypted+0x66/0xa0)
> [   62.645206]            R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
> [   62.645208] Krnl GPRS: 00000000ca004000 0000037f00000001 000000008092f000 0000000000000000
> [   62.645210]            0000037fffb1bbc0 0000000000000001 0000000025e75208 000000008092f000
> [   62.645211]            0000000080873808 0000037fffb1bcd8 0000000000001000 0000000025e75220
> [   62.645213]            0000000080281500 00000000258aa480 0000000024c0b17a 0000037fffb1bb20
> [   62.645220] Krnl Code: 0000000024b3e76a: a784000f            brc     8,0000000024b3e788
> [   62.645220]            0000000024b3e76e: a7210fff            tmll    %r2,4095
> [   62.645220]           #0000000024b3e772: a7740017            brc     7,0000000024b3e7a0
> [   62.645220]           >0000000024b3e776: b9a40034            uvc     %r3,%r4,0
> [   62.645220]            0000000024b3e77a: b2220010            ipm     %r1
> [   62.645220]            0000000024b3e77e: 8810001c            srl     %r1,28
> [   62.645220]            0000000024b3e782: ec12fffa017e        cij     %r1,1,2,0000000024b3e776
> [   62.645220]            0000000024b3e788: a72b1000            aghi    %r2,4096
> [   62.645232] Call Trace:
> [   62.645234]  [<0000000024b3e776>] set_memory_decrypted+0x66/0xa0
> [   62.645238]  [<0000000024c0b17a>] dma_direct_alloc+0x16a/0x2d0
> [   62.645242]  [<0000000024c09b92>] dma_alloc_attrs+0x62/0x80
> [   62.645243]  [<000000002546c950>] cio_gp_dma_create+0x60/0xa0
> [   62.645248]  [<0000000025ebb712>] css_bus_init+0x102/0x1b8
> [   62.645252]  [<0000000025ebb7ea>] channel_subsystem_init+0x22/0xf8
> [   62.645254]  [<0000000024b149ac>] do_one_initcall+0x3c/0x200
> [   62.645256]  [<0000000025e777be>] do_initcalls+0x11e/0x148
> [   62.645260]  [<0000000025e77a34>] kernel_init_freeable+0x1cc/0x208
> [   62.645262]  [<00000000254ad01e>] kernel_init+0x2e/0x170
> [   62.645264]  [<0000000024b16fdc>] __ret_from_fork+0x3c/0x60
> [   62.645266]  [<00000000254bb07a>] ret_from_fork+0xa/0x40
> 

I can only suspect that it is related to the following: if we split a non-anon
folio, we unmap it from the page tables, and don't remap it again -- the next
fault will do that. Maybe, for some reason that behavior is incompatible with your changes.

I don't quite see how, because we should just trigger another fault to look up
the page in gmap_make_secure()->gfn_to_page() when we re-enter gmap_make_secure() after a split.

> 
> The removed PTE lock would only explain it if we would have a concurrent GUP etc.
> from QEMU I/O ? Not sure.
> 
> To fix the wrong refcount freezing, doing exactly what folio splitting does
> (migration PTEs, locking the pagecache etc., freezing->converting,
> removing migration ptes) should work, but requires a bit of work.

I played with the following abomination to see if I could fix the refcount freezing somehow.

It doesn't work, because the UVC keeps failing: I assume because it actually
needs the page to be mapped into that particular page table for the UVC to complete.


To fix refcount freezing with that (folio still mapped), we'd have to make sure that
folio_mapcount()==1 while we hold the PTL, and do something similar to what is done below,
except that the rmap/anon locking and unmap/remap handling would not apply. The
pagecache most likely would have to be locked to prevent new references from it while
we freeze the refcount.

In case we would have folio_mapcount() != 1 on an anon page, we would have to give up:
but that is impossible if it is mapped writable -- so no problem.

In case we would have folio_mapcount() != 1 on a pagecache page, we would have to
force an unmap of all the page table mappings using e.g. try_to_unmap(), and then
retry.

But the PTL seems unavoidable in that case to prevent concurrent GUP-slow etc, so we
can safely freeze the refcount.
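
A hypothetical sketch of that keep-the-folio-mapped idea (not something posted in
this thread; the refcount handling is copied from make_folio_secure() in the patches
below, and the pagecache locking mentioned above is only noted, not implemented --
David's actual experiment, which instead unmaps the folio first, follows right after):

/*
 * Sketch only: keep the folio mapped, hold the PTL so GUP-slow cannot take
 * new references, and only freeze the refcount if the folio is mapped
 * exactly once. A pagecache folio would additionally need its mapping
 * locked here before freezing.
 */
static int make_mapped_folio_secure(struct folio *folio, spinlock_t *ptl,
				    struct uv_cb_header *uvcb)
{
	int expected, cc;

	lockdep_assert_held(ptl);
	if (folio_mapcount(folio) != 1)
		return -EBUSY;

	/* One extra reference held by the caller, as in make_folio_secure(). */
	expected = expected_folio_refs(folio) + 1;
	if (!folio_ref_freeze(folio, expected))
		return -EBUSY;
	set_bit(PG_arch_1, &folio->flags);
	cc = __uv_call(0, (u64)uvcb);
	folio_ref_unfreeze(folio, expected);

	if (cc == UVC_CC_OK)
		return 0;
	if (cc == UVC_CC_BUSY || cc == UVC_CC_PARTIAL)
		return -EAGAIN;
	return uvcb->rc == 0x10a ? -ENXIO : -EINVAL;
}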


 From c2555fc34801ca9ba49f93ee1249ecd25248377a Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 13 Feb 2025 09:49:54 +0100
Subject: [PATCH] tmp

Signed-off-by: David Hildenbrand <david@redhat.com>
---
  arch/s390/kernel/uv.c | 139 +++++++++++++++++++++++++++++++++++-------
  include/linux/rmap.h  |  17 ++++++
  mm/internal.h         |  16 -----
  3 files changed, 133 insertions(+), 39 deletions(-)

diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
index 9f05df2da2f73..d6ea8951fa53b 100644
--- a/arch/s390/kernel/uv.c
+++ b/arch/s390/kernel/uv.c
@@ -15,6 +15,7 @@
  #include <linux/pagemap.h>
  #include <linux/swap.h>
  #include <linux/pagewalk.h>
+#include <linux/rmap.h>
  #include <asm/facility.h>
  #include <asm/sections.h>
  #include <asm/uv.h>
@@ -227,6 +228,45 @@ static int expected_folio_refs(struct folio *folio)
  	return res;
  }
  
+static void unmap_folio(struct folio *folio)
+{
+	enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC |
+		TTU_BATCH_FLUSH;
+
+	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
+
+	if (folio_test_pmd_mappable(folio))
+		ttu_flags |= TTU_SPLIT_HUGE_PMD;
+
+	/*
+	 * Anon pages need migration entries to preserve them, but file
+	 * pages can simply be left unmapped, then faulted back on demand.
+	 * If that is ever changed (perhaps for mlock), update remap_page().
+	 */
+	if (folio_test_anon(folio))
+		try_to_migrate(folio, ttu_flags);
+	else
+		try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
+
+	try_to_unmap_flush();
+}
+
+static void remap_page(struct folio *folio, unsigned long nr, int flags)
+{
+	int i = 0;
+
+	/* If unmap_folio() uses try_to_migrate() on file, remove this check */
+	if (!folio_test_anon(folio))
+		return;
+	for (;;) {
+		remove_migration_ptes(folio, folio, RMP_LOCKED | flags);
+		i += folio_nr_pages(folio);
+		if (i >= nr)
+			break;
+		folio = folio_next(folio);
+	}
+}
+
  /**
   * make_folio_secure() - make a folio secure
   * @folio: the folio to make secure
@@ -247,35 +287,88 @@ static int expected_folio_refs(struct folio *folio)
   */
  int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb)
  {
-	int expected, cc = 0;
+	XA_STATE(xas, &folio->mapping->i_pages, folio->index);
+	struct address_space *mapping = NULL;
+	struct anon_vma *anon_vma = NULL;
+	int ret, cc = 0;
+	int expected;
  
  	if (folio_test_large(folio))
  		return -E2BIG;
  	if (folio_test_writeback(folio))
  		return -EBUSY;
-	expected = expected_folio_refs(folio) + 1;
-	if (!folio_ref_freeze(folio, expected))
+
+	/* Does it make sense to try at all? */
+	if (folio_ref_count(folio) != expected_folio_refs(folio) + 1)
  		return -EBUSY;
-	set_bit(PG_arch_1, &folio->flags);
-	/*
-	 * If the UVC does not succeed or fail immediately, we don't want to
-	 * loop for long, or we might get stall notifications.
-	 * On the other hand, this is a complex scenario and we are holding a lot of
-	 * locks, so we can't easily sleep and reschedule. We try only once,
-	 * and if the UVC returned busy or partial completion, we return
-	 * -EAGAIN and we let the callers deal with it.
-	 */
-	cc = __uv_call(0, (u64)uvcb);
-	folio_ref_unfreeze(folio, expected);
-	/*
-	 * Return -ENXIO if the folio was not mapped, -EINVAL for other errors.
-	 * If busy or partially completed, return -EAGAIN.
-	 */
-	if (cc == UVC_CC_OK)
-		return 0;
-	else if (cc == UVC_CC_BUSY || cc == UVC_CC_PARTIAL)
-		return -EAGAIN;
-	return uvcb->rc == 0x10a ? -ENXIO : -EINVAL;
+
+	/* See split_huge_page_to_list_to_order() on the nasty details. */
+	if (folio_test_anon(folio)) {
+		anon_vma = folio_get_anon_vma(folio);
+		if (!anon_vma)
+			return -EBUSY;
+		anon_vma_lock_write(anon_vma);
+	} else {
+		mapping = folio->mapping;
+		if (!mapping)
+			return -EBUSY;
+		/* Hmmm, do we need filemap_release_folio()? */
+		i_mmap_lock_read(mapping);
+	}
+
+	unmap_folio(folio);
+
+	local_irq_disable();
+	if (mapping) {
+		xas_lock(&xas);
+		xas_reset(&xas);
+		if (xas_load(&xas) != folio) {
+			ret = -EBUSY;
+			goto fail;
+		}
+	}
+
+	expected = expected_folio_refs(folio) + 1;
+	if (!folio_mapped(folio) &&
+	    folio_ref_freeze(folio, expected)) {
+		set_bit(PG_arch_1, &folio->flags);
+		/*
+		 * If the UVC does not succeed or fail immediately, we don't want to
+		 * loop for long, or we might get stall notifications.
+		 * On the other hand, this is a complex scenario and we are holding a lot of
+		 * locks, so we can't easily sleep and reschedule. We try only once,
+		 * and if the UVC returned busy or partial completion, we return
+		 * -EAGAIN and we let the callers deal with it.
+		 */
+		cc = __uv_call(0, (u64)uvcb);
+		folio_ref_unfreeze(folio, expected);
+		/*
+		 * Return -ENXIO if the folio was not mapped, -EINVAL for other errors.
+		 * If busy or partially completed, return -EAGAIN.
+		 */
+		if (cc == UVC_CC_OK)
+			ret = 0;
+		else if (cc == UVC_CC_BUSY || cc == UVC_CC_PARTIAL)
+			ret = -EAGAIN;
+		else
+			ret = uvcb->rc == 0x10a ? -ENXIO : -EINVAL;
+	} else {
+		ret = -EBUSY;
+	}
+
+	if (mapping)
+		xas_unlock(&xas);
+fail:
+	local_irq_enable();
+	remap_page(folio, 1, 0);
+	if (anon_vma) {
+		anon_vma_unlock_write(anon_vma);
+		put_anon_vma(anon_vma);
+	}
+	if (mapping)
+		i_mmap_unlock_read(mapping);
+	xas_destroy(&xas);
+	return ret;
  }
  EXPORT_SYMBOL_GPL(make_folio_secure);
  
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 683a04088f3f2..2d241ab48bf08 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -663,6 +663,23 @@ int folio_referenced(struct folio *, int is_locked,
  void try_to_migrate(struct folio *folio, enum ttu_flags flags);
  void try_to_unmap(struct folio *, enum ttu_flags flags);
  
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+void try_to_unmap_flush(void);
+void try_to_unmap_flush_dirty(void);
+void flush_tlb_batched_pending(struct mm_struct *mm);
+#else
+static inline void try_to_unmap_flush(void)
+{
+}
+static inline void try_to_unmap_flush_dirty(void)
+{
+}
+static inline void flush_tlb_batched_pending(struct mm_struct *mm)
+{
+}
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
+
  int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
  				unsigned long end, struct page **pages,
  				void *arg);
diff --git a/mm/internal.h b/mm/internal.h
index 109ef30fee11f..5338906163ca7 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1202,22 +1202,6 @@ struct tlbflush_unmap_batch;
   */
  extern struct workqueue_struct *mm_percpu_wq;
  
-#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
-void try_to_unmap_flush(void);
-void try_to_unmap_flush_dirty(void);
-void flush_tlb_batched_pending(struct mm_struct *mm);
-#else
-static inline void try_to_unmap_flush(void)
-{
-}
-static inline void try_to_unmap_flush_dirty(void)
-{
-}
-static inline void flush_tlb_batched_pending(struct mm_struct *mm)
-{
-}
-#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
-
  extern const struct trace_print_flags pageflag_names[];
  extern const struct trace_print_flags vmaflag_names[];
  extern const struct trace_print_flags gfpflag_names[];

Patch

diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index 13f51a6a5bb1..3e66f53fe3cc 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -149,6 +149,7 @@  int s390_replace_asce(struct gmap *gmap);
 void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns);
 int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
 			    unsigned long end, bool interruptible);
+int kvm_s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio, bool split);
 
 /**
  * s390_uv_destroy_range - Destroy a range of pages in the given mm.
diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h
index dc332609f2c3..b11f5b6d0bd1 100644
--- a/arch/s390/include/asm/uv.h
+++ b/arch/s390/include/asm/uv.h
@@ -628,12 +628,12 @@  static inline int is_prot_virt_host(void)
 }
 
 int uv_pin_shared(unsigned long paddr);
-int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb);
-int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr);
 int uv_destroy_folio(struct folio *folio);
 int uv_destroy_pte(pte_t pte);
 int uv_convert_from_secure_pte(pte_t pte);
-int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr);
+int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb);
+int uv_convert_from_secure(unsigned long paddr);
+int uv_convert_from_secure_folio(struct folio *folio);
 
 void setup_uv(void);
 
diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
index 6f9654a191ad..9f05df2da2f7 100644
--- a/arch/s390/kernel/uv.c
+++ b/arch/s390/kernel/uv.c
@@ -19,19 +19,6 @@ 
 #include <asm/sections.h>
 #include <asm/uv.h>
 
-#if !IS_ENABLED(CONFIG_KVM)
-unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
-{
-	return 0;
-}
-
-int gmap_fault(struct gmap *gmap, unsigned long gaddr,
-	       unsigned int fault_flags)
-{
-	return 0;
-}
-#endif
-
 /* the bootdata_preserved fields come from ones in arch/s390/boot/uv.c */
 int __bootdata_preserved(prot_virt_guest);
 EXPORT_SYMBOL(prot_virt_guest);
@@ -159,6 +146,7 @@  int uv_destroy_folio(struct folio *folio)
 	folio_put(folio);
 	return rc;
 }
+EXPORT_SYMBOL(uv_destroy_folio);
 
 /*
  * The present PTE still indirectly holds a folio reference through the mapping.
@@ -175,7 +163,7 @@  int uv_destroy_pte(pte_t pte)
  *
  * @paddr: Absolute host address of page to be exported
  */
-static int uv_convert_from_secure(unsigned long paddr)
+int uv_convert_from_secure(unsigned long paddr)
 {
 	struct uv_cb_cfs uvcb = {
 		.header.cmd = UVC_CMD_CONV_FROM_SEC_STOR,
@@ -187,11 +175,12 @@  static int uv_convert_from_secure(unsigned long paddr)
 		return -EINVAL;
 	return 0;
 }
+EXPORT_SYMBOL_GPL(uv_convert_from_secure);
 
 /*
  * The caller must already hold a reference to the folio.
  */
-static int uv_convert_from_secure_folio(struct folio *folio)
+int uv_convert_from_secure_folio(struct folio *folio)
 {
 	int rc;
 
@@ -206,6 +195,7 @@  static int uv_convert_from_secure_folio(struct folio *folio)
 	folio_put(folio);
 	return rc;
 }
+EXPORT_SYMBOL_GPL(uv_convert_from_secure_folio);
 
 /*
  * The present PTE still indirectly holds a folio reference through the mapping.
@@ -237,13 +227,33 @@  static int expected_folio_refs(struct folio *folio)
 	return res;
 }
 
-static int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb)
+/**
+ * make_folio_secure() - make a folio secure
+ * @folio: the folio to make secure
+ * @uvcb: the uvcb that describes the UVC to be used
+ *
+ * The folio @folio will be made secure if possible, @uvcb will be passed
+ * as-is to the UVC.
+ *
+ * Return: 0 on success;
+ *         -EBUSY if the folio is in writeback or has too many references;
+ *         -E2BIG if the folio is large;
+ *         -EAGAIN if the UVC needs to be attempted again;
+ *         -ENXIO if the address is not mapped;
+ *         -EINVAL if the UVC failed for other reasons.
+ *
+ * Context: The caller must hold exactly one extra reference on the folio
+ *          (it's the same logic as split_folio())
+ */
+int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb)
 {
 	int expected, cc = 0;
 
+	if (folio_test_large(folio))
+		return -E2BIG;
 	if (folio_test_writeback(folio))
-		return -EAGAIN;
-	expected = expected_folio_refs(folio);
+		return -EBUSY;
+	expected = expected_folio_refs(folio) + 1;
 	if (!folio_ref_freeze(folio, expected))
 		return -EBUSY;
 	set_bit(PG_arch_1, &folio->flags);
@@ -267,251 +277,7 @@  static int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb)
 		return -EAGAIN;
 	return uvcb->rc == 0x10a ? -ENXIO : -EINVAL;
 }
-
-/**
- * should_export_before_import - Determine whether an export is needed
- * before an import-like operation
- * @uvcb: the Ultravisor control block of the UVC to be performed
- * @mm: the mm of the process
- *
- * Returns whether an export is needed before every import-like operation.
- * This is needed for shared pages, which don't trigger a secure storage
- * exception when accessed from a different guest.
- *
- * Although considered as one, the Unpin Page UVC is not an actual import,
- * so it is not affected.
- *
- * No export is needed also when there is only one protected VM, because the
- * page cannot belong to the wrong VM in that case (there is no "other VM"
- * it can belong to).
- *
- * Return: true if an export is needed before every import, otherwise false.
- */
-static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm)
-{
-	/*
-	 * The misc feature indicates, among other things, that importing a
-	 * shared page from a different protected VM will automatically also
-	 * transfer its ownership.
-	 */
-	if (uv_has_feature(BIT_UV_FEAT_MISC))
-		return false;
-	if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED)
-		return false;
-	return atomic_read(&mm->context.protected_count) > 1;
-}
-
-/*
- * Drain LRU caches: the local one on first invocation and the ones of all
- * CPUs on successive invocations. Returns "true" on the first invocation.
- */
-static bool drain_lru(bool *drain_lru_called)
-{
-	/*
-	 * If we have tried a local drain and the folio refcount
-	 * still does not match our expected safe value, try with a
-	 * system wide drain. This is needed if the pagevecs holding
-	 * the page are on a different CPU.
-	 */
-	if (*drain_lru_called) {
-		lru_add_drain_all();
-		/* We give up here, don't retry immediately. */
-		return false;
-	}
-	/*
-	 * We are here if the folio refcount does not match the
-	 * expected safe value. The main culprits are usually
-	 * pagevecs. With lru_add_drain() we drain the pagevecs
-	 * on the local CPU so that hopefully the refcount will
-	 * reach the expected safe value.
-	 */
-	lru_add_drain();
-	*drain_lru_called = true;
-	/* The caller should try again immediately */
-	return true;
-}
-
-/*
- * Requests the Ultravisor to make a page accessible to a guest.
- * If it's brought in the first time, it will be cleared. If
- * it has been exported before, it will be decrypted and integrity
- * checked.
- */
-int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
-{
-	struct vm_area_struct *vma;
-	bool drain_lru_called = false;
-	spinlock_t *ptelock;
-	unsigned long uaddr;
-	struct folio *folio;
-	pte_t *ptep;
-	int rc;
-
-again:
-	rc = -EFAULT;
-	mmap_read_lock(gmap->mm);
-
-	uaddr = __gmap_translate(gmap, gaddr);
-	if (IS_ERR_VALUE(uaddr))
-		goto out;
-	vma = vma_lookup(gmap->mm, uaddr);
-	if (!vma)
-		goto out;
-	/*
-	 * Secure pages cannot be huge and userspace should not combine both.
-	 * In case userspace does it anyway this will result in an -EFAULT for
-	 * the unpack. The guest is thus never reaching secure mode. If
-	 * userspace is playing dirty tricky with mapping huge pages later
-	 * on this will result in a segmentation fault.
-	 */
-	if (is_vm_hugetlb_page(vma))
-		goto out;
-
-	rc = -ENXIO;
-	ptep = get_locked_pte(gmap->mm, uaddr, &ptelock);
-	if (!ptep)
-		goto out;
-	if (pte_present(*ptep) && !(pte_val(*ptep) & _PAGE_INVALID) && pte_write(*ptep)) {
-		folio = page_folio(pte_page(*ptep));
-		rc = -EAGAIN;
-		if (folio_test_large(folio)) {
-			rc = -E2BIG;
-		} else if (folio_trylock(folio)) {
-			if (should_export_before_import(uvcb, gmap->mm))
-				uv_convert_from_secure(PFN_PHYS(folio_pfn(folio)));
-			rc = make_folio_secure(folio, uvcb);
-			folio_unlock(folio);
-		}
-
-		/*
-		 * Once we drop the PTL, the folio may get unmapped and
-		 * freed immediately. We need a temporary reference.
-		 */
-		if (rc == -EAGAIN || rc == -E2BIG)
-			folio_get(folio);
-	}
-	pte_unmap_unlock(ptep, ptelock);
-out:
-	mmap_read_unlock(gmap->mm);
-
-	switch (rc) {
-	case -E2BIG:
-		folio_lock(folio);
-		rc = split_folio(folio);
-		folio_unlock(folio);
-		folio_put(folio);
-
-		switch (rc) {
-		case 0:
-			/* Splitting succeeded, try again immediately. */
-			goto again;
-		case -EAGAIN:
-			/* Additional folio references. */
-			if (drain_lru(&drain_lru_called))
-				goto again;
-			return -EAGAIN;
-		case -EBUSY:
-			/* Unexpected race. */
-			return -EAGAIN;
-		}
-		WARN_ON_ONCE(1);
-		return -ENXIO;
-	case -EAGAIN:
-		/*
-		 * If we are here because the UVC returned busy or partial
-		 * completion, this is just a useless check, but it is safe.
-		 */
-		folio_wait_writeback(folio);
-		folio_put(folio);
-		return -EAGAIN;
-	case -EBUSY:
-		/* Additional folio references. */
-		if (drain_lru(&drain_lru_called))
-			goto again;
-		return -EAGAIN;
-	case -ENXIO:
-		if (gmap_fault(gmap, gaddr, FAULT_FLAG_WRITE))
-			return -EFAULT;
-		return -EAGAIN;
-	}
-	return rc;
-}
-EXPORT_SYMBOL_GPL(gmap_make_secure);
-
-int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr)
-{
-	struct uv_cb_cts uvcb = {
-		.header.cmd = UVC_CMD_CONV_TO_SEC_STOR,
-		.header.len = sizeof(uvcb),
-		.guest_handle = gmap->guest_handle,
-		.gaddr = gaddr,
-	};
-
-	return gmap_make_secure(gmap, gaddr, &uvcb);
-}
-EXPORT_SYMBOL_GPL(gmap_convert_to_secure);
-
-/**
- * gmap_destroy_page - Destroy a guest page.
- * @gmap: the gmap of the guest
- * @gaddr: the guest address to destroy
- *
- * An attempt will be made to destroy the given guest page. If the attempt
- * fails, an attempt is made to export the page. If both attempts fail, an
- * appropriate error is returned.
- */
-int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr)
-{
-	struct vm_area_struct *vma;
-	struct folio_walk fw;
-	unsigned long uaddr;
-	struct folio *folio;
-	int rc;
-
-	rc = -EFAULT;
-	mmap_read_lock(gmap->mm);
-
-	uaddr = __gmap_translate(gmap, gaddr);
-	if (IS_ERR_VALUE(uaddr))
-		goto out;
-	vma = vma_lookup(gmap->mm, uaddr);
-	if (!vma)
-		goto out;
-	/*
-	 * Huge pages should not be able to become secure
-	 */
-	if (is_vm_hugetlb_page(vma))
-		goto out;
-
-	rc = 0;
-	folio = folio_walk_start(&fw, vma, uaddr, 0);
-	if (!folio)
-		goto out;
-	/*
-	 * See gmap_make_secure(): large folios cannot be secure. Small
-	 * folio implies FW_LEVEL_PTE.
-	 */
-	if (folio_test_large(folio) || !pte_write(fw.pte))
-		goto out_walk_end;
-	rc = uv_destroy_folio(folio);
-	/*
-	 * Fault handlers can race; it is possible that two CPUs will fault
-	 * on the same secure page. One CPU can destroy the page, reboot,
-	 * re-enter secure mode and import it, while the second CPU was
-	 * stuck at the beginning of the handler. At some point the second
-	 * CPU will be able to progress, and it will not be able to destroy
-	 * the page. In that case we do not want to terminate the process,
-	 * we instead try to export the page.
-	 */
-	if (rc)
-		rc = uv_convert_from_secure_folio(folio);
-out_walk_end:
-	folio_walk_end(&fw, vma);
-out:
-	mmap_read_unlock(gmap->mm);
-	return rc;
-}
-EXPORT_SYMBOL_GPL(gmap_destroy_page);
+EXPORT_SYMBOL_GPL(make_folio_secure);
 
 /*
  * To be called with the folio locked or with an extra reference! This will
diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
index 02217fb4ae10..d972dea657fd 100644
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -8,7 +8,7 @@  include $(srctree)/virt/kvm/Makefile.kvm
 ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
 
 kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o
-kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o
+kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o gmap.o
 
 kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o
 obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c
new file mode 100644
index 000000000000..02adf151d4de
--- /dev/null
+++ b/arch/s390/kvm/gmap.c
@@ -0,0 +1,212 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Guest memory management for KVM/s390
+ *
+ * Copyright IBM Corp. 2008, 2020, 2024
+ *
+ *    Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
+ *               Martin Schwidefsky <schwidefsky@de.ibm.com>
+ *               David Hildenbrand <david@redhat.com>
+ *               Janosch Frank <frankja@linux.vnet.ibm.com>
+ */
+
+#include <linux/compiler.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/pgtable.h>
+#include <linux/pagemap.h>
+
+#include <asm/lowcore.h>
+#include <asm/gmap.h>
+#include <asm/uv.h>
+
+#include "gmap.h"
+
+/**
+ * should_export_before_import - Determine whether an export is needed
+ * before an import-like operation
+ * @uvcb: the Ultravisor control block of the UVC to be performed
+ * @mm: the mm of the process
+ *
+ * Returns whether an export is needed before every import-like operation.
+ * This is needed for shared pages, which don't trigger a secure storage
+ * exception when accessed from a different guest.
+ *
+ * Although considered as one, the Unpin Page UVC is not an actual import,
+ * so it is not affected.
+ *
+ * No export is needed also when there is only one protected VM, because the
+ * page cannot belong to the wrong VM in that case (there is no "other VM"
+ * it can belong to).
+ *
+ * Return: true if an export is needed before every import, otherwise false.
+ */
+static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm)
+{
+	/*
+	 * The misc feature indicates, among other things, that importing a
+	 * shared page from a different protected VM will automatically also
+	 * transfer its ownership.
+	 */
+	if (uv_has_feature(BIT_UV_FEAT_MISC))
+		return false;
+	if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED)
+		return false;
+	return atomic_read(&mm->context.protected_count) > 1;
+}
+
+static int __gmap_make_secure(struct gmap *gmap, struct page *page, void *uvcb)
+{
+	struct folio *folio = page_folio(page);
+	int rc;
+
+	/*
+	 * Secure pages cannot be huge and userspace should not combine both.
+	 * In case userspace does it anyway this will result in an -EFAULT for
+	 * the unpack. The guest is thus never reaching secure mode.
+	 * If userspace plays dirty tricks and decides to map huge pages at a
+	 * later point in time, it will receive a segmentation fault or
+	 * KVM_RUN will return -EFAULT.
+	 */
+	if (folio_test_hugetlb(folio))
+		return -EFAULT;
+	if (folio_test_large(folio)) {
+		mmap_read_unlock(gmap->mm);
+		rc = kvm_s390_wiggle_split_folio(gmap->mm, folio, true);
+		mmap_read_lock(gmap->mm);
+		if (rc)
+			return rc;
+		folio = page_folio(page);
+	}
+
+	if (!folio_trylock(folio))
+		return -EAGAIN;
+	if (should_export_before_import(uvcb, gmap->mm))
+		uv_convert_from_secure(folio_to_phys(folio));
+	rc = make_folio_secure(folio, uvcb);
+	folio_unlock(folio);
+
+	/*
+	 * In theory a race is possible and the folio might have become
+	 * large again before the folio_trylock() above. In that case, no
+	 * action is performed and -EAGAIN is returned; the callers will
+	 * have to try again later.
+	 * In most cases this implies running the VM again, getting the same
+	 * exception again, and make another attempt in this function.
+	 * This is expected to happen extremely rarely.
+	 */
+	if (rc == -E2BIG)
+		return -EAGAIN;
+	/* The folio has too many references, try to shake some off */
+	if (rc == -EBUSY) {
+		mmap_read_unlock(gmap->mm);
+		kvm_s390_wiggle_split_folio(gmap->mm, folio, false);
+		mmap_read_lock(gmap->mm);
+		return -EAGAIN;
+	}
+
+	return rc;
+}
+
+/**
+ * gmap_make_secure() - make one guest page secure
+ * @gmap: the guest gmap
+ * @gaddr: the guest address that needs to be made secure
+ * @uvcb: the UVCB specifying which operation needs to be performed
+ *
+ * Context: needs to be called with kvm->srcu held.
+ * Return: 0 on success, < 0 in case of error (see __gmap_make_secure()).
+ */
+int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
+{
+	struct kvm *kvm = gmap->private;
+	struct page *page;
+	int rc = 0;
+
+	lockdep_assert_held(&kvm->srcu);
+
+	page = gfn_to_page(kvm, gpa_to_gfn(gaddr));
+	mmap_read_lock(gmap->mm);
+	if (page)
+		rc = __gmap_make_secure(gmap, page, uvcb);
+	kvm_release_page_clean(page);
+	mmap_read_unlock(gmap->mm);
+
+	return rc;
+}
+
+int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr)
+{
+	struct uv_cb_cts uvcb = {
+		.header.cmd = UVC_CMD_CONV_TO_SEC_STOR,
+		.header.len = sizeof(uvcb),
+		.guest_handle = gmap->guest_handle,
+		.gaddr = gaddr,
+	};
+
+	return gmap_make_secure(gmap, gaddr, &uvcb);
+}
+
+/**
+ * __gmap_destroy_page() - Destroy a guest page.
+ * @gmap: the gmap of the guest
+ * @page: the page to destroy
+ *
+ * An attempt will be made to destroy the given guest page. If the attempt
+ * fails, an attempt is made to export the page. If both attempts fail, an
+ * appropriate error is returned.
+ *
+ * Context: must be called holding the mm lock for gmap->mm
+ */
+static int __gmap_destroy_page(struct gmap *gmap, struct page *page)
+{
+	struct folio *folio = page_folio(page);
+	int rc;
+
+	/*
+	 * See gmap_make_secure(): large folios cannot be secure. Small
+	 * folio implies FW_LEVEL_PTE.
+	 */
+	if (folio_test_large(folio))
+		return -EFAULT;
+
+	rc = uv_destroy_folio(folio);
+	/*
+	 * Fault handlers can race; it is possible that two CPUs will fault
+	 * on the same secure page. One CPU can destroy the page, reboot,
+	 * re-enter secure mode and import it, while the second CPU was
+	 * stuck at the beginning of the handler. At some point the second
+	 * CPU will be able to progress, and it will not be able to destroy
+	 * the page. In that case we do not want to terminate the process,
+	 * we instead try to export the page.
+	 */
+	if (rc)
+		rc = uv_convert_from_secure_folio(folio);
+
+	return rc;
+}
+
+/**
+ * gmap_destroy_page() - Destroy a guest page.
+ * @gmap: the gmap of the guest
+ * @gaddr: the guest address to destroy
+ *
+ * An attempt will be made to destroy the given guest page. If the attempt
+ * fails, an attempt is made to export the page. If both attempts fail, an
+ * appropriate error is returned.
+ *
+ * Context: may sleep.
+ */
+int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr)
+{
+	struct page *page;
+	int rc = 0;
+
+	mmap_read_lock(gmap->mm);
+	page = gfn_to_page(gmap->private, gpa_to_gfn(gaddr));
+	if (page)
+		rc = __gmap_destroy_page(gmap, page);
+	kvm_release_page_clean(page);
+	mmap_read_unlock(gmap->mm);
+	return rc;
+}
diff --git a/arch/s390/kvm/gmap.h b/arch/s390/kvm/gmap.h
new file mode 100644
index 000000000000..f2b52ce29be3
--- /dev/null
+++ b/arch/s390/kvm/gmap.h
@@ -0,0 +1,17 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *  KVM guest address space mapping code
+ *
+ *    Copyright IBM Corp. 2007, 2016, 2025
+ *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ *               Claudio Imbrenda <imbrenda@linux.ibm.com>
+ */
+
+#ifndef ARCH_KVM_S390_GMAP_H
+#define ARCH_KVM_S390_GMAP_H
+
+int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb);
+int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr);
+int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr);
+
+#endif
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 5bbaadf75dc6..acf10aefd08f 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -21,6 +21,7 @@ 
 #include "gaccess.h"
 #include "trace.h"
 #include "trace-s390.h"
+#include "gmap.h"
 
 u8 kvm_s390_get_ilen(struct kvm_vcpu *vcpu)
 {
@@ -549,7 +550,7 @@  static int handle_pv_uvc(struct kvm_vcpu *vcpu)
 	 * If the unpin did not succeed, the guest will exit again for the UVC
 	 * and we will retry the unpin.
 	 */
-	if (rc == -EINVAL)
+	if (rc == -EINVAL || rc == -ENXIO)
 		return 0;
 	/*
 	 * If we got -EAGAIN here, we simply return it. It will eventually
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index fc44002a7b04..a25ca440760f 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -50,6 +50,7 @@ 
 #include "kvm-s390.h"
 #include "gaccess.h"
 #include "pci.h"
+#include "gmap.h"
 
 #define CREATE_TRACE_POINTS
 #include "trace.h"
diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
index 75e81ba26d04..22c012aa5206 100644
--- a/arch/s390/kvm/pv.c
+++ b/arch/s390/kvm/pv.c
@@ -17,6 +17,7 @@ 
 #include <linux/sched/mm.h>
 #include <linux/mmu_notifier.h>
 #include "kvm-s390.h"
+#include "gmap.h"
 
 bool kvm_s390_pv_is_protected(struct kvm *kvm)
 {
@@ -638,10 +639,28 @@  static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak,
 		.tweak[1] = offset,
 	};
 	int ret = gmap_make_secure(kvm->arch.gmap, addr, &uvcb);
+	unsigned long vmaddr;
+	bool unlocked;
 
 	*rc = uvcb.header.rc;
 	*rrc = uvcb.header.rrc;
 
+	if (ret == -ENXIO) {
+		mmap_read_lock(kvm->mm);
+		vmaddr = gfn_to_hva(kvm, gpa_to_gfn(addr));
+		if (kvm_is_error_hva(vmaddr)) {
+			ret = -EFAULT;
+		} else {
+			ret = fixup_user_fault(kvm->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked);
+			if (!ret)
+				ret = __gmap_link(kvm->arch.gmap, addr, vmaddr);
+		}
+		mmap_read_unlock(kvm->mm);
+		if (!ret)
+			return -EAGAIN;
+		return ret;
+	}
+
 	if (ret && ret != -EAGAIN)
 		KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: failed addr %llx with rc %x rrc %x",
 			     uvcb.gaddr, *rc, *rrc);
@@ -660,6 +679,8 @@  int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size,
 	KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: start addr %lx size %lx",
 		     addr, size);
 
+	guard(srcu)(&kvm->srcu);
+
 	while (offset < size) {
 		ret = unpack_one(kvm, addr, tweak, offset, rc, rrc);
 		if (ret == -EAGAIN) {
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 16b8a36c56de..3e6e25119a96 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -3035,3 +3035,31 @@  int s390_replace_asce(struct gmap *gmap)
 	return 0;
 }
 EXPORT_SYMBOL_GPL(s390_replace_asce);
+
+/**
+ * kvm_s390_wiggle_split_folio() - try to drain extra references to a folio and optionally split
+ * @mm:    the mm containing the folio to work on
+ * @folio: the folio
+ * @split: whether to split a large folio
+ *
+ * Context: Must be called while holding an extra reference to the folio;
+ *          the mm lock should not be held.
+ */
+int kvm_s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio, bool split)
+{
+	int rc;
+
+	lockdep_assert_not_held(&mm->mmap_lock);
+	folio_wait_writeback(folio);
+	lru_add_drain_all();
+	if (split) {
+		folio_lock(folio);
+		rc = split_folio(folio);
+		folio_unlock(folio);
+
+		if (rc != -EBUSY)
+			return rc;
+	}
+	return -EAGAIN;
+}
+EXPORT_SYMBOL_GPL(kvm_s390_wiggle_split_folio);