
[RFC,09/37] KVM: s390: protvirt: Implement on-demand pinning

Message ID 20191024114059.102802-10-frankja@linux.ibm.com (mailing list archive)
State New, archived
Series KVM: s390: Add support for protected VMs

Commit Message

Janosch Frank Oct. 24, 2019, 11:40 a.m. UTC
From: Claudio Imbrenda <imbrenda@linux.ibm.com>

Pin the guest pages when they are first accessed, instead of all at
the same time when starting the guest.

Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
---
 arch/s390/include/asm/gmap.h |  1 +
 arch/s390/include/asm/uv.h   |  6 +++++
 arch/s390/kernel/uv.c        | 20 ++++++++++++++
 arch/s390/kvm/kvm-s390.c     |  2 ++
 arch/s390/kvm/pv.c           | 51 ++++++++++++++++++++++++++++++------
 5 files changed, 72 insertions(+), 8 deletions(-)

Comments

David Hildenbrand Oct. 25, 2019, 8:49 a.m. UTC | #1
On 24.10.19 13:40, Janosch Frank wrote:
> From: Claudio Imbrenda <imbrenda@linux.ibm.com>
> 
> Pin the guest pages when they are first accessed, instead of all at
> the same time when starting the guest.

Please explain why you do stuff. Why do we have to pin the whole guest 
memory? Why can't we mlock() the whole memory to avoid swapping in user 
space?

This really screams for a proper explanation.
> 
> Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
> ---
>   arch/s390/include/asm/gmap.h |  1 +
>   arch/s390/include/asm/uv.h   |  6 +++++
>   arch/s390/kernel/uv.c        | 20 ++++++++++++++
>   arch/s390/kvm/kvm-s390.c     |  2 ++
>   arch/s390/kvm/pv.c           | 51 ++++++++++++++++++++++++++++++------
>   5 files changed, 72 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
> index 99b3eedda26e..483f64427c0e 100644
> --- a/arch/s390/include/asm/gmap.h
> +++ b/arch/s390/include/asm/gmap.h
> @@ -63,6 +63,7 @@ struct gmap {
>   	struct gmap *parent;
>   	unsigned long orig_asce;
>   	unsigned long se_handle;
> +	struct page **pinned_pages;
>   	int edat_level;
>   	bool removed;
>   	bool initialized;
> diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h
> index 99cdd2034503..9ce9363aee1c 100644
> --- a/arch/s390/include/asm/uv.h
> +++ b/arch/s390/include/asm/uv.h
> @@ -298,6 +298,7 @@ static inline int uv_convert_from_secure(unsigned long paddr)
>   	return -EINVAL;
>   }
>   
> +int kvm_s390_pv_pin_page(struct gmap *gmap, unsigned long gpa);
>   /*
>    * Requests the Ultravisor to make a page accessible to a guest
>    * (import). If it's brought in the first time, it will be cleared. If
> @@ -317,6 +318,11 @@ static inline int uv_convert_to_secure(struct gmap *gmap, unsigned long gaddr)
>   		.gaddr = gaddr
>   	};
>   
> +	down_read(&gmap->mm->mmap_sem);
> +	cc = kvm_s390_pv_pin_page(gmap, gaddr);
> +	up_read(&gmap->mm->mmap_sem);
> +	if (cc)
> +		return cc;
>   	cc = uv_call(0, (u64)&uvcb);
>   
>   	if (!cc)
> diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
> index f7778493e829..36554402b5c6 100644
> --- a/arch/s390/kernel/uv.c
> +++ b/arch/s390/kernel/uv.c
> @@ -98,4 +98,24 @@ void adjust_to_uv_max(unsigned long *vmax)
>   	if (prot_virt_host && *vmax > uv_info.max_sec_stor_addr)
>   		*vmax = uv_info.max_sec_stor_addr;
>   }
> +
> +int kvm_s390_pv_pin_page(struct gmap *gmap, unsigned long gpa)
> +{
> +	unsigned long hva, gfn = gpa / PAGE_SIZE;
> +	int rc;
> +
> +	if (!gmap->pinned_pages)
> +		return -EINVAL;
> +	hva = __gmap_translate(gmap, gpa);
> +	if (IS_ERR_VALUE(hva))
> +		return -EFAULT;
> +	if (gmap->pinned_pages[gfn])
> +		return -EEXIST;
> +	rc = get_user_pages_fast(hva, 1, FOLL_WRITE, gmap->pinned_pages + gfn);
> +	if (rc < 0)
> +		return rc;
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(kvm_s390_pv_pin_page);
> +
>   #endif
> diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
> index d1ba12f857e7..490fde080107 100644
> --- a/arch/s390/kvm/kvm-s390.c
> +++ b/arch/s390/kvm/kvm-s390.c
> @@ -2196,6 +2196,7 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
>   		/* All VCPUs have to be destroyed before this call. */
>   		mutex_lock(&kvm->lock);
>   		kvm_s390_vcpu_block_all(kvm);
> +		kvm_s390_pv_unpin(kvm);
>   		r = kvm_s390_pv_destroy_vm(kvm);
>   		if (!r)
>   			kvm_s390_pv_dealloc_vm(kvm);
> @@ -2680,6 +2681,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
>   	kvm_s390_gisa_destroy(kvm);
>   	if (IS_ENABLED(CONFIG_KVM_S390_PROTECTED_VIRTUALIZATION_HOST) &&
>   	    kvm_s390_pv_is_protected(kvm)) {
> +		kvm_s390_pv_unpin(kvm);
>   		kvm_s390_pv_destroy_vm(kvm);
>   		kvm_s390_pv_dealloc_vm(kvm);
>   	}
> diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
> index 80aecd5bea9e..383e660e2221 100644
> --- a/arch/s390/kvm/pv.c
> +++ b/arch/s390/kvm/pv.c
> @@ -15,8 +15,35 @@
>   #include <asm/mman.h>
>   #include "kvm-s390.h"
>   
> +static void unpin_destroy(struct page **pages, int nr)
> +{
> +	int i;
> +	struct page *page;
> +	u8 *val;
> +
> +	for (i = 0; i < nr; i++) {
> +		page = pages[i];
> +		if (!page)	/* page was never used */
> +			continue;
> +		val = (void *)page_to_phys(page);
> +		READ_ONCE(*val);
> +		put_page(page);
> +	}
> +}
> +
> +void kvm_s390_pv_unpin(struct kvm *kvm)
> +{
> +	unsigned long npages = kvm->arch.pv.guest_len / PAGE_SIZE;
> +
> +	mutex_lock(&kvm->slots_lock);
> +	unpin_destroy(kvm->arch.gmap->pinned_pages, npages);
> +	mutex_unlock(&kvm->slots_lock);
> +}
> +
>   void kvm_s390_pv_dealloc_vm(struct kvm *kvm)
>   {
> +	vfree(kvm->arch.gmap->pinned_pages);
> +	kvm->arch.gmap->pinned_pages = NULL;
>   	vfree(kvm->arch.pv.stor_var);
>   	free_pages(kvm->arch.pv.stor_base,
>   		   get_order(uv_info.guest_base_stor_len));
> @@ -28,7 +55,6 @@ int kvm_s390_pv_alloc_vm(struct kvm *kvm)
>   	unsigned long base = uv_info.guest_base_stor_len;
>   	unsigned long virt = uv_info.guest_virt_var_stor_len;
>   	unsigned long npages = 0, vlen = 0;
> -	struct kvm_memslots *slots;
>   	struct kvm_memory_slot *memslot;
>   
>   	kvm->arch.pv.stor_var = NULL;
> @@ -43,22 +69,26 @@ int kvm_s390_pv_alloc_vm(struct kvm *kvm)
>   	 * Slots are sorted by GFN
>   	 */
>   	mutex_lock(&kvm->slots_lock);
> -	slots = kvm_memslots(kvm);
> -	memslot = slots->memslots;
> +	memslot = kvm_memslots(kvm)->memslots;
>   	npages = memslot->base_gfn + memslot->npages;
> -
>   	mutex_unlock(&kvm->slots_lock);
> +
> +	kvm->arch.gmap->pinned_pages = vzalloc(npages * sizeof(struct page *));
> +	if (!kvm->arch.gmap->pinned_pages)
> +		goto out_err;
>   	kvm->arch.pv.guest_len = npages * PAGE_SIZE;
>   
>   	/* Allocate variable storage */
>   	vlen = ALIGN(virt * ((npages * PAGE_SIZE) / HPAGE_SIZE), PAGE_SIZE);
>   	vlen += uv_info.guest_virt_base_stor_len;
>   	kvm->arch.pv.stor_var = vzalloc(vlen);
> -	if (!kvm->arch.pv.stor_var) {
> -		kvm_s390_pv_dealloc_vm(kvm);
> -		return -ENOMEM;
> -	}
> +	if (!kvm->arch.pv.stor_var)
> +		goto out_err;
>   	return 0;
> +
> +out_err:
> +	kvm_s390_pv_dealloc_vm(kvm);
> +	return -ENOMEM;
>   }
>   
>   int kvm_s390_pv_destroy_vm(struct kvm *kvm)
> @@ -216,6 +246,11 @@ int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size,
>   	for (i = 0; i < size / PAGE_SIZE; i++) {
>   		uvcb.gaddr = addr + i * PAGE_SIZE;
>   		uvcb.tweak[1] = i * PAGE_SIZE;
> +		down_read(&kvm->mm->mmap_sem);
> +		rc = kvm_s390_pv_pin_page(kvm->arch.gmap, uvcb.gaddr);
> +		up_read(&kvm->mm->mmap_sem);
> +		if (rc && (rc != -EEXIST))
> +			break;
>   retry:
>   		rc = uv_call(0, (u64)&uvcb);
>   		if (!rc)
>
Christian Borntraeger Oct. 31, 2019, 3:41 p.m. UTC | #2
On 25.10.19 10:49, David Hildenbrand wrote:
> On 24.10.19 13:40, Janosch Frank wrote:
>> From: Claudio Imbrenda <imbrenda@linux.ibm.com>
>>
>> Pin the guest pages when they are first accessed, instead of all at
>> the same time when starting the guest.
> 
> Please explain why you do stuff. Why do we have to pin the hole guest memory? Why can't we mlock() the hole memory to avoid swapping in user space?

Basically we pin the guest for the same reason as AMD did it for their SEV. It is hard
to synchronize page import/export with the I/O for paging. For example you can actually
fault in a page that is currently under paging I/O. What do you do? Import (so that the 
guest can run) or export (so that the I/O will work)? As this turned out to be harder than
we thought, we decided to defer paging to a later point in time.

As we do not want to rely on userspace to do the mlock, this is now done in the kernel.



> 
> This really screams for a proper explanation.
>>
>> Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
>> ---
>>   arch/s390/include/asm/gmap.h |  1 +
>>   arch/s390/include/asm/uv.h   |  6 +++++
>>   arch/s390/kernel/uv.c        | 20 ++++++++++++++
>>   arch/s390/kvm/kvm-s390.c     |  2 ++
>>   arch/s390/kvm/pv.c           | 51 ++++++++++++++++++++++++++++++------
>>   5 files changed, 72 insertions(+), 8 deletions(-)
>>
>> diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
>> index 99b3eedda26e..483f64427c0e 100644
>> --- a/arch/s390/include/asm/gmap.h
>> +++ b/arch/s390/include/asm/gmap.h
>> @@ -63,6 +63,7 @@ struct gmap {
>>       struct gmap *parent;
>>       unsigned long orig_asce;
>>       unsigned long se_handle;
>> +    struct page **pinned_pages;
>>       int edat_level;
>>       bool removed;
>>       bool initialized;
>> diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h
>> index 99cdd2034503..9ce9363aee1c 100644
>> --- a/arch/s390/include/asm/uv.h
>> +++ b/arch/s390/include/asm/uv.h
>> @@ -298,6 +298,7 @@ static inline int uv_convert_from_secure(unsigned long paddr)
>>       return -EINVAL;
>>   }
>>   +int kvm_s390_pv_pin_page(struct gmap *gmap, unsigned long gpa);
>>   /*
>>    * Requests the Ultravisor to make a page accessible to a guest
>>    * (import). If it's brought in the first time, it will be cleared. If
>> @@ -317,6 +318,11 @@ static inline int uv_convert_to_secure(struct gmap *gmap, unsigned long gaddr)
>>           .gaddr = gaddr
>>       };
>>   +    down_read(&gmap->mm->mmap_sem);
>> +    cc = kvm_s390_pv_pin_page(gmap, gaddr);
>> +    up_read(&gmap->mm->mmap_sem);
>> +    if (cc)
>> +        return cc;
>>       cc = uv_call(0, (u64)&uvcb);
>>         if (!cc)
>> diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
>> index f7778493e829..36554402b5c6 100644
>> --- a/arch/s390/kernel/uv.c
>> +++ b/arch/s390/kernel/uv.c
>> @@ -98,4 +98,24 @@ void adjust_to_uv_max(unsigned long *vmax)
>>       if (prot_virt_host && *vmax > uv_info.max_sec_stor_addr)
>>           *vmax = uv_info.max_sec_stor_addr;
>>   }
>> +
>> +int kvm_s390_pv_pin_page(struct gmap *gmap, unsigned long gpa)
>> +{
>> +    unsigned long hva, gfn = gpa / PAGE_SIZE;
>> +    int rc;
>> +
>> +    if (!gmap->pinned_pages)
>> +        return -EINVAL;
>> +    hva = __gmap_translate(gmap, gpa);
>> +    if (IS_ERR_VALUE(hva))
>> +        return -EFAULT;
>> +    if (gmap->pinned_pages[gfn])
>> +        return -EEXIST;
>> +    rc = get_user_pages_fast(hva, 1, FOLL_WRITE, gmap->pinned_pages + gfn);
>> +    if (rc < 0)
>> +        return rc;
>> +    return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(kvm_s390_pv_pin_page);
>> +
>>   #endif
>> diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
>> index d1ba12f857e7..490fde080107 100644
>> --- a/arch/s390/kvm/kvm-s390.c
>> +++ b/arch/s390/kvm/kvm-s390.c
>> @@ -2196,6 +2196,7 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
>>           /* All VCPUs have to be destroyed before this call. */
>>           mutex_lock(&kvm->lock);
>>           kvm_s390_vcpu_block_all(kvm);
>> +        kvm_s390_pv_unpin(kvm);
>>           r = kvm_s390_pv_destroy_vm(kvm);
>>           if (!r)
>>               kvm_s390_pv_dealloc_vm(kvm);
>> @@ -2680,6 +2681,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
>>       kvm_s390_gisa_destroy(kvm);
>>       if (IS_ENABLED(CONFIG_KVM_S390_PROTECTED_VIRTUALIZATION_HOST) &&
>>           kvm_s390_pv_is_protected(kvm)) {
>> +        kvm_s390_pv_unpin(kvm);
>>           kvm_s390_pv_destroy_vm(kvm);
>>           kvm_s390_pv_dealloc_vm(kvm);
>>       }
>> diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
>> index 80aecd5bea9e..383e660e2221 100644
>> --- a/arch/s390/kvm/pv.c
>> +++ b/arch/s390/kvm/pv.c
>> @@ -15,8 +15,35 @@
>>   #include <asm/mman.h>
>>   #include "kvm-s390.h"
>>   +static void unpin_destroy(struct page **pages, int nr)
>> +{
>> +    int i;
>> +    struct page *page;
>> +    u8 *val;
>> +
>> +    for (i = 0; i < nr; i++) {
>> +        page = pages[i];
>> +        if (!page)    /* page was never used */
>> +            continue;
>> +        val = (void *)page_to_phys(page);
>> +        READ_ONCE(*val);
>> +        put_page(page);
>> +    }
>> +}
>> +
>> +void kvm_s390_pv_unpin(struct kvm *kvm)
>> +{
>> +    unsigned long npages = kvm->arch.pv.guest_len / PAGE_SIZE;
>> +
>> +    mutex_lock(&kvm->slots_lock);
>> +    unpin_destroy(kvm->arch.gmap->pinned_pages, npages);
>> +    mutex_unlock(&kvm->slots_lock);
>> +}
>> +
>>   void kvm_s390_pv_dealloc_vm(struct kvm *kvm)
>>   {
>> +    vfree(kvm->arch.gmap->pinned_pages);
>> +    kvm->arch.gmap->pinned_pages = NULL;
>>       vfree(kvm->arch.pv.stor_var);
>>       free_pages(kvm->arch.pv.stor_base,
>>              get_order(uv_info.guest_base_stor_len));
>> @@ -28,7 +55,6 @@ int kvm_s390_pv_alloc_vm(struct kvm *kvm)
>>       unsigned long base = uv_info.guest_base_stor_len;
>>       unsigned long virt = uv_info.guest_virt_var_stor_len;
>>       unsigned long npages = 0, vlen = 0;
>> -    struct kvm_memslots *slots;
>>       struct kvm_memory_slot *memslot;
>>         kvm->arch.pv.stor_var = NULL;
>> @@ -43,22 +69,26 @@ int kvm_s390_pv_alloc_vm(struct kvm *kvm)
>>        * Slots are sorted by GFN
>>        */
>>       mutex_lock(&kvm->slots_lock);
>> -    slots = kvm_memslots(kvm);
>> -    memslot = slots->memslots;
>> +    memslot = kvm_memslots(kvm)->memslots;
>>       npages = memslot->base_gfn + memslot->npages;
>> -
>>       mutex_unlock(&kvm->slots_lock);
>> +
>> +    kvm->arch.gmap->pinned_pages = vzalloc(npages * sizeof(struct page *));
>> +    if (!kvm->arch.gmap->pinned_pages)
>> +        goto out_err;
>>       kvm->arch.pv.guest_len = npages * PAGE_SIZE;
>>         /* Allocate variable storage */
>>       vlen = ALIGN(virt * ((npages * PAGE_SIZE) / HPAGE_SIZE), PAGE_SIZE);
>>       vlen += uv_info.guest_virt_base_stor_len;
>>       kvm->arch.pv.stor_var = vzalloc(vlen);
>> -    if (!kvm->arch.pv.stor_var) {
>> -        kvm_s390_pv_dealloc_vm(kvm);
>> -        return -ENOMEM;
>> -    }
>> +    if (!kvm->arch.pv.stor_var)
>> +        goto out_err;
>>       return 0;
>> +
>> +out_err:
>> +    kvm_s390_pv_dealloc_vm(kvm);
>> +    return -ENOMEM;
>>   }
>>     int kvm_s390_pv_destroy_vm(struct kvm *kvm)
>> @@ -216,6 +246,11 @@ int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size,
>>       for (i = 0; i < size / PAGE_SIZE; i++) {
>>           uvcb.gaddr = addr + i * PAGE_SIZE;
>>           uvcb.tweak[1] = i * PAGE_SIZE;
>> +        down_read(&kvm->mm->mmap_sem);
>> +        rc = kvm_s390_pv_pin_page(kvm->arch.gmap, uvcb.gaddr);
>> +        up_read(&kvm->mm->mmap_sem);
>> +        if (rc && (rc != -EEXIST))
>> +            break;
>>   retry:
>>           rc = uv_call(0, (u64)&uvcb);
>>           if (!rc)
>>
> 
>
David Hildenbrand Oct. 31, 2019, 5:30 p.m. UTC | #3
On 31.10.19 16:41, Christian Borntraeger wrote:
> 
> 
> On 25.10.19 10:49, David Hildenbrand wrote:
>> On 24.10.19 13:40, Janosch Frank wrote:
>>> From: Claudio Imbrenda <imbrenda@linux.ibm.com>
>>>
>>> Pin the guest pages when they are first accessed, instead of all at
>>> the same time when starting the guest.
>>
>> Please explain why you do stuff. Why do we have to pin the hole guest memory? Why can't we mlock() the hole memory to avoid swapping in user space?
> 
> Basically we pin the guest for the same reason as AMD did it for their SEV. It is hard

Pinning all guest memory is very ugly. What you want is "don't page", 
what you get is unmovable pages all over the place. I was hoping that 
you could get around this by having an automatic back-and-forth 
conversion in place (due to the special new exceptions).

> to synchronize page import/export with the I/O for paging. For example you can actually
> fault in a page that is currently under paging I/O. What do you do? import (so that the
> guest can run) or export (so that the I/O will work). As this turned out to be harder then
> we though we decided to defer paging to a later point in time.

I don't quite see the issue yet. If you page out, the page will 
automatically (on access) be converted to !secure/encrypted memory. If 
the UV/guest wants to access it, it will be automatically converted to 
secure/unencrypted memory. If you have concurrent access, it will be 
converted back and forth until one party is done.

A proper automatic conversion should make this work. What am I missing?

> 
> As we do not want to rely on the userspace to do the mlock this is now done in the kernel.

I wonder if we could come up with an alternative (similar to how we 
override VM_MERGEABLE in the kernel) that can be called and ensured in 
the kernel. E.g., marking whole VMAs as "don't page" (I remember 
something like "special VMAs", as used for VDSOs, that achieve exactly 
that, but I am absolutely no expert on that). That would be much nicer 
than pinning all pages and remembering what you pinned in huge page 
arrays ...
Janosch Frank Oct. 31, 2019, 8:57 p.m. UTC | #4
On 10/31/19 6:30 PM, David Hildenbrand wrote:
> On 31.10.19 16:41, Christian Borntraeger wrote:
>>
>>
>> On 25.10.19 10:49, David Hildenbrand wrote:
>>> On 24.10.19 13:40, Janosch Frank wrote:
>>>> From: Claudio Imbrenda <imbrenda@linux.ibm.com>
>>>>
>>>> Pin the guest pages when they are first accessed, instead of all at
>>>> the same time when starting the guest.
>>>
>>> Please explain why you do stuff. Why do we have to pin the hole guest memory? Why can't we mlock() the hole memory to avoid swapping in user space?
>>
>> Basically we pin the guest for the same reason as AMD did it for their SEV. It is hard
> 
> Pinning all guest memory is very ugly. What you want is "don't page", 
> what you get is unmovable pages all over the place. I was hoping that 
> you could get around this by having an automatic back-and-forth 
> conversion in place (due to the special new exceptions).

Yes, that's one of the ideas that have been circulating.

> 
>> to synchronize page import/export with the I/O for paging. For example you can actually
>> fault in a page that is currently under paging I/O. What do you do? import (so that the
>> guest can run) or export (so that the I/O will work). As this turned out to be harder then
>> we though we decided to defer paging to a later point in time.
> 
> I don't quite see the issue yet. If you page out, the page will 
> automatically (on access) be converted to !secure/encrypted memory. If 
> the UV/guest wants to access it, it will be automatically converted to 
> secure/unencrypted memory. If you have concurrent access, it will be 
> converted back and forth until one party is done.

I/O does not trigger an export on an imported page, but an error
condition in the I/O subsystem. The paging code does not read pages through
the CPU, but often just asks the device to read directly, and that's
where everything goes wrong. We could bounce swapping, but chose to pin
for now until we find a proper solution to that problem which nicely
integrates into Linux.

> 
> A proper automatic conversion should make this work. What am I missing?
> 
>>
>> As we do not want to rely on the userspace to do the mlock this is now done in the kernel.
> 
> I wonder if we could come up with an alternative (similar to how we 
> override VM_MERGEABLE in the kernel) that can be called and ensured in 
> the kernel. E.g., marking whole VMAs as "don't page" (I remember 
> something like "special VMAs" like used for VDSOs that achieve exactly 
> that, but I am absolutely no expert on that). That would be much nicer 
> than pinning all pages and remembering what you pinned in huge page 
> arrays ...

It might be more worthwhile to just accept one or two releases with
pinning and fix the root of the problem than design a nice stopgap.

Btw. s390 is not alone with the problem and we'll try to have another
discussion tomorrow with AMD to find a solution which works for more
than one architecture.
Claudio Imbrenda Nov. 1, 2019, 8:50 a.m. UTC | #5
On Thu, 31 Oct 2019 18:30:30 +0100
David Hildenbrand <david@redhat.com> wrote:

> On 31.10.19 16:41, Christian Borntraeger wrote:
> > 
> > 
> > On 25.10.19 10:49, David Hildenbrand wrote:  
> >> On 24.10.19 13:40, Janosch Frank wrote:  
> >>> From: Claudio Imbrenda <imbrenda@linux.ibm.com>
> >>>
> >>> Pin the guest pages when they are first accessed, instead of all
> >>> at the same time when starting the guest.  
> >>
> >> Please explain why you do stuff. Why do we have to pin the hole
> >> guest memory? Why can't we mlock() the hole memory to avoid
> >> swapping in user space?  
> > 
> > Basically we pin the guest for the same reason as AMD did it for
> > their SEV. It is hard  
> 
> Pinning all guest memory is very ugly. What you want is "don't page", 
> what you get is unmovable pages all over the place. I was hoping that 
> you could get around this by having an automatic back-and-forth 
> conversion in place (due to the special new exceptions).

we're not pinning all of guest memory, btw, but only the pages that are
actually used.

so if you have a *huge* guest, only the few pages used by the kernel and
initrd are actually pinned at VM start. Then one by one the ones
actually used when the guest is running get pinned on first use.

I don't need to add anything regarding the other points since the others
have commented already :)
Christian Borntraeger Nov. 2, 2019, 8:53 a.m. UTC | #6
On 24.10.19 13:40, Janosch Frank wrote:
> From: Claudio Imbrenda <imbrenda@linux.ibm.com>
> 
> Pin the guest pages when they are first accessed, instead of all at
> the same time when starting the guest.
> 
> Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
[...]
> diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
> index 80aecd5bea9e..383e660e2221 100644
> --- a/arch/s390/kvm/pv.c
> +++ b/arch/s390/kvm/pv.c
> @@ -15,8 +15,35 @@
>  #include <asm/mman.h>
>  #include "kvm-s390.h"
>  
> +static void unpin_destroy(struct page **pages, int nr)
> +{
> +	int i;
> +	struct page *page;
> +	u8 *val;
> +
> +	for (i = 0; i < nr; i++) {
> +		page = pages[i];
> +		if (!page)	/* page was never used */
> +			continue;
> +		val = (void *)page_to_phys(page);

Why don't we call uv_convert_from_secure() directly here to avoid the fault overhead?

> +		READ_ONCE(*val);
> +		put_page(page);

As we also do the export here (via implicitly reading that page), this can take a while.
I think we must add a cond_resched() here.

> +	}
> +}
> +
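
Something along these lines, as a sketch only (reusing the
uv_convert_from_secure() helper introduced earlier in this series; its
return value is ignored here for brevity):

static void unpin_destroy(struct page **pages, int nr)
{
	int i;

	for (i = 0; i < nr; i++) {
		if (!pages[i])	/* page was never used */
			continue;
		/* export directly instead of faulting via READ_ONCE() */
		uv_convert_from_secure(page_to_phys(pages[i]));
		put_page(pages[i]);
		cond_resched();	/* the export can take a while */
	}
}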

[...]
> -
>  	mutex_unlock(&kvm->slots_lock);
> +
> +	kvm->arch.gmap->pinned_pages = vzalloc(npages * sizeof(struct page *));
> +	if (!kvm->arch.gmap->pinned_pages)
> +		goto out_err;
>  	kvm->arch.pv.guest_len = npages * PAGE_SIZE;
>  
>  	/* Allocate variable storage */
>  	vlen = ALIGN(virt * ((npages * PAGE_SIZE) / HPAGE_SIZE), PAGE_SIZE);
>  	vlen += uv_info.guest_virt_base_stor_len;
>  	kvm->arch.pv.stor_var = vzalloc(vlen);
> -	if (!kvm->arch.pv.stor_var) {
> -		kvm_s390_pv_dealloc_vm(kvm);
> -		return -ENOMEM;
> -	}
> +	if (!kvm->arch.pv.stor_var)
> +		goto out_err;
>  	return 0;
> +
> +out_err:
> +	kvm_s390_pv_dealloc_vm(kvm);
> +	return -ENOMEM;
>  }
>  
>  int kvm_s390_pv_destroy_vm(struct kvm *kvm)
> @@ -216,6 +246,11 @@ int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size,
>  	for (i = 0; i < size / PAGE_SIZE; i++) {
>  		uvcb.gaddr = addr + i * PAGE_SIZE;
>  		uvcb.tweak[1] = i * PAGE_SIZE;
> +		down_read(&kvm->mm->mmap_sem);
> +		rc = kvm_s390_pv_pin_page(kvm->arch.gmap, uvcb.gaddr);
> +		up_read(&kvm->mm->mmap_sem);

Here we should also have a cond_resched();

> +		if (rc && (rc != -EEXIST))
> +			break;
>  retry:
>  		rc = uv_call(0, (u64)&uvcb);
>  		if (!rc)
>
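
I.e., a sketch of the unpack loop with the reschedule point added,
otherwise unchanged from the patch:

	for (i = 0; i < size / PAGE_SIZE; i++) {
		uvcb.gaddr = addr + i * PAGE_SIZE;
		uvcb.tweak[1] = i * PAGE_SIZE;
		down_read(&kvm->mm->mmap_sem);
		rc = kvm_s390_pv_pin_page(kvm->arch.gmap, uvcb.gaddr);
		up_read(&kvm->mm->mmap_sem);
		cond_resched();	/* pin + import can take a while per page */
		if (rc && (rc != -EEXIST))
			break;
		/* ... retry/uv_call() part of the loop stays as is ... */
	}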
David Hildenbrand Nov. 4, 2019, 10:19 a.m. UTC | #7
>>> to synchronize page import/export with the I/O for paging. For example you can actually
>>> fault in a page that is currently under paging I/O. What do you do? import (so that the
>>> guest can run) or export (so that the I/O will work). As this turned out to be harder then
>>> we though we decided to defer paging to a later point in time.
>>
>> I don't quite see the issue yet. If you page out, the page will
>> automatically (on access) be converted to !secure/encrypted memory. If
>> the UV/guest wants to access it, it will be automatically converted to
>> secure/unencrypted memory. If you have concurrent access, it will be
>> converted back and forth until one party is done.
> 
> IO does not trigger an export on an imported page, but an error
> condition in the IO subsystem. The page code does not read pages through

Ah, that makes it much clearer. Thanks!

> the cpu, but often just asks the device to read directly and that's
> where everything goes wrong. We could bounce swapping, but chose to pin
> for now until we find a proper solution to that problem which nicely
> integrates into linux.

How hard would it be to

1. Detect the error condition
2. Try a read on the affected page from the CPU (which will automatically 
convert it to encrypted/!secure)
3. Restart the I/O

I assume that this is a corner case where we don't really have to care 
about performance in the first shot.
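
As a very rough sketch of that flow in a single low-level driver (all
driver-side names here, struct io_req, handle_secure_storage_error() and
restart_request(), are made up for illustration; no existing API is
implied):

/* hypothetical per-request descriptor, for illustration only */
struct io_req {
	struct page **pages;
	unsigned int nr_pages;
};

static int handle_secure_storage_error(struct io_req *req)
{
	unsigned int i;

	/* a CPU read faults each page back to non-secure (exported) state */
	for (i = 0; i < req->nr_pages; i++)
		READ_ONCE(*(u8 *)page_address(req->pages[i]));

	return restart_request(req);	/* hypothetical: re-drive the I/O once */
}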

> 
>>
>> A proper automatic conversion should make this work. What am I missing?
>>
>>>
>>> As we do not want to rely on the userspace to do the mlock this is now done in the kernel.
>>
>> I wonder if we could come up with an alternative (similar to how we
>> override VM_MERGEABLE in the kernel) that can be called and ensured in
>> the kernel. E.g., marking whole VMAs as "don't page" (I remember
>> something like "special VMAs" like used for VDSOs that achieve exactly
>> that, but I am absolutely no expert on that). That would be much nicer
>> than pinning all pages and remembering what you pinned in huge page
>> arrays ...
> 
> It might be more worthwhile to just accept one or two releases with
> pinning and fix the root of the problem than design a nice stopgap.

Quite honestly, to me this feels like a prototype hack that deserves a 
proper solution first. The issue with this hack is that it affects user 
space (esp. MADV_DONTNEED no longer working correctly). It's not just 
something you once fix in the kernel and be done with it.
> 
> Btw. s390 is not alone with the problem and we'll try to have another
> discussion tomorrow with AMD to find a solution which works for more
> than one architecture.

Let me know if there was an interesting outcome.
David Hildenbrand Nov. 4, 2019, 10:22 a.m. UTC | #8
On 01.11.19 09:50, Claudio Imbrenda wrote:
> On Thu, 31 Oct 2019 18:30:30 +0100
> David Hildenbrand <david@redhat.com> wrote:
> 
>> On 31.10.19 16:41, Christian Borntraeger wrote:
>>>
>>>
>>> On 25.10.19 10:49, David Hildenbrand wrote:
>>>> On 24.10.19 13:40, Janosch Frank wrote:
>>>>> From: Claudio Imbrenda <imbrenda@linux.ibm.com>
>>>>>
>>>>> Pin the guest pages when they are first accessed, instead of all
>>>>> at the same time when starting the guest.
>>>>
>>>> Please explain why you do stuff. Why do we have to pin the hole
>>>> guest memory? Why can't we mlock() the hole memory to avoid
>>>> swapping in user space?
>>>
>>> Basically we pin the guest for the same reason as AMD did it for
>>> their SEV. It is hard
>>
>> Pinning all guest memory is very ugly. What you want is "don't page",
>> what you get is unmovable pages all over the place. I was hoping that
>> you could get around this by having an automatic back-and-forth
>> conversion in place (due to the special new exceptions).
> 
> we're not pinning all of guest memory, btw, but only the pages that are
> actually used.

Any longer-running guest will eventually touch all guest physical memory 
(e.g., page cache, page shuffling), so this is only an optimization for 
short-running guests.
Janosch Frank Nov. 4, 2019, 10:25 a.m. UTC | #9
On 11/4/19 11:19 AM, David Hildenbrand wrote:
>>>> to synchronize page import/export with the I/O for paging. For example you can actually
>>>> fault in a page that is currently under paging I/O. What do you do? import (so that the
>>>> guest can run) or export (so that the I/O will work). As this turned out to be harder then
>>>> we though we decided to defer paging to a later point in time.
>>>
>>> I don't quite see the issue yet. If you page out, the page will
>>> automatically (on access) be converted to !secure/encrypted memory. If
>>> the UV/guest wants to access it, it will be automatically converted to
>>> secure/unencrypted memory. If you have concurrent access, it will be
>>> converted back and forth until one party is done.
>>
>> IO does not trigger an export on an imported page, but an error
>> condition in the IO subsystem. The page code does not read pages through
> 
> Ah, that makes it much clearer. Thanks!
> 
>> the cpu, but often just asks the device to read directly and that's
>> where everything goes wrong. We could bounce swapping, but chose to pin
>> for now until we find a proper solution to that problem which nicely
>> integrates into linux.
> 
> How hard would it be to
> 
> 1. Detect the error condition
> 2. Try a read on the affected page from the CPU (will will automatically 
> convert to encrypted/!secure)
> 3. Restart the I/O
> 
> I assume that this is a corner case where we don't really have to care 
> about performance in the first shot.

Restarting I/O can be quite difficult with CCW; we might need to change
the request data...

> 
>>
>>>
>>> A proper automatic conversion should make this work. What am I missing?
>>>
>>>>
>>>> As we do not want to rely on the userspace to do the mlock this is now done in the kernel.
>>>
>>> I wonder if we could come up with an alternative (similar to how we
>>> override VM_MERGEABLE in the kernel) that can be called and ensured in
>>> the kernel. E.g., marking whole VMAs as "don't page" (I remember
>>> something like "special VMAs" like used for VDSOs that achieve exactly
>>> that, but I am absolutely no expert on that). That would be much nicer
>>> than pinning all pages and remembering what you pinned in huge page
>>> arrays ...
>>
>> It might be more worthwhile to just accept one or two releases with
>> pinning and fix the root of the problem than design a nice stopgap.
> 
> Quite honestly, to me this feels like a prototype hack that deserves a 
> proper solution first. The issue with this hack is that it affects user 
> space (esp. MADV_DONTNEED no longer working correctly). It's not just 
> something you once fix in the kernel and be done with it.

It is a hack, yes.
But we're not the only architecture to need it: x86 pins all the memory
at the start of the VM, and that code is already upstream...

>>
>> Btw. s390 is not alone with the problem and we'll try to have another
>> discussion tomorrow with AMD to find a solution which works for more
>> than one architecture.
> 
> Let me know if there was an interesting outcome.
>
David Hildenbrand Nov. 4, 2019, 10:27 a.m. UTC | #10
On 04.11.19 11:25, Janosch Frank wrote:
> On 11/4/19 11:19 AM, David Hildenbrand wrote:
>>>>> to synchronize page import/export with the I/O for paging. For example you can actually
>>>>> fault in a page that is currently under paging I/O. What do you do? import (so that the
>>>>> guest can run) or export (so that the I/O will work). As this turned out to be harder then
>>>>> we though we decided to defer paging to a later point in time.
>>>>
>>>> I don't quite see the issue yet. If you page out, the page will
>>>> automatically (on access) be converted to !secure/encrypted memory. If
>>>> the UV/guest wants to access it, it will be automatically converted to
>>>> secure/unencrypted memory. If you have concurrent access, it will be
>>>> converted back and forth until one party is done.
>>>
>>> IO does not trigger an export on an imported page, but an error
>>> condition in the IO subsystem. The page code does not read pages through
>>
>> Ah, that makes it much clearer. Thanks!
>>
>>> the cpu, but often just asks the device to read directly and that's
>>> where everything goes wrong. We could bounce swapping, but chose to pin
>>> for now until we find a proper solution to that problem which nicely
>>> integrates into linux.
>>
>> How hard would it be to
>>
>> 1. Detect the error condition
>> 2. Try a read on the affected page from the CPU (will will automatically
>> convert to encrypted/!secure)
>> 3. Restart the I/O
>>
>> I assume that this is a corner case where we don't really have to care
>> about performance in the first shot.
> 
> Restarting IO can be quite difficult with CCW, we might need to change
> request data...

I am no I/O expert, so I can't comment on whether that would be possible :(


>>>> A proper automatic conversion should make this work. What am I missing?
>>>>
>>>>>
>>>>> As we do not want to rely on the userspace to do the mlock this is now done in the kernel.
>>>>
>>>> I wonder if we could come up with an alternative (similar to how we
>>>> override VM_MERGEABLE in the kernel) that can be called and ensured in
>>>> the kernel. E.g., marking whole VMAs as "don't page" (I remember
>>>> something like "special VMAs" like used for VDSOs that achieve exactly
>>>> that, but I am absolutely no expert on that). That would be much nicer
>>>> than pinning all pages and remembering what you pinned in huge page
>>>> arrays ...
>>>
>>> It might be more worthwhile to just accept one or two releases with
>>> pinning and fix the root of the problem than design a nice stopgap.
>>
>> Quite honestly, to me this feels like a prototype hack that deserves a
>> proper solution first. The issue with this hack is that it affects user
>> space (esp. MADV_DONTNEED no longer working correctly). It's not just
>> something you once fix in the kernel and be done with it.
> 
> It is a hack, yes.
> But we're not the only architecture to need it x86 pins all the memory
> at the start of the VM and that code is already upstream...

IMHO that doesn't make it any better. It is and remains a prototype hack 
in my opinion.
Christian Borntraeger Nov. 4, 2019, 1:58 p.m. UTC | #11
On 04.11.19 11:19, David Hildenbrand wrote:
>>>> to synchronize page import/export with the I/O for paging. For example you can actually
>>>> fault in a page that is currently under paging I/O. What do you do? import (so that the
>>>> guest can run) or export (so that the I/O will work). As this turned out to be harder then
>>>> we though we decided to defer paging to a later point in time.
>>>
>>> I don't quite see the issue yet. If you page out, the page will
>>> automatically (on access) be converted to !secure/encrypted memory. If
>>> the UV/guest wants to access it, it will be automatically converted to
>>> secure/unencrypted memory. If you have concurrent access, it will be
>>> converted back and forth until one party is done.
>>
>> IO does not trigger an export on an imported page, but an error
>> condition in the IO subsystem. The page code does not read pages through
> 
> Ah, that makes it much clearer. Thanks!
> 
>> the cpu, but often just asks the device to read directly and that's
>> where everything goes wrong. We could bounce swapping, but chose to pin
>> for now until we find a proper solution to that problem which nicely
>> integrates into linux.
> 
> How hard would it be to
> 
> 1. Detect the error condition
> 2. Try a read on the affected page from the CPU (will will automatically convert to encrypted/!secure)
> 3. Restart the I/O
> 
> I assume that this is a corner case where we don't really have to care about performance in the first shot.

We have looked into this. You would need to implement this in the low-level
handler for every kind of I/O: DASD, FCP, PCI-based NVMe, iSCSI. Where do you want
to stop?
There is no proper global bounce buffer that works for everything. 

>>>
>>> A proper automatic conversion should make this work. What am I missing?
>>>
>>>>
>>>> As we do not want to rely on the userspace to do the mlock this is now done in the kernel.
>>>
>>> I wonder if we could come up with an alternative (similar to how we
>>> override VM_MERGEABLE in the kernel) that can be called and ensured in
>>> the kernel. E.g., marking whole VMAs as "don't page" (I remember
>>> something like "special VMAs" like used for VDSOs that achieve exactly
>>> that, but I am absolutely no expert on that). That would be much nicer
>>> than pinning all pages and remembering what you pinned in huge page
>>> arrays ...
>>
>> It might be more worthwhile to just accept one or two releases with
>> pinning and fix the root of the problem than design a nice stopgap.
> 
> Quite honestly, to me this feels like a prototype hack that deserves a proper solution first. The issue with this hack is that it affects user space (esp. MADV_DONTNEED no longer working correctly). It's not just something you once fix in the kernel and be done with it.

I disagree. Pinning is a valid initial version. I would find it strange to
allow it for AMD SEV, but not allow it for s390x. 
As far as I can tell MADV_DONTNEED continues to work within the bounds
of the specification. In fact, it does work (or does not, depending on your 
perspective :-) ) exactly in the same way as on hugetlbfs, which is also
a way of pinning.

And yes, I am in full agreement that we must work on lifting that
restriction. 


>>
>> Btw. s390 is not alone with the problem and we'll try to have another
>> discussion tomorrow with AMD to find a solution which works for more
>> than one architecture.
> 
> Let me know if there was an interesting outcome.
David Hildenbrand Nov. 4, 2019, 2:08 p.m. UTC | #12
On 04.11.19 14:58, Christian Borntraeger wrote:
> 
> 
> On 04.11.19 11:19, David Hildenbrand wrote:
>>>>> to synchronize page import/export with the I/O for paging. For example you can actually
>>>>> fault in a page that is currently under paging I/O. What do you do? import (so that the
>>>>> guest can run) or export (so that the I/O will work). As this turned out to be harder then
>>>>> we though we decided to defer paging to a later point in time.
>>>>
>>>> I don't quite see the issue yet. If you page out, the page will
>>>> automatically (on access) be converted to !secure/encrypted memory. If
>>>> the UV/guest wants to access it, it will be automatically converted to
>>>> secure/unencrypted memory. If you have concurrent access, it will be
>>>> converted back and forth until one party is done.
>>>
>>> IO does not trigger an export on an imported page, but an error
>>> condition in the IO subsystem. The page code does not read pages through
>>
>> Ah, that makes it much clearer. Thanks!
>>
>>> the cpu, but often just asks the device to read directly and that's
>>> where everything goes wrong. We could bounce swapping, but chose to pin
>>> for now until we find a proper solution to that problem which nicely
>>> integrates into linux.
>>
>> How hard would it be to
>>
>> 1. Detect the error condition
>> 2. Try a read on the affected page from the CPU (will will automatically convert to encrypted/!secure)
>> 3. Restart the I/O
>>
>> I assume that this is a corner case where we don't really have to care about performance in the first shot.
> 
> We have looked into this. You would need to implement this in the low level
> handler for every I/O. DASD, FCP, PCI based NVME, iscsi. Where do you want
> to stop?

If that's the real fix, we should do that. Maybe one can focus on the 
real use cases first. But I am no I/O expert, so my judgment might be 
completely wrong.

> There is no proper global bounce buffer that works for everything.
> 
>>>>
>>>> A proper automatic conversion should make this work. What am I missing?
>>>>
>>>>>
>>>>> As we do not want to rely on the userspace to do the mlock this is now done in the kernel.
>>>>
>>>> I wonder if we could come up with an alternative (similar to how we
>>>> override VM_MERGEABLE in the kernel) that can be called and ensured in
>>>> the kernel. E.g., marking whole VMAs as "don't page" (I remember
>>>> something like "special VMAs" like used for VDSOs that achieve exactly
>>>> that, but I am absolutely no expert on that). That would be much nicer
>>>> than pinning all pages and remembering what you pinned in huge page
>>>> arrays ...
>>>
>>> It might be more worthwhile to just accept one or two releases with
>>> pinning and fix the root of the problem than design a nice stopgap.
>>
>> Quite honestly, to me this feels like a prototype hack that deserves a proper solution first. The issue with this hack is that it affects user space (esp. MADV_DONTNEED no longer working correctly). It's not just something you once fix in the kernel and be done with it.
> 
> I disagree. Pinning is a valid initial version. I would find it strange to
> allow it for AMD SEV, but not allowing it for s390x.

"not allowing" is wrong. I don't like it, but I am not NACKing it. All I 
am saying is that this is for me a big fat "prototype" marker.

As a workaround, I would much rather see a "don't page" control 
(process/vma) than pinning every single page, if "paging" is the only 
concern. Such internal handling would not imply any real user space 
changes (unlike, as noted, the changed MADV_DONTNEED behavior).

> As far as I can tell  MADV_DONTNEED continues to work within the bounds
> of specification. In fact, it does work (or does not depending on your

There is a reason why we disallow MADV_DONTNEED in QEMU when we have 
such vfio devices (e.g., balloon, postcopy live migration, ...). It no 
longer works as specified for devices that have pinned pages. You get 
inconsistent mappings.

> perspective :-) ) exactly in the same way as on hugetlbfs,which is also
> a way of pinning.

MADV_DONTNEED is documented to not work on huge pages. That's a 
different story. You have to use FALLOC_FL_PUNCH_HOLE.
David Hildenbrand Nov. 4, 2019, 2:17 p.m. UTC | #13
On 24.10.19 13:40, Janosch Frank wrote:
> From: Claudio Imbrenda <imbrenda@linux.ibm.com>
> 
> Pin the guest pages when they are first accessed, instead of all at
> the same time when starting the guest.
> 
> Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
> ---
>   arch/s390/include/asm/gmap.h |  1 +
>   arch/s390/include/asm/uv.h   |  6 +++++
>   arch/s390/kernel/uv.c        | 20 ++++++++++++++
>   arch/s390/kvm/kvm-s390.c     |  2 ++
>   arch/s390/kvm/pv.c           | 51 ++++++++++++++++++++++++++++++------
>   5 files changed, 72 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
> index 99b3eedda26e..483f64427c0e 100644
> --- a/arch/s390/include/asm/gmap.h
> +++ b/arch/s390/include/asm/gmap.h
> @@ -63,6 +63,7 @@ struct gmap {
>   	struct gmap *parent;
>   	unsigned long orig_asce;
>   	unsigned long se_handle;
> +	struct page **pinned_pages;
>   	int edat_level;
>   	bool removed;
>   	bool initialized;
> diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h
> index 99cdd2034503..9ce9363aee1c 100644
> --- a/arch/s390/include/asm/uv.h
> +++ b/arch/s390/include/asm/uv.h
> @@ -298,6 +298,7 @@ static inline int uv_convert_from_secure(unsigned long paddr)
>   	return -EINVAL;
>   }
>   
> +int kvm_s390_pv_pin_page(struct gmap *gmap, unsigned long gpa);
>   /*
>    * Requests the Ultravisor to make a page accessible to a guest
>    * (import). If it's brought in the first time, it will be cleared. If
> @@ -317,6 +318,11 @@ static inline int uv_convert_to_secure(struct gmap *gmap, unsigned long gaddr)
>   		.gaddr = gaddr
>   	};
>   
> +	down_read(&gmap->mm->mmap_sem);
> +	cc = kvm_s390_pv_pin_page(gmap, gaddr);
> +	up_read(&gmap->mm->mmap_sem);
> +	if (cc)
> +		return cc;
>   	cc = uv_call(0, (u64)&uvcb);

So, a theoretical question: Is any in-flight I/O from paging stopped 
when we try to pin the pages? I am no expert on paging, but the comment 
from Christian ("you can actually fault in a page that is currently 
under paging I/O.") made me wonder.

Let's assume you could have parallel I/O on that page. You would do a 
uv_call() and convert the page to secure/unencrypted. The I/O can easily 
stumble over that (now inaccessible) page and report an error.

Or is any such race not possible because we are using 
get_user_pages_fast() vs. get_user_pages()? (then, I'd love to see a 
comment regarding that in the patch description)
David Hildenbrand Nov. 4, 2019, 2:42 p.m. UTC | #14
On 04.11.19 15:08, David Hildenbrand wrote:
> On 04.11.19 14:58, Christian Borntraeger wrote:
>>
>>
>> On 04.11.19 11:19, David Hildenbrand wrote:
>>>>>> to synchronize page import/export with the I/O for paging. For example you can actually
>>>>>> fault in a page that is currently under paging I/O. What do you do? import (so that the
>>>>>> guest can run) or export (so that the I/O will work). As this turned out to be harder then
>>>>>> we though we decided to defer paging to a later point in time.
>>>>>
>>>>> I don't quite see the issue yet. If you page out, the page will
>>>>> automatically (on access) be converted to !secure/encrypted memory. If
>>>>> the UV/guest wants to access it, it will be automatically converted to
>>>>> secure/unencrypted memory. If you have concurrent access, it will be
>>>>> converted back and forth until one party is done.
>>>>
>>>> IO does not trigger an export on an imported page, but an error
>>>> condition in the IO subsystem. The page code does not read pages through
>>>
>>> Ah, that makes it much clearer. Thanks!
>>>
>>>> the cpu, but often just asks the device to read directly and that's
>>>> where everything goes wrong. We could bounce swapping, but chose to pin
>>>> for now until we find a proper solution to that problem which nicely
>>>> integrates into linux.
>>>
>>> How hard would it be to
>>>
>>> 1. Detect the error condition
>>> 2. Try a read on the affected page from the CPU (will will automatically convert to encrypted/!secure)
>>> 3. Restart the I/O
>>>
>>> I assume that this is a corner case where we don't really have to care about performance in the first shot.
>>
>> We have looked into this. You would need to implement this in the low level
>> handler for every I/O. DASD, FCP, PCI based NVME, iscsi. Where do you want
>> to stop?
> 
> If that's the real fix, we should do that. Maybe one can focus on the
> real use cases first. But I am no I/O expert, so my judgment might be
> completely wrong.
> 

Oh, and by the way, as discussed you really only have to care about 
accesses via "real" I/O devices (IOW, not via the CPU). When accessing 
via the CPU, you should have automatic conversion back and forth. As I 
am no expert on I/O, I have no idea how iSCSI fits into this picture 
here (especially on s390x).
Cornelia Huck Nov. 4, 2019, 5:17 p.m. UTC | #15
On Mon, 4 Nov 2019 15:42:11 +0100
David Hildenbrand <david@redhat.com> wrote:

> On 04.11.19 15:08, David Hildenbrand wrote:
> > On 04.11.19 14:58, Christian Borntraeger wrote:  
> >>
> >>
> >> On 04.11.19 11:19, David Hildenbrand wrote:  
> >>>>>> to synchronize page import/export with the I/O for paging. For example you can actually
> >>>>>> fault in a page that is currently under paging I/O. What do you do? import (so that the
> >>>>>> guest can run) or export (so that the I/O will work). As this turned out to be harder then
> >>>>>> we though we decided to defer paging to a later point in time.  
> >>>>>
> >>>>> I don't quite see the issue yet. If you page out, the page will
> >>>>> automatically (on access) be converted to !secure/encrypted memory. If
> >>>>> the UV/guest wants to access it, it will be automatically converted to
> >>>>> secure/unencrypted memory. If you have concurrent access, it will be
> >>>>> converted back and forth until one party is done.  
> >>>>
> >>>> IO does not trigger an export on an imported page, but an error
> >>>> condition in the IO subsystem. The page code does not read pages through  
> >>>
> >>> Ah, that makes it much clearer. Thanks!
> >>>  
> >>>> the cpu, but often just asks the device to read directly and that's
> >>>> where everything goes wrong. We could bounce swapping, but chose to pin
> >>>> for now until we find a proper solution to that problem which nicely
> >>>> integrates into linux.  
> >>>
> >>> How hard would it be to
> >>>
> >>> 1. Detect the error condition
> >>> 2. Try a read on the affected page from the CPU (will will automatically convert to encrypted/!secure)
> >>> 3. Restart the I/O
> >>>
> >>> I assume that this is a corner case where we don't really have to care about performance in the first shot.  
> >>
> >> We have looked into this. You would need to implement this in the low level
> >> handler for every I/O. DASD, FCP, PCI based NVME, iscsi. Where do you want
> >> to stop?  
> > 
> > If that's the real fix, we should do that. Maybe one can focus on the
> > real use cases first. But I am no I/O expert, so my judgment might be
> > completely wrong.
> >   
> 
> Oh, and by the way, as discussed you really only have to care about 
> accesses via "real" I/O devices (IOW, not via the CPU). When accessing 
> via the CPU, you should have automatic conversion back and forth. As I 
> am no expert on I/O, I have no idea how iscsi fits into this picture 
> here (especially on s390x).
> 

By "real" I/O devices, you mean things like channel devices, right? (So
everything where you basically hand off control to a different kind of
processor.)

For classic channel I/O (as used by dasd), I'd expect something like
getting a check condition on a ccw if the CU or device cannot access
the memory. You will know how far the channel program has progressed,
and might be able to restart (from the beginning or from that point).
Probably has a chance of working for a subset of channel programs.

For QDIO (as used by FCP), I have no idea how this is could work, as we
have long-running channel programs there and any error basically kills
the queues, which you would have to re-setup from the beginning.

For PCI devices, I have no idea how the instructions even act.

From my point of view, that error/restart approach looks nice on paper,
but it seems hard to make it work in the general case (and I'm unsure
if it's possible at all).
David Hildenbrand Nov. 4, 2019, 5:44 p.m. UTC | #16
On 04.11.19 18:17, Cornelia Huck wrote:
> On Mon, 4 Nov 2019 15:42:11 +0100
> David Hildenbrand <david@redhat.com> wrote:
> 
>> On 04.11.19 15:08, David Hildenbrand wrote:
>>> On 04.11.19 14:58, Christian Borntraeger wrote:
>>>>
>>>>
>>>> On 04.11.19 11:19, David Hildenbrand wrote:
>>>>>>>> to synchronize page import/export with the I/O for paging. For example you can actually
>>>>>>>> fault in a page that is currently under paging I/O. What do you do? import (so that the
>>>>>>>> guest can run) or export (so that the I/O will work). As this turned out to be harder then
>>>>>>>> we though we decided to defer paging to a later point in time.
>>>>>>>
>>>>>>> I don't quite see the issue yet. If you page out, the page will
>>>>>>> automatically (on access) be converted to !secure/encrypted memory. If
>>>>>>> the UV/guest wants to access it, it will be automatically converted to
>>>>>>> secure/unencrypted memory. If you have concurrent access, it will be
>>>>>>> converted back and forth until one party is done.
>>>>>>
>>>>>> IO does not trigger an export on an imported page, but an error
>>>>>> condition in the IO subsystem. The page code does not read pages through
>>>>>
>>>>> Ah, that makes it much clearer. Thanks!
>>>>>   
>>>>>> the cpu, but often just asks the device to read directly and that's
>>>>>> where everything goes wrong. We could bounce swapping, but chose to pin
>>>>>> for now until we find a proper solution to that problem which nicely
>>>>>> integrates into linux.
>>>>>
>>>>> How hard would it be to
>>>>>
>>>>> 1. Detect the error condition
>>>>> 2. Try a read on the affected page from the CPU (will will automatically convert to encrypted/!secure)
>>>>> 3. Restart the I/O
>>>>>
>>>>> I assume that this is a corner case where we don't really have to care about performance in the first shot.
>>>>
>>>> We have looked into this. You would need to implement this in the low level
>>>> handler for every I/O. DASD, FCP, PCI based NVME, iscsi. Where do you want
>>>> to stop?
>>>
>>> If that's the real fix, we should do that. Maybe one can focus on the
>>> real use cases first. But I am no I/O expert, so my judgment might be
>>> completely wrong.
>>>    
>>
>> Oh, and by the way, as discussed you really only have to care about
>> accesses via "real" I/O devices (IOW, not via the CPU). When accessing
>> via the CPU, you should have automatic conversion back and forth. As I
>> am no expert on I/O, I have no idea how iscsi fits into this picture
>> here (especially on s390x).
>>
> 
> By "real" I/O devices, you mean things like channel devices, right? (So
> everything where you basically hand off control to a different kind of
> processor.)

Exactly.

> 
> For classic channel I/O (as used by dasd), I'd expect something like
> getting a check condition on a ccw if the CU or device cannot access
> the memory. You will know how far the channel program has progressed,
> and might be able to restart (from the beginning or from that point).
> Probably has a chance of working for a subset of channel programs.

Yeah, sound sane to me.

> 
> For QDIO (as used by FCP), I have no idea how this is could work, as we
> have long-running channel programs there and any error basically kills
> the queues, which you would have to re-setup from the beginning.
> 
> For PCI devices, I have no idea how the instructions even act.
> 
>  From my point of view, that error/restart approach looks nice on paper,
> but it seems hard to make it work in the general case (and I'm unsure
> if it's possible at all.)

Then I'm afraid whoever designed protected virtualization didn't 
properly consider concurrent I/O access to encrypted pages. It might not 
be easy to sort out, though, so I understand why the I/O part was 
designed that way :)

I was wondering if one could implement some kind of automatic conversion 
"back and forth" on I/O access (or even on any access within the HW). I 
mean, "basically" it's just encrypting/decrypting the page and updating 
the state by the UV (+ synchronization, lol). But yeah, the UV is 
involved, and would be triggered somehow via I/O access to these pages.
Right now that conversion is performed via exceptions by the OS 
explicitly. Instead of passing exceptions, the UV could convert 
automatically. Smells like massive HW changes, if possible and desired 
at all.

I do wonder what would happen if you back your guest memory not with 
anonymous memory but with, e.g., a file. Could be that this eliminates all 
options besides pinning and fixing I/O, because we're talking about 
writeback and not paging.

HOWEVER, reading https://lwn.net/Articles/787636/

"Kara talked mostly about the writeback code; in some cases, it will 
simply skip pages that are pinned. But there are cases where those pages 
must be written out — "somebody has called fsync(), and they expect 
something to be saved". In this case, pinned pages will be written, but 
they will not be marked clean at the end of the operation; they will 
still be write-protected in the page tables while writeback is underway, 
though."

So, sounds like you will get concurrent I/O access even without paging 
... and that would leave fixing I/O as the only option with the current HW 
design AFAIKS :/
David Hildenbrand Nov. 4, 2019, 6:38 p.m. UTC | #17
On 04.11.19 18:17, Cornelia Huck wrote:
> On Mon, 4 Nov 2019 15:42:11 +0100
> David Hildenbrand <david@redhat.com> wrote:
> 
>> On 04.11.19 15:08, David Hildenbrand wrote:
>>> On 04.11.19 14:58, Christian Borntraeger wrote:
>>>>
>>>>
>>>> On 04.11.19 11:19, David Hildenbrand wrote:
>>>>>>>> to synchronize page import/export with the I/O for paging. For example you can actually
>>>>>>>> fault in a page that is currently under paging I/O. What do you do? import (so that the
>>>>>>>> guest can run) or export (so that the I/O will work). As this turned out to be harder then
>>>>>>>> we though we decided to defer paging to a later point in time.
>>>>>>>
>>>>>>> I don't quite see the issue yet. If you page out, the page will
>>>>>>> automatically (on access) be converted to !secure/encrypted memory. If
>>>>>>> the UV/guest wants to access it, it will be automatically converted to
>>>>>>> secure/unencrypted memory. If you have concurrent access, it will be
>>>>>>> converted back and forth until one party is done.
>>>>>>
>>>>>> IO does not trigger an export on an imported page, but an error
>>>>>> condition in the IO subsystem. The page code does not read pages through
>>>>>
>>>>> Ah, that makes it much clearer. Thanks!
>>>>>   
>>>>>> the cpu, but often just asks the device to read directly and that's
>>>>>> where everything goes wrong. We could bounce swapping, but chose to pin
>>>>>> for now until we find a proper solution to that problem which nicely
>>>>>> integrates into linux.
>>>>>
>>>>> How hard would it be to
>>>>>
>>>>> 1. Detect the error condition
> >>>>> 2. Try a read on the affected page from the CPU (which will automatically convert it to encrypted/!secure)
>>>>> 3. Restart the I/O
>>>>>
>>>>> I assume that this is a corner case where we don't really have to care about performance in the first shot.
>>>>
>>>> We have looked into this. You would need to implement this in the low level
>>>> handler for every I/O. DASD, FCP, PCI based NVME, iscsi. Where do you want
>>>> to stop?
>>>
>>> If that's the real fix, we should do that. Maybe one can focus on the
>>> real use cases first. But I am no I/O expert, so my judgment might be
>>> completely wrong.
>>>    
>>
>> Oh, and by the way, as discussed you really only have to care about
>> accesses via "real" I/O devices (IOW, not via the CPU). When accessing
>> via the CPU, you should have automatic conversion back and forth. As I
>> am no expert on I/O, I have no idea how iscsi fits into this picture
>> here (especially on s390x).
>>
> 
> By "real" I/O devices, you mean things like channel devices, right? (So
> everything where you basically hand off control to a different kind of
> processor.)
> 
> For classic channel I/O (as used by dasd), I'd expect something like
> getting a check condition on a ccw if the CU or device cannot access
> the memory. You will know how far the channel program has progressed,
> and might be able to restart (from the beginning or from that point).
> Probably has a chance of working for a subset of channel programs.
> 
> > For QDIO (as used by FCP), I have no idea how this could work, as we
> have long-running channel programs there and any error basically kills
> the queues, which you would have to re-setup from the beginning.
> 
> For PCI devices, I have no idea how the instructions even act.
> 
>  From my point of view, that error/restart approach looks nice on paper,
> but it seems hard to make it work in the general case (and I'm unsure
> if it's possible at all.)

One thought: If all we do during an I/O request is read or write (or 
even a mixture), can we simply restart the whole I/O, even though we 
already did partial reads/writes? This would eliminate the need to know 
how far the channel program has progressed. On error, one would have to 
touch each involved page (e.g., try to read the first byte to trigger a 
conversion) and restart the I/O. I can understand that this might sound 
simpler than it is (if it is even possible) and might still be 
problematic for QDIO as far as I understand. Just a thought.
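
A rough sketch of what that touching could look like (illustrative 
only, not part of this series; afterwards the driver would simply 
re-submit the whole request):

static void touch_io_pages(void *start, unsigned long len)
{
	unsigned long addr = (unsigned long)start & PAGE_MASK;
	unsigned long end  = (unsigned long)start + len;

	/*
	 * A CPU read of a page the channel subsystem could not access
	 * triggers the conversion back to non-secure via the usual
	 * exception path, after which the I/O can be restarted.
	 */
	for (; addr < end; addr += PAGE_SIZE)
		READ_ONCE(*(u8 *)addr);
}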
Cornelia Huck Nov. 5, 2019, 9:15 a.m. UTC | #18
On Mon, 4 Nov 2019 19:38:27 +0100
David Hildenbrand <david@redhat.com> wrote:

> On 04.11.19 18:17, Cornelia Huck wrote:
> > On Mon, 4 Nov 2019 15:42:11 +0100
> > David Hildenbrand <david@redhat.com> wrote:
> >   
> >> On 04.11.19 15:08, David Hildenbrand wrote:  
> >>> On 04.11.19 14:58, Christian Borntraeger wrote:  

> >>>>> How hard would it be to
> >>>>>
> >>>>> 1. Detect the error condition
> >>>>> 2. Try a read on the affected page from the CPU (which will automatically convert it to encrypted/!secure)  
> >>>>> 3. Restart the I/O
> >>>>>
> >>>>> I assume that this is a corner case where we don't really have to care about performance in the first shot.  
> >>>>
> >>>> We have looked into this. You would need to implement this in the low level
> >>>> handler for every I/O. DASD, FCP, PCI based NVME, iscsi. Where do you want
> >>>> to stop?  
> >>>
> >>> If that's the real fix, we should do that. Maybe one can focus on the
> >>> real use cases first. But I am no I/O expert, so my judgment might be
> >>> completely wrong.
> >>>      
> >>
> >> Oh, and by the way, as discussed you really only have to care about
> >> accesses via "real" I/O devices (IOW, not via the CPU). When accessing
> >> via the CPU, you should have automatic conversion back and forth. As I
> >> am no expert on I/O, I have no idea how iscsi fits into this picture
> >> here (especially on s390x).
> >>  
> > 
> > By "real" I/O devices, you mean things like channel devices, right? (So
> > everything where you basically hand off control to a different kind of
> > processor.)
> > 
> > For classic channel I/O (as used by dasd), I'd expect something like
> > getting a check condition on a ccw if the CU or device cannot access
> > the memory. You will know how far the channel program has progressed,
> > and might be able to restart (from the beginning or from that point).
> > Probably has a chance of working for a subset of channel programs.

NB that there's more than simple reads/writes... there could also be 
control commands, some of which do reads/writes as well.

> > 
> > For QDIO (as used by FCP), I have no idea how this could work, as we
> > have long-running channel programs there and any error basically kills
> > the queues, which you would have to re-setup from the beginning.
> > 
> > For PCI devices, I have no idea how the instructions even act.
> > 
> >  From my point of view, that error/restart approach looks nice on paper,
> > but it seems hard to make it work in the general case (and I'm unsure
> > if it's possible at all.)  
> 
> One thought: If all we do during an I/O request is read or write (or 
> even a mixture), can we simply restart the whole I/O, even though we 
> already did partial reads/writes? This would eliminate the need to know 
> how far the channel program has progressed. On error, one would have to 
> touch each involved page (e.g., try to read the first byte to trigger a 
> conversion) and restart the I/O. I can understand that this might sound 
> simpler than it is (if it is even possible)

Control commands might have side effects, though. Problems there 
should be uncommon, but there's still the _general_ case :(

Also, there's stuff like rewriting the channel program w/o prefetch,
jumping with TIC, etc. Linux probably does not do the former, but at
least the dasd driver uses NOP/TIC for error recovery.
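
For reference, a sketch of what such a NOP/TIC pair looks like (struct 
ccw1 and the CCW_* constants are from <asm/cio.h>; the helper itself is 
made up and glosses over how it gets linked into the real channel 
program):

static void build_nop_tic(struct ccw1 cp[2], struct ccw1 *resume_at)
{
	cp[0].cmd_code = CCW_CMD_NOOP;			/* does nothing */
	cp[0].flags    = CCW_FLAG_CC | CCW_FLAG_SLI;	/* chain on to the TIC */
	cp[0].count    = 1;				/* kept non-zero to be safe */
	cp[0].cda      = (__u32)(unsigned long)cp;	/* no data transferred */
	cp[1].cmd_code = CCW_CMD_TIC;			/* "jump" ... */
	cp[1].flags    = 0;
	cp[1].count    = 0;
	cp[1].cda      = (__u32)(unsigned long)resume_at; /* ... back into the chain */
}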

> and might still be problematic for QDIO as 
> far as I understand. Just a thought.

Yes, given that for QDIO, establishing the queues is simply one
long-running channel program...

Patch

diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index 99b3eedda26e..483f64427c0e 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -63,6 +63,7 @@  struct gmap {
 	struct gmap *parent;
 	unsigned long orig_asce;
 	unsigned long se_handle;
+	struct page **pinned_pages;
 	int edat_level;
 	bool removed;
 	bool initialized;
diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h
index 99cdd2034503..9ce9363aee1c 100644
--- a/arch/s390/include/asm/uv.h
+++ b/arch/s390/include/asm/uv.h
@@ -298,6 +298,7 @@  static inline int uv_convert_from_secure(unsigned long paddr)
 	return -EINVAL;
 }
 
+int kvm_s390_pv_pin_page(struct gmap *gmap, unsigned long gpa);
 /*
  * Requests the Ultravisor to make a page accessible to a guest
  * (import). If it's brought in the first time, it will be cleared. If
@@ -317,6 +318,11 @@  static inline int uv_convert_to_secure(struct gmap *gmap, unsigned long gaddr)
 		.gaddr = gaddr
 	};
 
+	down_read(&gmap->mm->mmap_sem);
+	cc = kvm_s390_pv_pin_page(gmap, gaddr);
+	up_read(&gmap->mm->mmap_sem);
+	if (cc)
+		return cc;
 	cc = uv_call(0, (u64)&uvcb);
 
 	if (!cc)
diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
index f7778493e829..36554402b5c6 100644
--- a/arch/s390/kernel/uv.c
+++ b/arch/s390/kernel/uv.c
@@ -98,4 +98,24 @@  void adjust_to_uv_max(unsigned long *vmax)
 	if (prot_virt_host && *vmax > uv_info.max_sec_stor_addr)
 		*vmax = uv_info.max_sec_stor_addr;
 }
+
+int kvm_s390_pv_pin_page(struct gmap *gmap, unsigned long gpa)
+{
+	unsigned long hva, gfn = gpa / PAGE_SIZE;
+	int rc;
+
+	if (!gmap->pinned_pages)
+		return -EINVAL;
+	hva = __gmap_translate(gmap, gpa);
+	if (IS_ERR_VALUE(hva))
+		return -EFAULT;
+	if (gmap->pinned_pages[gfn])
+		return -EEXIST;
+	rc = get_user_pages_fast(hva, 1, FOLL_WRITE, gmap->pinned_pages + gfn);
+	if (rc < 0)
+		return rc;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_s390_pv_pin_page);
+
 #endif
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index d1ba12f857e7..490fde080107 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2196,6 +2196,7 @@  static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
 		/* All VCPUs have to be destroyed before this call. */
 		mutex_lock(&kvm->lock);
 		kvm_s390_vcpu_block_all(kvm);
+		kvm_s390_pv_unpin(kvm);
 		r = kvm_s390_pv_destroy_vm(kvm);
 		if (!r)
 			kvm_s390_pv_dealloc_vm(kvm);
@@ -2680,6 +2681,7 @@  void kvm_arch_destroy_vm(struct kvm *kvm)
 	kvm_s390_gisa_destroy(kvm);
 	if (IS_ENABLED(CONFIG_KVM_S390_PROTECTED_VIRTUALIZATION_HOST) &&
 	    kvm_s390_pv_is_protected(kvm)) {
+		kvm_s390_pv_unpin(kvm);
 		kvm_s390_pv_destroy_vm(kvm);
 		kvm_s390_pv_dealloc_vm(kvm);
 	}
diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
index 80aecd5bea9e..383e660e2221 100644
--- a/arch/s390/kvm/pv.c
+++ b/arch/s390/kvm/pv.c
@@ -15,8 +15,35 @@ 
 #include <asm/mman.h>
 #include "kvm-s390.h"
 
+static void unpin_destroy(struct page **pages, int nr)
+{
+	int i;
+	struct page *page;
+	u8 *val;
+
+	for (i = 0; i < nr; i++) {
+		page = pages[i];
+		if (!page)	/* page was never used */
+			continue;
+		val = (void *)page_to_phys(page);
+		READ_ONCE(*val);
+		put_page(page);
+	}
+}
+
+void kvm_s390_pv_unpin(struct kvm *kvm)
+{
+	unsigned long npages = kvm->arch.pv.guest_len / PAGE_SIZE;
+
+	mutex_lock(&kvm->slots_lock);
+	unpin_destroy(kvm->arch.gmap->pinned_pages, npages);
+	mutex_unlock(&kvm->slots_lock);
+}
+
 void kvm_s390_pv_dealloc_vm(struct kvm *kvm)
 {
+	vfree(kvm->arch.gmap->pinned_pages);
+	kvm->arch.gmap->pinned_pages = NULL;
 	vfree(kvm->arch.pv.stor_var);
 	free_pages(kvm->arch.pv.stor_base,
 		   get_order(uv_info.guest_base_stor_len));
@@ -28,7 +55,6 @@  int kvm_s390_pv_alloc_vm(struct kvm *kvm)
 	unsigned long base = uv_info.guest_base_stor_len;
 	unsigned long virt = uv_info.guest_virt_var_stor_len;
 	unsigned long npages = 0, vlen = 0;
-	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
 
 	kvm->arch.pv.stor_var = NULL;
@@ -43,22 +69,26 @@  int kvm_s390_pv_alloc_vm(struct kvm *kvm)
 	 * Slots are sorted by GFN
 	 */
 	mutex_lock(&kvm->slots_lock);
-	slots = kvm_memslots(kvm);
-	memslot = slots->memslots;
+	memslot = kvm_memslots(kvm)->memslots;
 	npages = memslot->base_gfn + memslot->npages;
-
 	mutex_unlock(&kvm->slots_lock);
+
+	kvm->arch.gmap->pinned_pages = vzalloc(npages * sizeof(struct page *));
+	if (!kvm->arch.gmap->pinned_pages)
+		goto out_err;
 	kvm->arch.pv.guest_len = npages * PAGE_SIZE;
 
 	/* Allocate variable storage */
 	vlen = ALIGN(virt * ((npages * PAGE_SIZE) / HPAGE_SIZE), PAGE_SIZE);
 	vlen += uv_info.guest_virt_base_stor_len;
 	kvm->arch.pv.stor_var = vzalloc(vlen);
-	if (!kvm->arch.pv.stor_var) {
-		kvm_s390_pv_dealloc_vm(kvm);
-		return -ENOMEM;
-	}
+	if (!kvm->arch.pv.stor_var)
+		goto out_err;
 	return 0;
+
+out_err:
+	kvm_s390_pv_dealloc_vm(kvm);
+	return -ENOMEM;
 }
 
 int kvm_s390_pv_destroy_vm(struct kvm *kvm)
@@ -216,6 +246,11 @@  int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size,
 	for (i = 0; i < size / PAGE_SIZE; i++) {
 		uvcb.gaddr = addr + i * PAGE_SIZE;
 		uvcb.tweak[1] = i * PAGE_SIZE;
+		down_read(&kvm->mm->mmap_sem);
+		rc = kvm_s390_pv_pin_page(kvm->arch.gmap, uvcb.gaddr);
+		up_read(&kvm->mm->mmap_sem);
+		if (rc && (rc != -EEXIST))
+			break;
 retry:
 		rc = uv_call(0, (u64)&uvcb);
 		if (!rc)