
[v4.5,09/36] KVM: s390: protvirt: Add initial vm and cpu lifecycle handling

Message ID 20200225214822.3611-1-borntraeger@de.ibm.com (mailing list archive)
State New, archived

Commit Message

Christian Borntraeger Feb. 25, 2020, 9:48 p.m. UTC
From: Janosch Frank <frankja@linux.ibm.com>

This contains 3 main changes:
1. changes in SIE control block handling for secure guests
2. helper functions for create/destroy/unpack secure guests
3. KVM_S390_PV_COMMAND ioctl to allow userspace dealing with secure
machines

Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
[borntraeger@de.ibm.com: patch merging, splitting, fixing]
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/kvm_host.h |  24 ++-
 arch/s390/include/asm/uv.h       |  69 ++++++++
 arch/s390/kvm/Makefile           |   2 +-
 arch/s390/kvm/kvm-s390.c         | 209 +++++++++++++++++++++++-
 arch/s390/kvm/kvm-s390.h         |  33 ++++
 arch/s390/kvm/pv.c               | 269 +++++++++++++++++++++++++++++++
 include/uapi/linux/kvm.h         |  31 ++++
 7 files changed, 633 insertions(+), 4 deletions(-)
 create mode 100644 arch/s390/kvm/pv.c

Comments

David Hildenbrand Feb. 25, 2020, 10:37 p.m. UTC | #1
> +static int kvm_s390_pv_alloc_vm(struct kvm *kvm)
> +{
> +	unsigned long base = uv_info.guest_base_stor_len;
> +	unsigned long virt = uv_info.guest_virt_var_stor_len;
> +	unsigned long npages = 0, vlen = 0;
> +	struct kvm_memory_slot *memslot;
> +
> +	kvm->arch.pv.stor_var = NULL;
> +	kvm->arch.pv.stor_base = __get_free_pages(GFP_KERNEL, get_order(base));
> +	if (!kvm->arch.pv.stor_base)
> +		return -ENOMEM;
> +
> +	/*
> +	 * Calculate current guest storage for allocation of the
> +	 * variable storage, which is based on the length in MB.
> +	 *
> +	 * Slots are sorted by GFN
> +	 */
> +	mutex_lock(&kvm->slots_lock);
> +	memslot = kvm_memslots(kvm)->memslots;
> +	npages = memslot->base_gfn + memslot->npages;
> +	mutex_unlock(&kvm->slots_lock);

As discussed, I think we should just use mem_limit and check against
some hardcoded upper limit. But yeah, we can do that as an addon (in
which case memory hotplug will require special tweaks to detect this
from user space ... e.g., a new capability)
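
For illustration, a minimal sketch of what that alternative could look like
(the cap and its name are made up here, not part of this patch):

	/* hypothetical: bound the UV donation by the configured memory limit */
	if (kvm->arch.mem_limit > HYPOTHETICAL_PV_MAX_GUEST_SIZE)
		return -EINVAL;
	kvm->arch.pv.guest_len = kvm->arch.mem_limit;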


[...]

> +int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
> +{
> +		struct uv_cb_cgc uvcb = {
> +		.header.cmd = UVC_CMD_CREATE_SEC_CONF,
> +		.header.len = sizeof(uvcb)
> +	};
> +	int cc, ret;
> +	u16 dummy;
> +
> +	ret = kvm_s390_pv_alloc_vm(kvm);
> +	if (ret)
> +		return ret;
> +
> +	/* Inputs */
> +	uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */
> +	uvcb.guest_stor_len = kvm->arch.pv.guest_len;
> +	uvcb.guest_asce = kvm->arch.gmap->asce;
> +	uvcb.guest_sca = (unsigned long)kvm->arch.sca;
> +	uvcb.conf_base_stor_origin = (u64)kvm->arch.pv.stor_base;
> +	uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var;
> +
> +	cc = uv_call(0, (u64)&uvcb);
> +	*rc = uvcb.header.rc;
> +	*rrc = uvcb.header.rrc;
> +	KVM_UV_EVENT(kvm, 3, "PROTVIRT CREATE VM: handle %llx len %llx rc %x rrc %x",
> +		     uvcb.guest_handle, uvcb.guest_stor_len, *rc, *rrc);
> +
> +	/* Outputs */
> +	kvm->arch.pv.handle = uvcb.guest_handle;
> +
> +	if (cc) {
> +		if (uvcb.header.rc & UVC_RC_NEED_DESTROY)
> +			kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
> +		else
> +			kvm_s390_pv_dealloc_vm(kvm);
> +		return -EIO;

A lot easier to read :)


Feel free to add my rb with or without the mem_limit change.

Reviewed-by: David Hildenbrand <david@redhat.com>
Christian Borntraeger Feb. 26, 2020, 8:12 a.m. UTC | #2
On 25.02.20 23:37, David Hildenbrand wrote:
> 
>> +static int kvm_s390_pv_alloc_vm(struct kvm *kvm)
>> +{
>> +	unsigned long base = uv_info.guest_base_stor_len;
>> +	unsigned long virt = uv_info.guest_virt_var_stor_len;
>> +	unsigned long npages = 0, vlen = 0;
>> +	struct kvm_memory_slot *memslot;
>> +
>> +	kvm->arch.pv.stor_var = NULL;
>> +	kvm->arch.pv.stor_base = __get_free_pages(GFP_KERNEL, get_order(base));
>> +	if (!kvm->arch.pv.stor_base)
>> +		return -ENOMEM;
>> +
>> +	/*
>> +	 * Calculate current guest storage for allocation of the
>> +	 * variable storage, which is based on the length in MB.
>> +	 *
>> +	 * Slots are sorted by GFN
>> +	 */
>> +	mutex_lock(&kvm->slots_lock);
>> +	memslot = kvm_memslots(kvm)->memslots;
>> +	npages = memslot->base_gfn + memslot->npages;
>> +	mutex_unlock(&kvm->slots_lock);
> 
> As discussed, I think we should just use mem_limit and check against
> some hardcoded upper limit. But yeah, we can do that as an addon (in
> which case memory hotplug will require special tweaks to detect this
> from user space ... e.g., a new capability)
> 
> 
> [...]
> 
>> +int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
>> +{
>> +		struct uv_cb_cgc uvcb = {
>> +		.header.cmd = UVC_CMD_CREATE_SEC_CONF,
>> +		.header.len = sizeof(uvcb)
>> +	};
>> +	int cc, ret;
>> +	u16 dummy;
>> +
>> +	ret = kvm_s390_pv_alloc_vm(kvm);
>> +	if (ret)
>> +		return ret;
>> +
>> +	/* Inputs */
>> +	uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */
>> +	uvcb.guest_stor_len = kvm->arch.pv.guest_len;
>> +	uvcb.guest_asce = kvm->arch.gmap->asce;
>> +	uvcb.guest_sca = (unsigned long)kvm->arch.sca;
>> +	uvcb.conf_base_stor_origin = (u64)kvm->arch.pv.stor_base;
>> +	uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var;
>> +
>> +	cc = uv_call(0, (u64)&uvcb);
>> +	*rc = uvcb.header.rc;
>> +	*rrc = uvcb.header.rrc;
>> +	KVM_UV_EVENT(kvm, 3, "PROTVIRT CREATE VM: handle %llx len %llx rc %x rrc %x",
>> +		     uvcb.guest_handle, uvcb.guest_stor_len, *rc, *rrc);
>> +
>> +	/* Outputs */
>> +	kvm->arch.pv.handle = uvcb.guest_handle;
>> +
>> +	if (cc) {
>> +		if (uvcb.header.rc & UVC_RC_NEED_DESTROY)
>> +			kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
>> +		else
>> +			kvm_s390_pv_dealloc_vm(kvm);
>> +		return -EIO;
> 
> A lot easier to read :)
> 
> 
> Feel free to add my rb with or without the mem_limit change.

I think I will keep the memslot logic. For hotplug we actually need
to notify the ultravisor that the guest size has changed as only the
ultravisor will be able to inject an addressing exception.

Let's keep it simple for now.

> 
> Reviewed-by: David Hildenbrand <david@redhat.com>

thanks for all the good review. Code looks much better now :-)
David Hildenbrand Feb. 26, 2020, 8:28 a.m. UTC | #3
On 26.02.20 09:12, Christian Borntraeger wrote:
> 
> 
> On 25.02.20 23:37, David Hildenbrand wrote:
>>
>>> +static int kvm_s390_pv_alloc_vm(struct kvm *kvm)
>>> +{
>>> +	unsigned long base = uv_info.guest_base_stor_len;
>>> +	unsigned long virt = uv_info.guest_virt_var_stor_len;
>>> +	unsigned long npages = 0, vlen = 0;
>>> +	struct kvm_memory_slot *memslot;
>>> +
>>> +	kvm->arch.pv.stor_var = NULL;
>>> +	kvm->arch.pv.stor_base = __get_free_pages(GFP_KERNEL, get_order(base));
>>> +	if (!kvm->arch.pv.stor_base)
>>> +		return -ENOMEM;
>>> +
>>> +	/*
>>> +	 * Calculate current guest storage for allocation of the
>>> +	 * variable storage, which is based on the length in MB.
>>> +	 *
>>> +	 * Slots are sorted by GFN
>>> +	 */
>>> +	mutex_lock(&kvm->slots_lock);
>>> +	memslot = kvm_memslots(kvm)->memslots;
>>> +	npages = memslot->base_gfn + memslot->npages;
>>> +	mutex_unlock(&kvm->slots_lock);
>>
>> As discussed, I think we should just use mem_limit and check against
>> some hardcoded upper limit. But yeah, we can do that as an addon (in
>> which case memory hotplug will require special tweaks to detect this
>> from user space ... e.g., a new capability)
>>
>>
>> [...]
>>
>>> +int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
>>> +{
>>> +		struct uv_cb_cgc uvcb = {
>>> +		.header.cmd = UVC_CMD_CREATE_SEC_CONF,
>>> +		.header.len = sizeof(uvcb)
>>> +	};
>>> +	int cc, ret;
>>> +	u16 dummy;
>>> +
>>> +	ret = kvm_s390_pv_alloc_vm(kvm);
>>> +	if (ret)
>>> +		return ret;
>>> +
>>> +	/* Inputs */
>>> +	uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */
>>> +	uvcb.guest_stor_len = kvm->arch.pv.guest_len;
>>> +	uvcb.guest_asce = kvm->arch.gmap->asce;
>>> +	uvcb.guest_sca = (unsigned long)kvm->arch.sca;
>>> +	uvcb.conf_base_stor_origin = (u64)kvm->arch.pv.stor_base;
>>> +	uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var;
>>> +
>>> +	cc = uv_call(0, (u64)&uvcb);
>>> +	*rc = uvcb.header.rc;
>>> +	*rrc = uvcb.header.rrc;
>>> +	KVM_UV_EVENT(kvm, 3, "PROTVIRT CREATE VM: handle %llx len %llx rc %x rrc %x",
>>> +		     uvcb.guest_handle, uvcb.guest_stor_len, *rc, *rrc);
>>> +
>>> +	/* Outputs */
>>> +	kvm->arch.pv.handle = uvcb.guest_handle;
>>> +
>>> +	if (cc) {
>>> +		if (uvcb.header.rc & UVC_RC_NEED_DESTROY)
>>> +			kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
>>> +		else
>>> +			kvm_s390_pv_dealloc_vm(kvm);
>>> +		return -EIO;
>>
>> A lot easier to read :)
>>
>>
>> Feel free to add my rb with or without the mem_limit change.
> 
> I think I will keep the memslot logic. For hotplug we actually need
> to notify the ultravisor that the guest size has changed as only the
> ultravisor will be able to inject an addressing exception.

So holes in between memory slots won't be properly considered? What will
happen if a guest accesses such memory right now?

> 
> Let's keep it simple for now.
> 

I double checked (virt/kvm/kvm_main.c:update_memslots()), and the slots
are definitely sorted "based on their GFN". I think, it's lowest GFN
first, so the code in here would be wrong with more than one slot.

Can you double check, because I might misinterpret the code.
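
To make the concern concrete: a variant that does not depend on the sort
order could walk all slots instead (just a sketch of the idea, not what the
patch does):

	/* sketch: highest guest page frame end across all memslots */
	static unsigned long kvm_s390_max_gfn(struct kvm *kvm)
	{
		struct kvm_memory_slot *memslot;
		unsigned long npages = 0;

		kvm_for_each_memslot(memslot, kvm_memslots(kvm))
			npages = max(npages, memslot->base_gfn + memslot->npages);
		return npages;
	}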
Christian Borntraeger Feb. 26, 2020, 9:12 a.m. UTC | #4
On 26.02.20 09:28, David Hildenbrand wrote:
> On 26.02.20 09:12, Christian Borntraeger wrote:
>>
>>
>> On 25.02.20 23:37, David Hildenbrand wrote:
>>>
>>>> +static int kvm_s390_pv_alloc_vm(struct kvm *kvm)
>>>> +{
>>>> +	unsigned long base = uv_info.guest_base_stor_len;
>>>> +	unsigned long virt = uv_info.guest_virt_var_stor_len;
>>>> +	unsigned long npages = 0, vlen = 0;
>>>> +	struct kvm_memory_slot *memslot;
>>>> +
>>>> +	kvm->arch.pv.stor_var = NULL;
>>>> +	kvm->arch.pv.stor_base = __get_free_pages(GFP_KERNEL, get_order(base));
>>>> +	if (!kvm->arch.pv.stor_base)
>>>> +		return -ENOMEM;
>>>> +
>>>> +	/*
>>>> +	 * Calculate current guest storage for allocation of the
>>>> +	 * variable storage, which is based on the length in MB.
>>>> +	 *
>>>> +	 * Slots are sorted by GFN
>>>> +	 */
>>>> +	mutex_lock(&kvm->slots_lock);
>>>> +	memslot = kvm_memslots(kvm)->memslots;
>>>> +	npages = memslot->base_gfn + memslot->npages;
>>>> +	mutex_unlock(&kvm->slots_lock);
>>>
>>> As discussed, I think we should just use mem_limit and check against
>>> some hardcoded upper limit. But yeah, we can do that as an addon (in
>>> which case memory hotplug will require special tweaks to detect this
>>> from user space ... e.g., a new capability)
>>>
>>>
>>> [...]
>>>
>>>> +int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
>>>> +{
>>>> +		struct uv_cb_cgc uvcb = {
>>>> +		.header.cmd = UVC_CMD_CREATE_SEC_CONF,
>>>> +		.header.len = sizeof(uvcb)
>>>> +	};
>>>> +	int cc, ret;
>>>> +	u16 dummy;
>>>> +
>>>> +	ret = kvm_s390_pv_alloc_vm(kvm);
>>>> +	if (ret)
>>>> +		return ret;
>>>> +
>>>> +	/* Inputs */
>>>> +	uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */
>>>> +	uvcb.guest_stor_len = kvm->arch.pv.guest_len;
>>>> +	uvcb.guest_asce = kvm->arch.gmap->asce;
>>>> +	uvcb.guest_sca = (unsigned long)kvm->arch.sca;
>>>> +	uvcb.conf_base_stor_origin = (u64)kvm->arch.pv.stor_base;
>>>> +	uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var;
>>>> +
>>>> +	cc = uv_call(0, (u64)&uvcb);
>>>> +	*rc = uvcb.header.rc;
>>>> +	*rrc = uvcb.header.rrc;
>>>> +	KVM_UV_EVENT(kvm, 3, "PROTVIRT CREATE VM: handle %llx len %llx rc %x rrc %x",
>>>> +		     uvcb.guest_handle, uvcb.guest_stor_len, *rc, *rrc);
>>>> +
>>>> +	/* Outputs */
>>>> +	kvm->arch.pv.handle = uvcb.guest_handle;
>>>> +
>>>> +	if (cc) {
>>>> +		if (uvcb.header.rc & UVC_RC_NEED_DESTROY)
>>>> +			kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
>>>> +		else
>>>> +			kvm_s390_pv_dealloc_vm(kvm);
>>>> +		return -EIO;
>>>
>>> A lot easier to read :)
>>>
>>>
>>> Feel free to add my rb with or without the mem_limit change.
>>
>> I think I will keep the memslot logic. For hotplug we actually need
>> to notify the ultravisor that the guest size has changed as only the
>> ultravisor will be able to inject an addressing exception.
> 
> So holes in between memory slots won't be properly considered? What will
> happen if a guest accesses such memory right now?

QEMU would get a fault (just like when QEMU reads from a non-existing mapping).
I think this is ok, as for non-secure guests this would also trigger a crash (in
the guest though), since we do not provide the proper memory increment handling
in QEMU after we have dropped the standby memory support.


>> Let's keep it simple for now.
>>
> 
> I double checked (virt/kvm/kvm_main.c:update_memslots()), and the slots
> are definitely sorted "based on their GFN". I think, it's lowest GFN
> first, so the code in here would be wrong with more than one slot.
> 
> Can you double check, because I might misinterpret the code.

kvm_s390_get_cmma also uses the first memslot to calculate the buffer size.
I verified with a hacked QEMU and printk that this is indeed sorted,
starting with the last memslot.
David Hildenbrand Feb. 26, 2020, 9:15 a.m. UTC | #5
On 26.02.20 10:12, Christian Borntraeger wrote:
> 
> 
> On 26.02.20 09:28, David Hildenbrand wrote:
>> On 26.02.20 09:12, Christian Borntraeger wrote:
>>>
>>>
>>> On 25.02.20 23:37, David Hildenbrand wrote:
>>>>
>>>>> +static int kvm_s390_pv_alloc_vm(struct kvm *kvm)
>>>>> +{
>>>>> +	unsigned long base = uv_info.guest_base_stor_len;
>>>>> +	unsigned long virt = uv_info.guest_virt_var_stor_len;
>>>>> +	unsigned long npages = 0, vlen = 0;
>>>>> +	struct kvm_memory_slot *memslot;
>>>>> +
>>>>> +	kvm->arch.pv.stor_var = NULL;
>>>>> +	kvm->arch.pv.stor_base = __get_free_pages(GFP_KERNEL, get_order(base));
>>>>> +	if (!kvm->arch.pv.stor_base)
>>>>> +		return -ENOMEM;
>>>>> +
>>>>> +	/*
>>>>> +	 * Calculate current guest storage for allocation of the
>>>>> +	 * variable storage, which is based on the length in MB.
>>>>> +	 *
>>>>> +	 * Slots are sorted by GFN
>>>>> +	 */
>>>>> +	mutex_lock(&kvm->slots_lock);
>>>>> +	memslot = kvm_memslots(kvm)->memslots;
>>>>> +	npages = memslot->base_gfn + memslot->npages;
>>>>> +	mutex_unlock(&kvm->slots_lock);
>>>>
>>>> As discussed, I think we should just use mem_limit and check against
>>>> some hardcoded upper limit. But yeah, we can do that as an addon (in
>>>> which case memory hotplug will require special tweaks to detect this
>>>> from user space ... e.g., a new capability)
>>>>
>>>>
>>>> [...]
>>>>
>>>>> +int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
>>>>> +{
>>>>> +		struct uv_cb_cgc uvcb = {
>>>>> +		.header.cmd = UVC_CMD_CREATE_SEC_CONF,
>>>>> +		.header.len = sizeof(uvcb)
>>>>> +	};
>>>>> +	int cc, ret;
>>>>> +	u16 dummy;
>>>>> +
>>>>> +	ret = kvm_s390_pv_alloc_vm(kvm);
>>>>> +	if (ret)
>>>>> +		return ret;
>>>>> +
>>>>> +	/* Inputs */
>>>>> +	uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */
>>>>> +	uvcb.guest_stor_len = kvm->arch.pv.guest_len;
>>>>> +	uvcb.guest_asce = kvm->arch.gmap->asce;
>>>>> +	uvcb.guest_sca = (unsigned long)kvm->arch.sca;
>>>>> +	uvcb.conf_base_stor_origin = (u64)kvm->arch.pv.stor_base;
>>>>> +	uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var;
>>>>> +
>>>>> +	cc = uv_call(0, (u64)&uvcb);
>>>>> +	*rc = uvcb.header.rc;
>>>>> +	*rrc = uvcb.header.rrc;
>>>>> +	KVM_UV_EVENT(kvm, 3, "PROTVIRT CREATE VM: handle %llx len %llx rc %x rrc %x",
>>>>> +		     uvcb.guest_handle, uvcb.guest_stor_len, *rc, *rrc);
>>>>> +
>>>>> +	/* Outputs */
>>>>> +	kvm->arch.pv.handle = uvcb.guest_handle;
>>>>> +
>>>>> +	if (cc) {
>>>>> +		if (uvcb.header.rc & UVC_RC_NEED_DESTROY)
>>>>> +			kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
>>>>> +		else
>>>>> +			kvm_s390_pv_dealloc_vm(kvm);
>>>>> +		return -EIO;
>>>>
>>>> A lot easier to read :)
>>>>
>>>>
>>>> Feel free to add my rb with or without the mem_limit change.
>>>
>>> I think I will keep the memslot logic. For hotplug we actually need
>>> to notify the ultravisor that the guest size has changed as only the
>>> ultravisor will be able to inject an addressing exception.
>>
>> So holes in between memory slots won't be properly considered? What will
>> happen if a guest accesses such memory right now?
> 
> QEMU would get a fault (just like when QEMU reads from a non-existing mapping).
> I think this is ok, as for non-secure guests this would also trigger a crash (in
> the guest though), since we do not provide the proper memory increment handling
> in QEMU after we have dropped the standby memory support.

Yeah, should be okay for all current use cases.

>> I double checked (virt/kvm/kvm_main.c:update_memslots()), and the slots
>> are definitely sorted "based on their GFN". I think, it's lowest GFN
>> first, so the code in here would be wrong with more than one slot.
>>
>> Can you double check, because I might misinterpret the code.
> 
> kvm_s390_get_cmma also uses the first memslot to calculate the buffer size.
> I verified with a hacked QEMU and printk that this is indeed sorted,
> starting with the last memslot.

Then I'm really looking forward to the memslot rework currently on the
list, because this is not-so-nice code IMHO. Thanks for verifying!
Cornelia Huck Feb. 26, 2020, 10:01 a.m. UTC | #6
On Tue, 25 Feb 2020 16:48:22 -0500
Christian Borntraeger <borntraeger@de.ibm.com> wrote:

> From: Janosch Frank <frankja@linux.ibm.com>
> 
> This contains 3 main changes:
> 1. changes in SIE control block handling for secure guests
> 2. helper functions for create/destroy/unpack secure guests
> 3. KVM_S390_PV_COMMAND ioctl to allow userspace dealing with secure
> machines
> 
> Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
> [borntraeger@de.ibm.com: patch merging, splitting, fixing]
> Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
> ---
>  arch/s390/include/asm/kvm_host.h |  24 ++-
>  arch/s390/include/asm/uv.h       |  69 ++++++++
>  arch/s390/kvm/Makefile           |   2 +-
>  arch/s390/kvm/kvm-s390.c         | 209 +++++++++++++++++++++++-
>  arch/s390/kvm/kvm-s390.h         |  33 ++++
>  arch/s390/kvm/pv.c               | 269 +++++++++++++++++++++++++++++++
>  include/uapi/linux/kvm.h         |  31 ++++
>  7 files changed, 633 insertions(+), 4 deletions(-)
>  create mode 100644 arch/s390/kvm/pv.c

(...)

> @@ -2165,6 +2168,160 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm,
>  	return r;
>  }
>  
> +static int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rcp, u16 *rrcp)
> +{
> +	struct kvm_vcpu *vcpu;
> +	u16 rc, rrc;
> +	int ret = 0;
> +	int i;
> +
> +	/*
> +	 * We ignore failures and try to destroy as many CPUs as possible.

What is this 'destroying'? Is that really the right terminology? From a
quick glance, I would expect something more in the vein of cpu
unplugging, and I don't think that's what is happening here.

(I have obviously not yet read the whole thing, please give people some
more time to review this huge patch.)

> +	 * At the same time we must not free the assigned resources when
> +	 * this fails, as the ultravisor still has access to that memory.
> +	 * So kvm_s390_pv_destroy_cpu can leave a "wanted" memory leak
> +	 * behind.
> +	 * We want to return the first failure rc and rrc, though.
> +	 */
> +	kvm_for_each_vcpu(i, vcpu, kvm) {
> +		mutex_lock(&vcpu->mutex);
> +		if (kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc) && !ret) {
> +			*rcp = rc;
> +			*rrcp = rrc;
> +			ret = -EIO;
> +		}
> +		mutex_unlock(&vcpu->mutex);
> +	}
> +	return ret;
> +}
> +
> +static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc)
> +{
> +	int i, r = 0;
> +	u16 dummy;
> +
> +	struct kvm_vcpu *vcpu;
> +
> +	kvm_for_each_vcpu(i, vcpu, kvm) {
> +		mutex_lock(&vcpu->mutex);
> +		r = kvm_s390_pv_create_cpu(vcpu, rc, rrc);
> +		mutex_unlock(&vcpu->mutex);
> +		if (r)
> +			break;
> +	}
> +	if (r)
> +		kvm_s390_cpus_from_pv(kvm, &dummy, &dummy);

This is a rather unlikely case, so we don't need to optimize this,
right?

Would rc/rrc from the rollback contain anything of interest if the
create fails (that is, anything more interesting than what that
function returns)?

Similar comment for the 'create' as for the 'destroy' above. (Not
trying to nitpick, just a bit confused.)

Or is that not the cpu that is created/destroyed, but something else?
Sorry, just trying to understand where this is coming from.

> +	return r;
> +}

(...)

Will look at the remainder of the patch later, maybe I understand the
stuff above better after that.
Cornelia Huck Feb. 26, 2020, 10:38 a.m. UTC | #7
On Tue, 25 Feb 2020 16:48:22 -0500
Christian Borntraeger <borntraeger@de.ibm.com> wrote:

> From: Janosch Frank <frankja@linux.ibm.com>
> 
> This contains 3 main changes:
> 1. changes in SIE control block handling for secure guests
> 2. helper functions for create/destroy/unpack secure guests
> 3. KVM_S390_PV_COMMAND ioctl to allow userspace dealing with secure
> machines
> 
> Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
> [borntraeger@de.ibm.com: patch merging, splitting, fixing]
> Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
> ---
>  arch/s390/include/asm/kvm_host.h |  24 ++-
>  arch/s390/include/asm/uv.h       |  69 ++++++++
>  arch/s390/kvm/Makefile           |   2 +-
>  arch/s390/kvm/kvm-s390.c         | 209 +++++++++++++++++++++++-
>  arch/s390/kvm/kvm-s390.h         |  33 ++++
>  arch/s390/kvm/pv.c               | 269 +++++++++++++++++++++++++++++++
>  include/uapi/linux/kvm.h         |  31 ++++
>  7 files changed, 633 insertions(+), 4 deletions(-)
>  create mode 100644 arch/s390/kvm/pv.c

> +int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
> +{
> +		struct uv_cb_cgc uvcb = {

Broken indentation.

> +		.header.cmd = UVC_CMD_CREATE_SEC_CONF,
> +		.header.len = sizeof(uvcb)
> +	};
> +	int cc, ret;
> +	u16 dummy;
> +
> +	ret = kvm_s390_pv_alloc_vm(kvm);
> +	if (ret)
> +		return ret;
> +
> +	/* Inputs */
> +	uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */

What is 'MSO'? (i.e., where is that 'M' coming from?)

> +	uvcb.guest_stor_len = kvm->arch.pv.guest_len;
> +	uvcb.guest_asce = kvm->arch.gmap->asce;
> +	uvcb.guest_sca = (unsigned long)kvm->arch.sca;
> +	uvcb.conf_base_stor_origin = (u64)kvm->arch.pv.stor_base;
> +	uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var;
> +
> +	cc = uv_call(0, (u64)&uvcb);
> +	*rc = uvcb.header.rc;
> +	*rrc = uvcb.header.rrc;
> +	KVM_UV_EVENT(kvm, 3, "PROTVIRT CREATE VM: handle %llx len %llx rc %x rrc %x",
> +		     uvcb.guest_handle, uvcb.guest_stor_len, *rc, *rrc);
> +
> +	/* Outputs */
> +	kvm->arch.pv.handle = uvcb.guest_handle;

Is this valid if the call failed?

> +
> +	if (cc) {
> +		if (uvcb.header.rc & UVC_RC_NEED_DESTROY)
> +			kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
> +		else
> +			kvm_s390_pv_dealloc_vm(kvm);
> +		return -EIO;
> +	}
> +	kvm->arch.gmap->guest_handle = uvcb.guest_handle;

...especially as you assign that handle only down here.

> +	atomic_set(&kvm->mm->context.is_protected, 1);
> +	return 0;
> +}
> +
> +int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc,
> +			      u16 *rrc)
> +{
> +	struct uv_cb_ssc uvcb = {
> +		.header.cmd = UVC_CMD_SET_SEC_CONF_PARAMS,
> +		.header.len = sizeof(uvcb),
> +		.sec_header_origin = (u64)hdr,
> +		.sec_header_len = length,
> +		.guest_handle = kvm_s390_pv_get_handle(kvm),
> +	};
> +	int cc = uv_call(0, (u64)&uvcb);
> +
> +	*rc = uvcb.header.rc;
> +	*rrc = uvcb.header.rrc;
> +	KVM_UV_EVENT(kvm, 3, "PROTVIRT VM SET PARMS: rc %x rrc %x",
> +		     *rc, *rrc);
> +	if (cc)
> +		return -EINVAL;
> +	return 0;

Maybe
	return cc ? -EINVAL : 0;

(I assume none of the possible rcs in this case could indicate
something that does not map to -EINVAL?)

> +}
Christian Borntraeger Feb. 26, 2020, 10:52 a.m. UTC | #8
On 26.02.20 11:01, Cornelia Huck wrote:
> On Tue, 25 Feb 2020 16:48:22 -0500
> Christian Borntraeger <borntraeger@de.ibm.com> wrote:
> 
>> From: Janosch Frank <frankja@linux.ibm.com>
>>
>> This contains 3 main changes:
>> 1. changes in SIE control block handling for secure guests
>> 2. helper functions for create/destroy/unpack secure guests
>> 3. KVM_S390_PV_COMMAND ioctl to allow userspace dealing with secure
>> machines
>>
>> Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
>> [borntraeger@de.ibm.com: patch merging, splitting, fixing]
>> Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
>> ---
>>  arch/s390/include/asm/kvm_host.h |  24 ++-
>>  arch/s390/include/asm/uv.h       |  69 ++++++++
>>  arch/s390/kvm/Makefile           |   2 +-
>>  arch/s390/kvm/kvm-s390.c         | 209 +++++++++++++++++++++++-
>>  arch/s390/kvm/kvm-s390.h         |  33 ++++
>>  arch/s390/kvm/pv.c               | 269 +++++++++++++++++++++++++++++++
>>  include/uapi/linux/kvm.h         |  31 ++++
>>  7 files changed, 633 insertions(+), 4 deletions(-)
>>  create mode 100644 arch/s390/kvm/pv.c
> 
> (...)
> 
>> @@ -2165,6 +2168,160 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm,
>>  	return r;
>>  }
>>  
>> +static int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rcp, u16 *rrcp)
>> +{
>> +	struct kvm_vcpu *vcpu;
>> +	u16 rc, rrc;
>> +	int ret = 0;
>> +	int i;
>> +
>> +	/*
>> +	 * We ignore failures and try to destroy as many CPUs as possible.
> 
> What is this 'destroying'? Is that really the right terminology? From a
> quick glance, I would expect something more in the vein of cpu
> unplugging, and I don't think that's what is happening here.

It is destroying the secure CPU at the ultravisor (it's even the name of
the ultravisor call), so I think destroy fits nicely.

> 
> (I have obviously not yet read the whole thing, please give people some
> more time to review this huge patch.)
> 
>> +	 * At the same time we must not free the assigned resources when
>> +	 * this fails, as the ultravisor still has access to that memory.
>> +	 * So kvm_s390_pv_destroy_cpu can leave a "wanted" memory leak
>> +	 * behind.
>> +	 * We want to return the first failure rc and rrc, though.
>> +	 */
>> +	kvm_for_each_vcpu(i, vcpu, kvm) {
>> +		mutex_lock(&vcpu->mutex);
>> +		if (kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc) && !ret) {
>> +			*rcp = rc;
>> +			*rrcp = rrc;
>> +			ret = -EIO;
>> +		}
>> +		mutex_unlock(&vcpu->mutex);
>> +	}
>> +	return ret;
>> +}
>> +
>> +static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc)
>> +{
>> +	int i, r = 0;
>> +	u16 dummy;
>> +
>> +	struct kvm_vcpu *vcpu;
>> +
>> +	kvm_for_each_vcpu(i, vcpu, kvm) {
>> +		mutex_lock(&vcpu->mutex);
>> +		r = kvm_s390_pv_create_cpu(vcpu, rc, rrc);
>> +		mutex_unlock(&vcpu->mutex);
>> +		if (r)
>> +			break;
>> +	}
>> +	if (r)
>> +		kvm_s390_cpus_from_pv(kvm, &dummy, &dummy);
> 
> This is a rather unlikely case, so we don't need to optimize this,
> right?

Right. All the error cases here are "should not happen"

> 
> Would rc/rrc from the rollback contain anything of interest if the
> create fails (that is, anything more interesting than what that
> function returns?

I think all rc/rrc from the rollback would depend on the first error,
so this is what we return to userspace.
The good thing is that we log all of those in the debug feature, so if anyone
wants to see the full thing it's there in /sys/kernel/debug/s390dbf/kvm-uv/sprintf.
> 
> Similar comment for the 'create' as for the 'destroy' above. (Not
> trying to nitpick, just a bit confused.)
> 
> Or is that not the cpu that is created/destroyed, but something else?
> Sorry, just trying to understand where this is coming from.

It's the CPU instance in the ultravisor.
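
kvm_s390_pv_create_cpu itself is not quoted in this thread, but going by
struct uv_cb_csc from the patch its core is presumably along these lines
(a sketch under that assumption; what exactly stor_origin points to is
guessed from the field name):

	/* sketch: create the UV CPU instance backing one vcpu */
	struct uv_cb_csc uvcb = {
		.header.cmd = UVC_CMD_CREATE_SEC_CPU,
		.header.len = sizeof(uvcb),
		.guest_handle = kvm_s390_pv_get_handle(vcpu->kvm),
		.num = vcpu->vcpu_id,
		.stor_origin = (u64)vcpu->arch.pv.stor_base,
	};
	int cc = uv_call(0, (u64)&uvcb);

	*rc = uvcb.header.rc;
	*rrc = uvcb.header.rrc;
	if (!cc)
		vcpu->arch.pv.handle = uvcb.cpu_handle;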
> 
>> +	return r;
>> +}
> 
> (...)
> 
> Will look at the remainder of the patch later, maybe I understand the
> stuff above better after that.
>
Christian Borntraeger Feb. 26, 2020, 11:03 a.m. UTC | #9
On 26.02.20 11:38, Cornelia Huck wrote:
> On Tue, 25 Feb 2020 16:48:22 -0500
> Christian Borntraeger <borntraeger@de.ibm.com> wrote:
> 
>> From: Janosch Frank <frankja@linux.ibm.com>
>>
>> This contains 3 main changes:
>> 1. changes in SIE control block handling for secure guests
>> 2. helper functions for create/destroy/unpack secure guests
>> 3. KVM_S390_PV_COMMAND ioctl to allow userspace dealing with secure
>> machines
>>
>> Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
>> [borntraeger@de.ibm.com: patch merging, splitting, fixing]
>> Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
>> ---
>>  arch/s390/include/asm/kvm_host.h |  24 ++-
>>  arch/s390/include/asm/uv.h       |  69 ++++++++
>>  arch/s390/kvm/Makefile           |   2 +-
>>  arch/s390/kvm/kvm-s390.c         | 209 +++++++++++++++++++++++-
>>  arch/s390/kvm/kvm-s390.h         |  33 ++++
>>  arch/s390/kvm/pv.c               | 269 +++++++++++++++++++++++++++++++
>>  include/uapi/linux/kvm.h         |  31 ++++
>>  7 files changed, 633 insertions(+), 4 deletions(-)
>>  create mode 100644 arch/s390/kvm/pv.c
> 
>> +int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
>> +{
>> +		struct uv_cb_cgc uvcb = {
> 
> Broken indentation.

Fixed.

> 
>> +		.header.cmd = UVC_CMD_CREATE_SEC_CONF,
>> +		.header.len = sizeof(uvcb)
>> +	};
>> +	int cc, ret;
>> +	u16 dummy;
>> +
>> +	ret = kvm_s390_pv_alloc_vm(kvm);
>> +	if (ret)
>> +		return ret;
>> +
>> +	/* Inputs */
>> +	uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */
> 
> What is 'MSO'? (i.e., where is that 'M' coming from?)

Main Storage Origin (the one for the SIE control block).

> 
>> +	uvcb.guest_stor_len = kvm->arch.pv.guest_len;
>> +	uvcb.guest_asce = kvm->arch.gmap->asce;
>> +	uvcb.guest_sca = (unsigned long)kvm->arch.sca;
>> +	uvcb.conf_base_stor_origin = (u64)kvm->arch.pv.stor_base;
>> +	uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var;
>> +
>> +	cc = uv_call(0, (u64)&uvcb);
>> +	*rc = uvcb.header.rc;
>> +	*rrc = uvcb.header.rrc;
>> +	KVM_UV_EVENT(kvm, 3, "PROTVIRT CREATE VM: handle %llx len %llx rc %x rrc %x",
>> +		     uvcb.guest_handle, uvcb.guest_stor_len, *rc, *rrc);
>> +
>> +	/* Outputs */
>> +	kvm->arch.pv.handle = uvcb.guest_handle;
> 
> Is this valid if the call failed?

If not, we clear it below
> 
>> +
>> +	if (cc) {
>> +		if (uvcb.header.rc & UVC_RC_NEED_DESTROY)
>> +			kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);

		here

>> +		else
>> +			kvm_s390_pv_dealloc_vm(kvm);

		or here

>> +		return -EIO;
>> +	}
>> +	kvm->arch.gmap->guest_handle = uvcb.guest_handle;
> 
> ...especially as you assign that handle only down here.


> 
>> +	atomic_set(&kvm->mm->context.is_protected, 1);
>> +	return 0;
>> +}
>> +
>> +int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc,
>> +			      u16 *rrc)
>> +{
>> +	struct uv_cb_ssc uvcb = {
>> +		.header.cmd = UVC_CMD_SET_SEC_CONF_PARAMS,
>> +		.header.len = sizeof(uvcb),
>> +		.sec_header_origin = (u64)hdr,
>> +		.sec_header_len = length,
>> +		.guest_handle = kvm_s390_pv_get_handle(kvm),
>> +	};
>> +	int cc = uv_call(0, (u64)&uvcb);
>> +
>> +	*rc = uvcb.header.rc;
>> +	*rrc = uvcb.header.rrc;
>> +	KVM_UV_EVENT(kvm, 3, "PROTVIRT VM SET PARMS: rc %x rrc %x",
>> +		     *rc, *rrc);
>> +	if (cc)
>> +		return -EINVAL;
>> +	return 0;
> 
> Maybe
> 	return cc ? -EINVAL : 0;

Yes.
Cornelia Huck Feb. 26, 2020, 12:26 p.m. UTC | #10
On Tue, 25 Feb 2020 16:48:22 -0500
Christian Borntraeger <borntraeger@de.ibm.com> wrote:

> From: Janosch Frank <frankja@linux.ibm.com>
> 
> This contains 3 main changes:
> 1. changes in SIE control block handling for secure guests
> 2. helper functions for create/destroy/unpack secure guests
> 3. KVM_S390_PV_COMMAND ioctl to allow userspace dealing with secure
> machines
> 
> Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
> [borntraeger@de.ibm.com: patch merging, splitting, fixing]
> Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
> ---
>  arch/s390/include/asm/kvm_host.h |  24 ++-
>  arch/s390/include/asm/uv.h       |  69 ++++++++
>  arch/s390/kvm/Makefile           |   2 +-
>  arch/s390/kvm/kvm-s390.c         | 209 +++++++++++++++++++++++-
>  arch/s390/kvm/kvm-s390.h         |  33 ++++
>  arch/s390/kvm/pv.c               | 269 +++++++++++++++++++++++++++++++
>  include/uapi/linux/kvm.h         |  31 ++++
>  7 files changed, 633 insertions(+), 4 deletions(-)
>  create mode 100644 arch/s390/kvm/pv.c

(...)

> +static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
> +{
> +	int r = 0;
> +	u16 dummy;
> +	void __user *argp = (void __user *)cmd->data;
> +
> +	switch (cmd->cmd) {
> +	case KVM_PV_ENABLE: {
> +		r = -EINVAL;
> +		if (kvm_s390_pv_is_protected(kvm))
> +			break;
> +
> +		/*
> +		 *  FMT 4 SIE needs esca. As we never switch back to bsca from
> +		 *  esca, we need no cleanup in the error cases below
> +		 */
> +		r = sca_switch_to_extended(kvm);
> +		if (r)
> +			break;
> +
> +		r = kvm_s390_pv_init_vm(kvm, &cmd->rc, &cmd->rrc);
> +		if (r)
> +			break;
> +
> +		r = kvm_s390_cpus_to_pv(kvm, &cmd->rc, &cmd->rrc);
> +		if (r)
> +			kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
> +		break;
> +	}
> +	case KVM_PV_DISABLE: {
> +		r = -EINVAL;
> +		if (!kvm_s390_pv_is_protected(kvm))
> +			break;
> +
> +		r = kvm_s390_cpus_from_pv(kvm, &cmd->rc, &cmd->rrc);
> +		/*
> +		 * If a CPU could not be destroyed, destroy VM will also fail.
> +		 * There is no point in trying to destroy it. Instead return
> +		 * the rc and rrc from the first CPU that failed destroying.
> +		 */
> +		if (r)
> +			break;
> +		r = kvm_s390_pv_deinit_vm(kvm, &cmd->rc, &cmd->rrc);
> +		break;
> +	}

IIUC, we may end up in an odd state in the failure case, because we
might not be able to free up the donated memory, depending on what goes
wrong. Can userspace do anything useful with the vm if that happens?

Even more important, userspace cannot cause repeated donations by
repeatedly calling this ioctl, right?

> +	case KVM_PV_SET_SEC_PARMS: {
> +		struct kvm_s390_pv_sec_parm parms = {};
> +		void *hdr;
> +
> +		r = -EINVAL;
> +		if (!kvm_s390_pv_is_protected(kvm))
> +			break;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&parms, argp, sizeof(parms)))
> +			break;
> +
> +		/* Currently restricted to 8KB */
> +		r = -EINVAL;
> +		if (parms.length > PAGE_SIZE * 2)
> +			break;
> +
> +		r = -ENOMEM;
> +		hdr = vmalloc(parms.length);
> +		if (!hdr)
> +			break;
> +
> +		r = -EFAULT;
> +		if (!copy_from_user(hdr, (void __user *)parms.origin,
> +				    parms.length))
> +			r = kvm_s390_pv_set_sec_parms(kvm, hdr, parms.length,
> +						      &cmd->rc, &cmd->rrc);
> +
> +		vfree(hdr);
> +		break;
> +	}
> +	case KVM_PV_UNPACK: {
> +		struct kvm_s390_pv_unp unp = {};
> +
> +		r = -EINVAL;
> +		if (!kvm_s390_pv_is_protected(kvm))
> +			break;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&unp, argp, sizeof(unp)))
> +			break;
> +
> +		r = kvm_s390_pv_unpack(kvm, unp.addr, unp.size, unp.tweak,
> +				       &cmd->rc, &cmd->rrc);
> +		break;
> +	}
> +	case KVM_PV_VERIFY: {
> +		r = -EINVAL;
> +		if (!kvm_s390_pv_is_protected(kvm))
> +			break;
> +
> +		r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
> +				  UVC_CMD_VERIFY_IMG, &cmd->rc, &cmd->rrc);
> +		KVM_UV_EVENT(kvm, 3, "PROTVIRT VERIFY: rc %x rrc %x", cmd->rc,
> +			     cmd->rrc);
> +		break;
> +	}
> +	default:
> +		return -ENOTTY;

Nit: why not r = -ENOTTY, so you get a single exit point?

> +	}
> +	return r;
> +}
> +
>  long kvm_arch_vm_ioctl(struct file *filp,
>  		       unsigned int ioctl, unsigned long arg)
>  {
> @@ -2262,6 +2419,27 @@ long kvm_arch_vm_ioctl(struct file *filp,
>  		mutex_unlock(&kvm->slots_lock);
>  		break;
>  	}
> +	case KVM_S390_PV_COMMAND: {
> +		struct kvm_pv_cmd args;
> +
> +		r = 0;
> +		if (!is_prot_virt_host()) {
> +			r = -EINVAL;
> +			break;
> +		}
> +		if (copy_from_user(&args, argp, sizeof(args))) {
> +			r = -EFAULT;
> +			break;
> +		}

The API states that args.flags must be 0... better enforce that?

> +		mutex_lock(&kvm->lock);
> +		r = kvm_s390_handle_pv(kvm, &args);
> +		mutex_unlock(&kvm->lock);
> +		if (copy_to_user(argp, &args, sizeof(args))) {
> +			r = -EFAULT;
> +			break;
> +		}
> +		break;
> +	}
>  	default:
>  		r = -ENOTTY;
>  	}

(...)

> @@ -2558,10 +2741,19 @@ static void kvm_free_vcpus(struct kvm *kvm)
>  
>  void kvm_arch_destroy_vm(struct kvm *kvm)
>  {
> +	u16 rc, rrc;

Nit: missing empty line.

>  	kvm_free_vcpus(kvm);
>  	sca_dispose(kvm);
> -	debug_unregister(kvm->arch.dbf);
>  	kvm_s390_gisa_destroy(kvm);
> +	/*
> +	 * We are already at the end of life and kvm->lock is not taken.
> +	 * This is ok as the file descriptor is closed by now and nobody
> +	 * can mess with the pv state. To avoid lockdep_assert_held from
> +	 * complaining we do not use kvm_s390_pv_is_protected.
> +	 */
> +	if (kvm_s390_pv_get_handle(kvm))
> +		kvm_s390_pv_deinit_vm(kvm, &rc, &rrc);
> +	debug_unregister(kvm->arch.dbf);
>  	free_page((unsigned long)kvm->arch.sie_page2);
>  	if (!kvm_is_ucontrol(kvm))
>  		gmap_remove(kvm->arch.gmap);

(...)

> @@ -4540,6 +4744,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>  	if (mem->guest_phys_addr + mem->memory_size > kvm->arch.mem_limit)
>  		return -EINVAL;
>  
> +	/* When we are protected we should not change the memory slots */

s/protected/protected,/

> +	if (kvm_s390_pv_is_protected(kvm))
> +		return -EINVAL;
>  	return 0;
>  }
>  

(...)

> +static int kvm_s390_pv_alloc_vm(struct kvm *kvm)
> +{
> +	unsigned long base = uv_info.guest_base_stor_len;
> +	unsigned long virt = uv_info.guest_virt_var_stor_len;

base_len and virt_len? Makes the code below less confusing.

> +	unsigned long npages = 0, vlen = 0;
> +	struct kvm_memory_slot *memslot;
> +
> +	kvm->arch.pv.stor_var = NULL;
> +	kvm->arch.pv.stor_base = __get_free_pages(GFP_KERNEL, get_order(base));
> +	if (!kvm->arch.pv.stor_base)
> +		return -ENOMEM;
> +
> +	/*
> +	 * Calculate current guest storage for allocation of the
> +	 * variable storage, which is based on the length in MB.
> +	 *
> +	 * Slots are sorted by GFN
> +	 */
> +	mutex_lock(&kvm->slots_lock);
> +	memslot = kvm_memslots(kvm)->memslots;
> +	npages = memslot->base_gfn + memslot->npages;
> +	mutex_unlock(&kvm->slots_lock);
> +
> +	kvm->arch.pv.guest_len = npages * PAGE_SIZE;
> +
> +	/* Allocate variable storage */
> +	vlen = ALIGN(virt * ((npages * PAGE_SIZE) / HPAGE_SIZE), PAGE_SIZE);
> +	vlen += uv_info.guest_virt_base_stor_len;
> +	kvm->arch.pv.stor_var = vzalloc(vlen);
> +	if (!kvm->arch.pv.stor_var)
> +		goto out_err;
> +	return 0;
> +
> +out_err:
> +	kvm_s390_pv_dealloc_vm(kvm);
> +	return -ENOMEM;
> +}
> +
> +/* this should not fail, but if it does we must not free the donated memory */

s/does/does,/

> +int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
> +{
> +	int cc;
> +
> +	cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
> +			   UVC_CMD_DESTROY_SEC_CONF, rc, rrc);
> +	WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
> +	atomic_set(&kvm->mm->context.is_protected, 0);
> +	KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM: rc %x rrc %x", *rc, *rrc);
> +	WARN_ONCE(cc, "protvirt destroy vm failed rc %x rrc %x", *rc, *rrc);
> +	/* Intended memory leak on "impossible" error */
> +	if (!cc)
> +		kvm_s390_pv_dealloc_vm(kvm);
> +	return cc ? -EIO : 0;
> +}

(...)

> +struct kvm_pv_cmd {
> +	__u32 cmd;	/* Command to be executed */
> +	__u16 rc;	/* Ultravisor return code */
> +	__u16 rrc;	/* Ultravisor return reason code */
> +	__u64 data;	/* Data or address */
> +	__u32 flags;    /* flags for future extensions. Must be 0 for now */
> +	__u32 reserved[3];
> +};
> +
> +/* Available with KVM_CAP_S390_PROTECTED */
> +#define KVM_S390_PV_COMMAND		_IOWR(KVMIO, 0xc5, struct kvm_pv_cmd)
> +
>  /* Secure Encrypted Virtualization command */
>  enum sev_cmd_id {
>  	/* Guest initialization commands */
Christian Borntraeger Feb. 26, 2020, 1:31 p.m. UTC | #11
On 26.02.20 13:26, Cornelia Huck wrote:
> On Tue, 25 Feb 2020 16:48:22 -0500
> Christian Borntraeger <borntraeger@de.ibm.com> wrote:
> 
>> From: Janosch Frank <frankja@linux.ibm.com>
>>
>> This contains 3 main changes:
>> 1. changes in SIE control block handling for secure guests
>> 2. helper functions for create/destroy/unpack secure guests
>> 3. KVM_S390_PV_COMMAND ioctl to allow userspace dealing with secure
>> machines
>>
>> Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
>> [borntraeger@de.ibm.com: patch merging, splitting, fixing]
>> Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
>> ---
>>  arch/s390/include/asm/kvm_host.h |  24 ++-
>>  arch/s390/include/asm/uv.h       |  69 ++++++++
>>  arch/s390/kvm/Makefile           |   2 +-
>>  arch/s390/kvm/kvm-s390.c         | 209 +++++++++++++++++++++++-
>>  arch/s390/kvm/kvm-s390.h         |  33 ++++
>>  arch/s390/kvm/pv.c               | 269 +++++++++++++++++++++++++++++++
>>  include/uapi/linux/kvm.h         |  31 ++++
>>  7 files changed, 633 insertions(+), 4 deletions(-)
>>  create mode 100644 arch/s390/kvm/pv.c
> 
> (...)
> 
>> +static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
>> +{
>> +	int r = 0;
>> +	u16 dummy;
>> +	void __user *argp = (void __user *)cmd->data;
>> +
>> +	switch (cmd->cmd) {
>> +	case KVM_PV_ENABLE: {
>> +		r = -EINVAL;
>> +		if (kvm_s390_pv_is_protected(kvm))
>> +			break;
>> +
>> +		/*
>> +		 *  FMT 4 SIE needs esca. As we never switch back to bsca from
>> +		 *  esca, we need no cleanup in the error cases below
>> +		 */
>> +		r = sca_switch_to_extended(kvm);
>> +		if (r)
>> +			break;
>> +
>> +		r = kvm_s390_pv_init_vm(kvm, &cmd->rc, &cmd->rrc);
>> +		if (r)
>> +			break;
>> +
>> +		r = kvm_s390_cpus_to_pv(kvm, &cmd->rc, &cmd->rrc);
>> +		if (r)
>> +			kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
>> +		break;
>> +	}
>> +	case KVM_PV_DISABLE: {
>> +		r = -EINVAL;
>> +		if (!kvm_s390_pv_is_protected(kvm))
>> +			break;
>> +
>> +		r = kvm_s390_cpus_from_pv(kvm, &cmd->rc, &cmd->rrc);
>> +		/*
>> +		 * If a CPU could not be destroyed, destroy VM will also fail.
>> +		 * There is no point in trying to destroy it. Instead return
>> +		 * the rc and rrc from the first CPU that failed destroying.
>> +		 */
>> +		if (r)
>> +			break;
>> +		r = kvm_s390_pv_deinit_vm(kvm, &cmd->rc, &cmd->rrc);
>> +		break;
>> +	}
> 
> IIUC, we may end up in an odd state in the failure case, because we
> might not be able to free up the donated memory, depending on what goes
> wrong. Can userspace do anything useful with the vm if that happens?

The protected VM is still there. Depending on how many CPUs are still
available the secure guest can still run or not. But this is really
fine, no strange things will happen.
> 
> Even more important, userspace cannot cause repeated donations by
> repeatedly calling this ioctl, right?

No, we keep the handle exactly so that userspace is not able
to do disable/enable in a loop.

> 
>> +	case KVM_PV_SET_SEC_PARMS: {
>> +		struct kvm_s390_pv_sec_parm parms = {};
>> +		void *hdr;
>> +
>> +		r = -EINVAL;
>> +		if (!kvm_s390_pv_is_protected(kvm))
>> +			break;
>> +
>> +		r = -EFAULT;
>> +		if (copy_from_user(&parms, argp, sizeof(parms)))
>> +			break;
>> +
>> +		/* Currently restricted to 8KB */
>> +		r = -EINVAL;
>> +		if (parms.length > PAGE_SIZE * 2)
>> +			break;
>> +
>> +		r = -ENOMEM;
>> +		hdr = vmalloc(parms.length);
>> +		if (!hdr)
>> +			break;
>> +
>> +		r = -EFAULT;
>> +		if (!copy_from_user(hdr, (void __user *)parms.origin,
>> +				    parms.length))
>> +			r = kvm_s390_pv_set_sec_parms(kvm, hdr, parms.length,
>> +						      &cmd->rc, &cmd->rrc);
>> +
>> +		vfree(hdr);
>> +		break;
>> +	}
>> +	case KVM_PV_UNPACK: {
>> +		struct kvm_s390_pv_unp unp = {};
>> +
>> +		r = -EINVAL;
>> +		if (!kvm_s390_pv_is_protected(kvm))
>> +			break;
>> +
>> +		r = -EFAULT;
>> +		if (copy_from_user(&unp, argp, sizeof(unp)))
>> +			break;
>> +
>> +		r = kvm_s390_pv_unpack(kvm, unp.addr, unp.size, unp.tweak,
>> +				       &cmd->rc, &cmd->rrc);
>> +		break;
>> +	}
>> +	case KVM_PV_VERIFY: {
>> +		r = -EINVAL;
>> +		if (!kvm_s390_pv_is_protected(kvm))
>> +			break;
>> +
>> +		r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
>> +				  UVC_CMD_VERIFY_IMG, &cmd->rc, &cmd->rrc);
>> +		KVM_UV_EVENT(kvm, 3, "PROTVIRT VERIFY: rc %x rrc %x", cmd->rc,
>> +			     cmd->rrc);
>> +		break;
>> +	}
>> +	default:
>> +		return -ENOTTY;
> 
> Nit: why not r = -ENOTTY, so you get a single exit point?

can do.

> 
>> +	}
>> +	return r;
>> +}
>> +
>>  long kvm_arch_vm_ioctl(struct file *filp,
>>  		       unsigned int ioctl, unsigned long arg)
>>  {
>> @@ -2262,6 +2419,27 @@ long kvm_arch_vm_ioctl(struct file *filp,
>>  		mutex_unlock(&kvm->slots_lock);
>>  		break;
>>  	}
>> +	case KVM_S390_PV_COMMAND: {
>> +		struct kvm_pv_cmd args;
>> +
>> +		r = 0;
>> +		if (!is_prot_virt_host()) {
>> +			r = -EINVAL;
>> +			break;
>> +		}
>> +		if (copy_from_user(&args, argp, sizeof(args))) {
>> +			r = -EFAULT;
>> +			break;
>> +		}
> 
> The api states that args.flags must be 0... better enforce that?


yes
@@ -2431,6 +2431,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
                        r = -EFAULT;
                        break;
                }
+               if (args.flags) {
+                       r = -EINVAL;
+                       break;
+               }
                mutex_lock(&kvm->lock);
                r = kvm_s390_handle_pv(kvm, &args);
                mutex_unlock(&kvm->lock);
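
With that check in place, the minimal userspace call sequence would look
something like this (sketch only; needs <sys/ioctl.h>, <stdio.h> and
<linux/kvm.h>, error handling trimmed):

	/* transition a VM to protected mode; vm_fd is the KVM VM fd */
	struct kvm_pv_cmd cmd = {
		.cmd = KVM_PV_ENABLE,
		.flags = 0,	/* must be 0, now enforced */
	};

	if (ioctl(vm_fd, KVM_S390_PV_COMMAND, &cmd))
		fprintf(stderr, "pv enable failed: rc %x rrc %x\n",
			cmd.rc, cmd.rrc);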



> 
>> +		mutex_lock(&kvm->lock);
>> +		r = kvm_s390_handle_pv(kvm, &args);
>> +		mutex_unlock(&kvm->lock);
>> +		if (copy_to_user(argp, &args, sizeof(args))) {
>> +			r = -EFAULT;
>> +			break;
>> +		}
>> +		break;
>> +	}
>>  	default:
>>  		r = -ENOTTY;
>>  	}
> 
> (...)
> 
>> @@ -2558,10 +2741,19 @@ static void kvm_free_vcpus(struct kvm *kvm)
>>  
>>  void kvm_arch_destroy_vm(struct kvm *kvm)
>>  {
>> +	u16 rc, rrc;
> 
> Nit: missing empty line.

ack.

> 
>>  	kvm_free_vcpus(kvm);
>>  	sca_dispose(kvm);
>> -	debug_unregister(kvm->arch.dbf);
>>  	kvm_s390_gisa_destroy(kvm);
>> +	/*
>> +	 * We are already at the end of life and kvm->lock is not taken.
>> +	 * This is ok as the file descriptor is closed by now and nobody
>> +	 * can mess with the pv state. To avoid lockdep_assert_held from
>> +	 * complaining we do not use kvm_s390_pv_is_protected.
>> +	 */
>> +	if (kvm_s390_pv_get_handle(kvm))
>> +		kvm_s390_pv_deinit_vm(kvm, &rc, &rrc);
>> +	debug_unregister(kvm->arch.dbf);
>>  	free_page((unsigned long)kvm->arch.sie_page2);
>>  	if (!kvm_is_ucontrol(kvm))
>>  		gmap_remove(kvm->arch.gmap);
> 
> (...)
> 
>> @@ -4540,6 +4744,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>>  	if (mem->guest_phys_addr + mem->memory_size > kvm->arch.mem_limit)
>>  		return -EINVAL;
>>  
>> +	/* When we are protected we should not change the memory slots */
> 
> s/protected/protected,/

ack.

> 
>> +	if (kvm_s390_pv_is_protected(kvm))
>> +		return -EINVAL;
>>  	return 0;
>>  }
>>  
> 
> (...)
> 
>> +static int kvm_s390_pv_alloc_vm(struct kvm *kvm)
>> +{
>> +	unsigned long base = uv_info.guest_base_stor_len;
>> +	unsigned long virt = uv_info.guest_virt_var_stor_len;
> 
> base_len and virt_len? Makes the code below less confusing.

this makes some lines longer than 80 for no obvious benefit. I think I will keep
the names.
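
As a worked example for the vlen calculation quoted below, with purely
illustrative numbers: assume guest_virt_var_stor_len were 512 bytes of UV
bookkeeping per MB of guest storage and the guest had 4 GiB (npages =
1048576, HPAGE_SIZE = 1 MiB on s390):

	npages * PAGE_SIZE                = 1048576 * 4096 = 4 GiB
	(npages * PAGE_SIZE) / HPAGE_SIZE = 4096 one-MiB increments
	vlen = ALIGN(512 * 4096, PAGE_SIZE) + uv_info.guest_virt_base_stor_len
	     = 2 MiB + the fixed base part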


> 
>> +	unsigned long npages = 0, vlen = 0;
>> +	struct kvm_memory_slot *memslot;
>> +
>> +	kvm->arch.pv.stor_var = NULL;
>> +	kvm->arch.pv.stor_base = __get_free_pages(GFP_KERNEL, get_order(base));
>> +	if (!kvm->arch.pv.stor_base)
>> +		return -ENOMEM;
>> +
>> +	/*
>> +	 * Calculate current guest storage for allocation of the
>> +	 * variable storage, which is based on the length in MB.
>> +	 *
>> +	 * Slots are sorted by GFN
>> +	 */
>> +	mutex_lock(&kvm->slots_lock);
>> +	memslot = kvm_memslots(kvm)->memslots;
>> +	npages = memslot->base_gfn + memslot->npages;
>> +	mutex_unlock(&kvm->slots_lock);
>> +
>> +	kvm->arch.pv.guest_len = npages * PAGE_SIZE;
>> +
>> +	/* Allocate variable storage */
>> +	vlen = ALIGN(virt * ((npages * PAGE_SIZE) / HPAGE_SIZE), PAGE_SIZE);
>> +	vlen += uv_info.guest_virt_base_stor_len;
>> +	kvm->arch.pv.stor_var = vzalloc(vlen);
>> +	if (!kvm->arch.pv.stor_var)
>> +		goto out_err;
>> +	return 0;
>> +
>> +out_err:
>> +	kvm_s390_pv_dealloc_vm(kvm);
>> +	return -ENOMEM;
>> +}
>> +
>> +/* this should not fail, but if it does we must not free the donated memory */
> 
> s/does/does,/

ack
Cornelia Huck Feb. 26, 2020, 4:54 p.m. UTC | #12
On Wed, 26 Feb 2020 14:31:36 +0100
Christian Borntraeger <borntraeger@de.ibm.com> wrote:

> On 26.02.20 13:26, Cornelia Huck wrote:
> > On Tue, 25 Feb 2020 16:48:22 -0500
> > Christian Borntraeger <borntraeger@de.ibm.com> wrote:
> >   
> >> From: Janosch Frank <frankja@linux.ibm.com>
> >>
> >> This contains 3 main changes:
> >> 1. changes in SIE control block handling for secure guests
> >> 2. helper functions for create/destroy/unpack secure guests
> >> 3. KVM_S390_PV_COMMAND ioctl to allow userspace dealing with secure
> >> machines
> >>
> >> Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
> >> [borntraeger@de.ibm.com: patch merging, splitting, fixing]
> >> Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
> >> ---
> >>  arch/s390/include/asm/kvm_host.h |  24 ++-
> >>  arch/s390/include/asm/uv.h       |  69 ++++++++
> >>  arch/s390/kvm/Makefile           |   2 +-
> >>  arch/s390/kvm/kvm-s390.c         | 209 +++++++++++++++++++++++-
> >>  arch/s390/kvm/kvm-s390.h         |  33 ++++
> >>  arch/s390/kvm/pv.c               | 269 +++++++++++++++++++++++++++++++
> >>  include/uapi/linux/kvm.h         |  31 ++++
> >>  7 files changed, 633 insertions(+), 4 deletions(-)
> >>  create mode 100644 arch/s390/kvm/pv.c  

> >> @@ -2262,6 +2419,27 @@ long kvm_arch_vm_ioctl(struct file *filp,
> >>  		mutex_unlock(&kvm->slots_lock);
> >>  		break;
> >>  	}
> >> +	case KVM_S390_PV_COMMAND: {
> >> +		struct kvm_pv_cmd args;
> >> +
> >> +		r = 0;
> >> +		if (!is_prot_virt_host()) {
> >> +			r = -EINVAL;
> >> +			break;
> >> +		}
> >> +		if (copy_from_user(&args, argp, sizeof(args))) {
> >> +			r = -EFAULT;
> >> +			break;
> >> +		}  
> > 
> > The api states that args.flags must be 0... better enforce that?  
> 
> 
> yes
> @@ -2431,6 +2431,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
>                         r = -EFAULT;
>                         break;
>                 }
> +               if (args.flags) {
> +                       r = -EINVAL;
> +                       break;
> +               }
>                 mutex_lock(&kvm->lock);
>                 r = kvm_s390_handle_pv(kvm, &args);
>                 mutex_unlock(&kvm->lock);

Looks good.

Patch

diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index d058289385a5..1aa2382fe363 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -160,7 +160,13 @@  struct kvm_s390_sie_block {
 	__u8	reserved08[4];		/* 0x0008 */
 #define PROG_IN_SIE (1<<0)
 	__u32	prog0c;			/* 0x000c */
-	__u8	reserved10[16];		/* 0x0010 */
+	union {
+		__u8	reserved10[16];		/* 0x0010 */
+		struct {
+			__u64	pv_handle_cpu;
+			__u64	pv_handle_config;
+		};
+	};
 #define PROG_BLOCK_SIE	(1<<0)
 #define PROG_REQUEST	(1<<1)
 	atomic_t prog20;		/* 0x0020 */
@@ -233,7 +239,7 @@  struct kvm_s390_sie_block {
 #define ECB3_RI  0x01
 	__u8    ecb3;			/* 0x0063 */
 	__u32	scaol;			/* 0x0064 */
-	__u8	reserved68;		/* 0x0068 */
+	__u8	sdf;			/* 0x0068 */
 	__u8    epdx;			/* 0x0069 */
 	__u8    reserved6a[2];		/* 0x006a */
 	__u32	todpr;			/* 0x006c */
@@ -645,6 +651,11 @@  struct kvm_guestdbg_info_arch {
 	unsigned long last_bp;
 };
 
+struct kvm_s390_pv_vcpu {
+	u64 handle;
+	unsigned long stor_base;
+};
+
 struct kvm_vcpu_arch {
 	struct kvm_s390_sie_block *sie_block;
 	/* if vsie is active, currently executed shadow sie control block */
@@ -673,6 +684,7 @@  struct kvm_vcpu_arch {
 	__u64 cputm_start;
 	bool gs_enabled;
 	bool skey_enabled;
+	struct kvm_s390_pv_vcpu pv;
 };
 
 struct kvm_vm_stat {
@@ -843,6 +855,13 @@  struct kvm_s390_gisa_interrupt {
 	DECLARE_BITMAP(kicked_mask, KVM_MAX_VCPUS);
 };
 
+struct kvm_s390_pv {
+	u64 handle;
+	u64 guest_len;
+	unsigned long stor_base;
+	void *stor_var;
+};
+
 struct kvm_arch{
 	void *sca;
 	int use_esca;
@@ -878,6 +897,7 @@  struct kvm_arch{
 	DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
 	DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS);
 	struct kvm_s390_gisa_interrupt gisa_int;
+	struct kvm_s390_pv pv;
 };
 
 #define KVM_HVA_ERR_BAD		(-1UL)
diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h
index a81af06507a9..09dc6dba94a4 100644
--- a/arch/s390/include/asm/uv.h
+++ b/arch/s390/include/asm/uv.h
@@ -23,11 +23,19 @@ 
 #define UVC_RC_INV_STATE	0x0003
 #define UVC_RC_INV_LEN		0x0005
 #define UVC_RC_NO_RESUME	0x0007
+#define UVC_RC_NEED_DESTROY	0x8000
 
 #define UVC_CMD_QUI			0x0001
 #define UVC_CMD_INIT_UV			0x000f
+#define UVC_CMD_CREATE_SEC_CONF		0x0100
+#define UVC_CMD_DESTROY_SEC_CONF	0x0101
+#define UVC_CMD_CREATE_SEC_CPU		0x0120
+#define UVC_CMD_DESTROY_SEC_CPU		0x0121
 #define UVC_CMD_CONV_TO_SEC_STOR	0x0200
 #define UVC_CMD_CONV_FROM_SEC_STOR	0x0201
+#define UVC_CMD_SET_SEC_CONF_PARAMS	0x0300
+#define UVC_CMD_UNPACK_IMG		0x0301
+#define UVC_CMD_VERIFY_IMG		0x0302
 #define UVC_CMD_PIN_PAGE_SHARED		0x0341
 #define UVC_CMD_UNPIN_PAGE_SHARED	0x0342
 #define UVC_CMD_SET_SHARED_ACCESS	0x1000
@@ -37,10 +45,17 @@ 
 enum uv_cmds_inst {
 	BIT_UVC_CMD_QUI = 0,
 	BIT_UVC_CMD_INIT_UV = 1,
+	BIT_UVC_CMD_CREATE_SEC_CONF = 2,
+	BIT_UVC_CMD_DESTROY_SEC_CONF = 3,
+	BIT_UVC_CMD_CREATE_SEC_CPU = 4,
+	BIT_UVC_CMD_DESTROY_SEC_CPU = 5,
 	BIT_UVC_CMD_CONV_TO_SEC_STOR = 6,
 	BIT_UVC_CMD_CONV_FROM_SEC_STOR = 7,
 	BIT_UVC_CMD_SET_SHARED_ACCESS = 8,
 	BIT_UVC_CMD_REMOVE_SHARED_ACCESS = 9,
+	BIT_UVC_CMD_SET_SEC_PARMS = 11,
+	BIT_UVC_CMD_UNPACK_IMG = 13,
+	BIT_UVC_CMD_VERIFY_IMG = 14,
 	BIT_UVC_CMD_PIN_PAGE_SHARED = 21,
 	BIT_UVC_CMD_UNPIN_PAGE_SHARED = 22,
 };
@@ -52,6 +67,7 @@  struct uv_cb_header {
 	u16 rrc;	/* Return Reason Code */
 } __packed __aligned(8);
 
+/* Query Ultravisor Information */
 struct uv_cb_qui {
 	struct uv_cb_header header;
 	u64 reserved08;
@@ -71,6 +87,7 @@  struct uv_cb_qui {
 	u8  reserveda0[200 - 160];
 } __packed __aligned(8);
 
+/* Initialize Ultravisor */
 struct uv_cb_init {
 	struct uv_cb_header header;
 	u64 reserved08[2];
@@ -79,6 +96,35 @@  struct uv_cb_init {
 	u64 reserved28[4];
 } __packed __aligned(8);
 
+/* Create Guest Configuration */
+struct uv_cb_cgc {
+	struct uv_cb_header header;
+	u64 reserved08[2];
+	u64 guest_handle;
+	u64 conf_base_stor_origin;
+	u64 conf_virt_stor_origin;
+	u64 reserved30;
+	u64 guest_stor_origin;
+	u64 guest_stor_len;
+	u64 guest_sca;
+	u64 guest_asce;
+	u64 reserved58[5];
+} __packed __aligned(8);
+
+/* Create Secure CPU */
+struct uv_cb_csc {
+	struct uv_cb_header header;
+	u64 reserved08[2];
+	u64 cpu_handle;
+	u64 guest_handle;
+	u64 stor_origin;
+	u8  reserved30[6];
+	u16 num;
+	u64 state_origin;
+	u64 reserved40[4];
+} __packed __aligned(8);
+
+/* Convert to Secure */
 struct uv_cb_cts {
 	struct uv_cb_header header;
 	u64 reserved08[2];
@@ -86,12 +132,34 @@  struct uv_cb_cts {
 	u64 gaddr;
 } __packed __aligned(8);
 
+/* Convert from Secure / Pin Page Shared */
 struct uv_cb_cfs {
 	struct uv_cb_header header;
 	u64 reserved08[2];
 	u64 paddr;
 } __packed __aligned(8);
 
+/* Set Secure Config Parameter */
+struct uv_cb_ssc {
+	struct uv_cb_header header;
+	u64 reserved08[2];
+	u64 guest_handle;
+	u64 sec_header_origin;
+	u32 sec_header_len;
+	u32 reserved2c;
+	u64 reserved30[4];
+} __packed __aligned(8);
+
+/* Unpack */
+struct uv_cb_unp {
+	struct uv_cb_header header;
+	u64 reserved08[2];
+	u64 guest_handle;
+	u64 gaddr;
+	u64 tweak[2];
+	u64 reserved38[3];
+} __packed __aligned(8);
+
 /*
  * A common UV call struct for calls that take no payload
  * Examples:
@@ -105,6 +173,7 @@  struct uv_cb_nodata {
 	u64 reserved20[4];
 } __packed __aligned(8);
 
+/* Set Shared Access */
 struct uv_cb_share {
 	struct uv_cb_header header;
 	u64 reserved08[3];
diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
index 05ee90a5ea08..12decca22e7c 100644
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -9,6 +9,6 @@  common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o  $(KVM)/async_pf.o $(KVM)/irqch
 ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
 
 kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o
-kvm-objs += diag.o gaccess.o guestdbg.o vsie.o
+kvm-objs += diag.o gaccess.o guestdbg.o vsie.o pv.o
 
 obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 7e4a982bfea3..1e5e70d44748 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -44,6 +44,7 @@ 
 #include <asm/cpacf.h>
 #include <asm/timex.h>
 #include <asm/ap.h>
+#include <asm/uv.h>
 #include "kvm-s390.h"
 #include "gaccess.h"
 
@@ -234,8 +235,10 @@  int kvm_arch_check_processor_compat(void)
 	return 0;
 }
 
+/* forward declarations */
 static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
 			      unsigned long end);
+static int sca_switch_to_extended(struct kvm *kvm);
 
 static void kvm_clock_sync_scb(struct kvm_s390_sie_block *scb, u64 delta)
 {
@@ -2165,6 +2168,160 @@  static int kvm_s390_set_cmma_bits(struct kvm *kvm,
 	return r;
 }
 
+static int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rcp, u16 *rrcp)
+{
+	struct kvm_vcpu *vcpu;
+	u16 rc, rrc;
+	int ret = 0;
+	int i;
+
+	/*
+	 * We ignore failures and try to destroy as many CPUs as possible.
+	 * At the same time we must not free the assigned resources when
+	 * this fails, as the Ultravisor still has access to that memory.
+	 * So kvm_s390_pv_destroy_cpu can leave a "wanted" memory leak
+	 * behind. We want to return the first failure rc and rrc, though.
+	 */
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		mutex_lock(&vcpu->mutex);
+		if (kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc) && !ret) {
+			*rcp = rc;
+			*rrcp = rrc;
+			ret = -EIO;
+		}
+		mutex_unlock(&vcpu->mutex);
+	}
+	return ret;
+}
+
+static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc)
+{
+	struct kvm_vcpu *vcpu;
+	int i, r = 0;
+	u16 dummy;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		mutex_lock(&vcpu->mutex);
+		r = kvm_s390_pv_create_cpu(vcpu, rc, rrc);
+		mutex_unlock(&vcpu->mutex);
+		if (r)
+			break;
+	}
+	if (r)
+		kvm_s390_cpus_from_pv(kvm, &dummy, &dummy);
+	return r;
+}
+
+static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
+{
+	int r = 0;
+	u16 dummy;
+	void __user *argp = (void __user *)cmd->data;
+
+	switch (cmd->cmd) {
+	case KVM_PV_ENABLE: {
+		r = -EINVAL;
+		if (kvm_s390_pv_is_protected(kvm))
+			break;
+
+		/*
+		 * FMT 4 SIE needs esca. As we never switch back to bsca from
+		 * esca, we need no cleanup in the error cases below.
+		 */
+		r = sca_switch_to_extended(kvm);
+		if (r)
+			break;
+
+		r = kvm_s390_pv_init_vm(kvm, &cmd->rc, &cmd->rrc);
+		if (r)
+			break;
+
+		r = kvm_s390_cpus_to_pv(kvm, &cmd->rc, &cmd->rrc);
+		if (r)
+			kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
+		break;
+	}
+	case KVM_PV_DISABLE: {
+		r = -EINVAL;
+		if (!kvm_s390_pv_is_protected(kvm))
+			break;
+
+		r = kvm_s390_cpus_from_pv(kvm, &cmd->rc, &cmd->rrc);
+		/*
+		 * If a CPU could not be destroyed, destroy VM will also fail.
+		 * There is no point in trying to destroy it. Instead return
+		 * the rc and rrc from the first CPU that failed destroying.
+		 */
+		if (r)
+			break;
+		r = kvm_s390_pv_deinit_vm(kvm, &cmd->rc, &cmd->rrc);
+		break;
+	}
+	case KVM_PV_SET_SEC_PARMS: {
+		struct kvm_s390_pv_sec_parm parms = {};
+		void *hdr;
+
+		r = -EINVAL;
+		if (!kvm_s390_pv_is_protected(kvm))
+			break;
+
+		r = -EFAULT;
+		if (copy_from_user(&parms, argp, sizeof(parms)))
+			break;
+
+		/* Currently restricted to 8KB */
+		r = -EINVAL;
+		if (parms.length > PAGE_SIZE * 2)
+			break;
+
+		r = -ENOMEM;
+		hdr = vmalloc(parms.length);
+		if (!hdr)
+			break;
+
+		r = -EFAULT;
+		if (!copy_from_user(hdr, (void __user *)parms.origin,
+				    parms.length))
+			r = kvm_s390_pv_set_sec_parms(kvm, hdr, parms.length,
+						      &cmd->rc, &cmd->rrc);
+
+		vfree(hdr);
+		break;
+	}
+	case KVM_PV_UNPACK: {
+		struct kvm_s390_pv_unp unp = {};
+
+		r = -EINVAL;
+		if (!kvm_s390_pv_is_protected(kvm))
+			break;
+
+		r = -EFAULT;
+		if (copy_from_user(&unp, argp, sizeof(unp)))
+			break;
+
+		r = kvm_s390_pv_unpack(kvm, unp.addr, unp.size, unp.tweak,
+				       &cmd->rc, &cmd->rrc);
+		break;
+	}
+	case KVM_PV_VERIFY: {
+		r = -EINVAL;
+		if (!kvm_s390_pv_is_protected(kvm))
+			break;
+
+		r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
+				  UVC_CMD_VERIFY_IMG, &cmd->rc, &cmd->rrc);
+		KVM_UV_EVENT(kvm, 3, "PROTVIRT VERIFY: rc %x rrc %x", cmd->rc,
+			     cmd->rrc);
+		break;
+	}
+	default:
+		return -ENOTTY;
+	}
+	return r;
+}
+
 long kvm_arch_vm_ioctl(struct file *filp,
 		       unsigned int ioctl, unsigned long arg)
 {
@@ -2262,6 +2419,27 @@  long kvm_arch_vm_ioctl(struct file *filp,
 		mutex_unlock(&kvm->slots_lock);
 		break;
 	}
+	case KVM_S390_PV_COMMAND: {
+		struct kvm_pv_cmd args;
+
+		if (!is_prot_virt_host()) {
+			r = -EINVAL;
+			break;
+		}
+		if (copy_from_user(&args, argp, sizeof(args))) {
+			r = -EFAULT;
+			break;
+		}
+		mutex_lock(&kvm->lock);
+		r = kvm_s390_handle_pv(kvm, &args);
+		mutex_unlock(&kvm->lock);
+		if (copy_to_user(argp, &args, sizeof(args))) {
+			r = -EFAULT;
+			break;
+		}
+		break;
+	}
 	default:
 		r = -ENOTTY;
 	}
@@ -2525,6 +2703,8 @@  int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
+	u16 rc, rrc;
+
 	VCPU_EVENT(vcpu, 3, "%s", "free cpu");
 	trace_kvm_s390_destroy_vcpu(vcpu->vcpu_id);
 	kvm_s390_clear_local_irqs(vcpu);
@@ -2537,6 +2717,9 @@  void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 
 	if (vcpu->kvm->arch.use_cmma)
 		kvm_s390_vcpu_unsetup_cmma(vcpu);
+	/* We cannot hold the vcpu mutex here, as we are already dying */
+	if (kvm_s390_pv_cpu_get_handle(vcpu))
+		kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc);
 	free_page((unsigned long)(vcpu->arch.sie_block));
 }
 
@@ -2558,10 +2741,19 @@  static void kvm_free_vcpus(struct kvm *kvm)
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
+	u16 rc, rrc;
+
 	kvm_free_vcpus(kvm);
 	sca_dispose(kvm);
-	debug_unregister(kvm->arch.dbf);
 	kvm_s390_gisa_destroy(kvm);
+	/*
+	 * We are already at the end of life and kvm->lock is not taken.
+	 * This is ok as the file descriptor is closed by now and nobody
+	 * can mess with the pv state. To keep lockdep_assert_held() from
+	 * complaining, we do not use kvm_s390_pv_is_protected here.
+	 */
+	if (kvm_s390_pv_get_handle(kvm))
+		kvm_s390_pv_deinit_vm(kvm, &rc, &rrc);
+	debug_unregister(kvm->arch.dbf);
 	free_page((unsigned long)kvm->arch.sie_page2);
 	if (!kvm_is_ucontrol(kvm))
 		gmap_remove(kvm->arch.gmap);
@@ -2657,6 +2849,9 @@  static int sca_switch_to_extended(struct kvm *kvm)
 	unsigned int vcpu_idx;
 	u32 scaol, scaoh;
 
+	if (kvm->arch.use_esca)
+		return 0;
+
 	new_sca = alloc_pages_exact(sizeof(*new_sca), GFP_KERNEL|__GFP_ZERO);
 	if (!new_sca)
 		return -ENOMEM;
@@ -2908,6 +3103,7 @@  static void kvm_s390_vcpu_setup_model(struct kvm_vcpu *vcpu)
 static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu)
 {
 	int rc = 0;
+	u16 uvrc, uvrrc;
 
 	atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH |
 						    CPUSTAT_SM |
@@ -2975,6 +3171,14 @@  static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu)
 
 	kvm_s390_vcpu_crypto_setup(vcpu);
 
+	mutex_lock(&vcpu->kvm->lock);
+	if (kvm_s390_pv_is_protected(vcpu->kvm)) {
+		rc = kvm_s390_pv_create_cpu(vcpu, &uvrc, &uvrrc);
+		if (rc)
+			kvm_s390_vcpu_unsetup_cmma(vcpu);
+	}
+	mutex_unlock(&vcpu->kvm->lock);
+
 	return rc;
 }
 
@@ -4540,6 +4744,9 @@  int kvm_arch_prepare_memory_region(struct kvm *kvm,
 	if (mem->guest_phys_addr + mem->memory_size > kvm->arch.mem_limit)
 		return -EINVAL;
 
+	/* When we are protected, we should not change the memory slots */
+	if (kvm_s390_pv_get_handle(kvm))
+		return -EINVAL;
 	return 0;
 }
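One user-visible consequence of the kvm_arch_prepare_memory_region() check
above: memory slots are frozen once the guest is protected, so memory hotplug
has to happen before KVM_PV_ENABLE. A hedged user-space illustration (vm_fd
and backing are assumptions, flags omitted):

struct kvm_userspace_memory_region reg = {
	.slot            = 1,
	.guest_phys_addr = 1ULL << 32,	/* hypothetical hotplug area */
	.memory_size     = 1ULL << 30,
	.userspace_addr  = (unsigned long)backing,
};

/* Succeeds before KVM_PV_ENABLE; fails with -EINVAL afterwards. */
ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &reg);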
 
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index be55b4b99bd3..13e6986596ed 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -15,6 +15,7 @@ 
 #include <linux/hrtimer.h>
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
+#include <linux/lockdep.h>
 #include <asm/facility.h>
 #include <asm/processor.h>
 #include <asm/sclp.h>
@@ -207,6 +208,38 @@  static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm)
 	return kvm->arch.user_cpu_state_ctrl != 0;
 }
 
+/* implemented in pv.c */
+int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc);
+int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc);
+int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc);
+int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc);
+int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc,
+			      u16 *rrc);
+int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size,
+		       unsigned long tweak, u16 *rc, u16 *rrc);
+
+static inline u64 kvm_s390_pv_get_handle(struct kvm *kvm)
+{
+	return kvm->arch.pv.handle;
+}
+
+static inline u64 kvm_s390_pv_cpu_get_handle(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.pv.handle;
+}
+
+static inline bool kvm_s390_pv_is_protected(struct kvm *kvm)
+{
+	lockdep_assert_held(&kvm->lock);
+	return !!kvm_s390_pv_get_handle(kvm);
+}
+
+static inline bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu)
+{
+	lockdep_assert_held(&vcpu->mutex);
+	return !!kvm_s390_pv_cpu_get_handle(vcpu);
+}
+
 /* implemented in interrupt.c */
 int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
 void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu);
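Since kvm_s390_pv_is_protected() and kvm_s390_pv_cpu_is_protected() carry
lockdep assertions, every caller outside of teardown has to hold kvm->lock
respectively vcpu->mutex. A minimal sketch of the intended usage pattern
(hypothetical caller):

static bool example_query_protected(struct kvm *kvm)
{
	bool prot;

	mutex_lock(&kvm->lock);
	prot = kvm_s390_pv_is_protected(kvm);
	mutex_unlock(&kvm->lock);

	return prot;
}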
diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
new file mode 100644
index 000000000000..e7b525d06e4c
--- /dev/null
+++ b/arch/s390/kvm/pv.c
@@ -0,0 +1,269 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Hosting Protected Virtual Machines
+ *
+ * Copyright IBM Corp. 2019, 2020
+ *    Author(s): Janosch Frank <frankja@linux.ibm.com>
+ */
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/pagemap.h>
+#include <linux/sched/signal.h>
+#include <asm/pgalloc.h>
+#include <asm/gmap.h>
+#include <asm/uv.h>
+#include <asm/mman.h>
+#include "kvm-s390.h"
+
+int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
+{
+	int cc = 0;
+
+	if (kvm_s390_pv_cpu_get_handle(vcpu)) {
+		cc = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu),
+				   UVC_CMD_DESTROY_SEC_CPU, rc, rrc);
+
+		KVM_UV_EVENT(vcpu->kvm, 3,
+			     "PROTVIRT DESTROY VCPU %d: rc %x rrc %x",
+			     vcpu->vcpu_id, *rc, *rrc);
+		WARN_ONCE(cc, "protvirt destroy cpu failed rc %x rrc %x",
+			  *rc, *rrc);
+	}
+	/* Intended memory leak for something that should never happen. */
+	if (!cc)
+		free_pages(vcpu->arch.pv.stor_base,
+			   get_order(uv_info.guest_cpu_stor_len));
+	vcpu->arch.sie_block->pv_handle_cpu = 0;
+	vcpu->arch.sie_block->pv_handle_config = 0;
+	memset(&vcpu->arch.pv, 0, sizeof(vcpu->arch.pv));
+	vcpu->arch.sie_block->sdf = 0;
+	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+
+	return cc ? -EIO : 0;
+}
+
+int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
+{
+	struct uv_cb_csc uvcb = {
+		.header.cmd = UVC_CMD_CREATE_SEC_CPU,
+		.header.len = sizeof(uvcb),
+	};
+	int cc;
+
+	if (kvm_s390_pv_cpu_get_handle(vcpu))
+		return -EINVAL;
+
+	vcpu->arch.pv.stor_base = __get_free_pages(GFP_KERNEL,
+						   get_order(uv_info.guest_cpu_stor_len));
+	if (!vcpu->arch.pv.stor_base)
+		return -ENOMEM;
+
+	/* Input */
+	uvcb.guest_handle = kvm_s390_pv_get_handle(vcpu->kvm);
+	uvcb.num = vcpu->arch.sie_block->icpua;
+	uvcb.state_origin = (u64)vcpu->arch.sie_block;
+	uvcb.stor_origin = (u64)vcpu->arch.pv.stor_base;
+
+	cc = uv_call(0, (u64)&uvcb);
+	*rc = uvcb.header.rc;
+	*rrc = uvcb.header.rrc;
+	KVM_UV_EVENT(vcpu->kvm, 3,
+		     "PROTVIRT CREATE VCPU: cpu %d handle %llx rc %x rrc %x",
+		     vcpu->vcpu_id, uvcb.cpu_handle, uvcb.header.rc,
+		     uvcb.header.rrc);
+
+	if (cc) {
+		u16 dummy;
+
+		kvm_s390_pv_destroy_cpu(vcpu, &dummy, &dummy);
+		return -EIO;
+	}
+
+	/* Output */
+	vcpu->arch.pv.handle = uvcb.cpu_handle;
+	vcpu->arch.sie_block->pv_handle_cpu = uvcb.cpu_handle;
+	vcpu->arch.sie_block->pv_handle_config = kvm_s390_pv_get_handle(vcpu->kvm);
+	vcpu->arch.sie_block->sdf = 2;
+	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+	return 0;
+}
+
+/* only free resources when the destroy was successful */
+static void kvm_s390_pv_dealloc_vm(struct kvm *kvm)
+{
+	vfree(kvm->arch.pv.stor_var);
+	free_pages(kvm->arch.pv.stor_base,
+		   get_order(uv_info.guest_base_stor_len));
+	memset(&kvm->arch.pv, 0, sizeof(kvm->arch.pv));
+}
+
+static int kvm_s390_pv_alloc_vm(struct kvm *kvm)
+{
+	unsigned long base = uv_info.guest_base_stor_len;
+	unsigned long virt = uv_info.guest_virt_var_stor_len;
+	unsigned long npages = 0, vlen = 0;
+	struct kvm_memory_slot *memslot;
+
+	kvm->arch.pv.stor_var = NULL;
+	kvm->arch.pv.stor_base = __get_free_pages(GFP_KERNEL, get_order(base));
+	if (!kvm->arch.pv.stor_base)
+		return -ENOMEM;
+
+	/*
+	 * Calculate current guest storage for allocation of the
+	 * variable storage, which is based on the length in MB.
+	 *
+	 * Slots are sorted by GFN.
+	 */
+	mutex_lock(&kvm->slots_lock);
+	memslot = kvm_memslots(kvm)->memslots;
+	npages = memslot->base_gfn + memslot->npages;
+	mutex_unlock(&kvm->slots_lock);
+
+	kvm->arch.pv.guest_len = npages * PAGE_SIZE;
+
+	/* Allocate variable storage */
+	vlen = ALIGN(virt * ((npages * PAGE_SIZE) / HPAGE_SIZE), PAGE_SIZE);
+	vlen += uv_info.guest_virt_base_stor_len;
+	kvm->arch.pv.stor_var = vzalloc(vlen);
+	if (!kvm->arch.pv.stor_var)
+		goto out_err;
+	return 0;
+
+out_err:
+	kvm_s390_pv_dealloc_vm(kvm);
+	return -ENOMEM;
+}
+
+/* this should not fail, but if it does we must not free the donated memory */
+int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
+{
+	int cc;
+
+	cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
+			   UVC_CMD_DESTROY_SEC_CONF, rc, rrc);
+	WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
+	atomic_set(&kvm->mm->context.is_protected, 0);
+	KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM: rc %x rrc %x", *rc, *rrc);
+	WARN_ONCE(cc, "protvirt destroy vm failed rc %x rrc %x", *rc, *rrc);
+	/* Intended memory leak on "impossible" error */
+	if (!cc)
+		kvm_s390_pv_dealloc_vm(kvm);
+	return cc ? -EIO : 0;
+}
+
+int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
+{
+	struct uv_cb_cgc uvcb = {
+		.header.cmd = UVC_CMD_CREATE_SEC_CONF,
+		.header.len = sizeof(uvcb),
+	};
+	int cc, ret;
+	u16 dummy;
+
+	ret = kvm_s390_pv_alloc_vm(kvm);
+	if (ret)
+		return ret;
+
+	/* Inputs */
+	uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */
+	uvcb.guest_stor_len = kvm->arch.pv.guest_len;
+	uvcb.guest_asce = kvm->arch.gmap->asce;
+	uvcb.guest_sca = (unsigned long)kvm->arch.sca;
+	uvcb.conf_base_stor_origin = (u64)kvm->arch.pv.stor_base;
+	uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var;
+
+	cc = uv_call(0, (u64)&uvcb);
+	*rc = uvcb.header.rc;
+	*rrc = uvcb.header.rrc;
+	KVM_UV_EVENT(kvm, 3, "PROTVIRT CREATE VM: handle %llx len %llx rc %x rrc %x",
+		     uvcb.guest_handle, uvcb.guest_stor_len, *rc, *rrc);
+
+	/* Outputs */
+	kvm->arch.pv.handle = uvcb.guest_handle;
+
+	if (cc) {
+		if (uvcb.header.rc & UVC_RC_NEED_DESTROY)
+			kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
+		else
+			kvm_s390_pv_dealloc_vm(kvm);
+		return -EIO;
+	}
+	kvm->arch.gmap->guest_handle = uvcb.guest_handle;
+	atomic_set(&kvm->mm->context.is_protected, 1);
+	return 0;
+}
+
+int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc,
+			      u16 *rrc)
+{
+	struct uv_cb_ssc uvcb = {
+		.header.cmd = UVC_CMD_SET_SEC_CONF_PARAMS,
+		.header.len = sizeof(uvcb),
+		.sec_header_origin = (u64)hdr,
+		.sec_header_len = length,
+		.guest_handle = kvm_s390_pv_get_handle(kvm),
+	};
+	int cc = uv_call(0, (u64)&uvcb);
+
+	*rc = uvcb.header.rc;
+	*rrc = uvcb.header.rrc;
+	KVM_UV_EVENT(kvm, 3, "PROTVIRT VM SET PARMS: rc %x rrc %x",
+		     *rc, *rrc);
+	if (cc)
+		return -EINVAL;
+	return 0;
+}
+
+static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak,
+		      u64 offset, u16 *rc, u16 *rrc)
+{
+	struct uv_cb_unp uvcb = {
+		.header.cmd = UVC_CMD_UNPACK_IMG,
+		.header.len = sizeof(uvcb),
+		.guest_handle = kvm_s390_pv_get_handle(kvm),
+		.gaddr = addr,
+		.tweak[0] = tweak,
+		.tweak[1] = offset,
+	};
+	int ret = gmap_make_secure(kvm->arch.gmap, addr, &uvcb);
+
+	*rc = uvcb.header.rc;
+	*rrc = uvcb.header.rrc;
+
+	if (ret && ret != -EAGAIN)
+		KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: failed addr %llx with rc %x rrc %x",
+			     uvcb.gaddr, *rc, *rrc);
+	return ret;
+}
+
+int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size,
+		       unsigned long tweak, u16 *rc, u16 *rrc)
+{
+	u64 offset = 0;
+	int ret = 0;
+
+	if (addr & ~PAGE_MASK || !size || size & ~PAGE_MASK)
+		return -EINVAL;
+
+	KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: start addr %lx size %lx",
+		     addr, size);
+
+	while (offset < size) {
+		ret = unpack_one(kvm, addr, tweak, offset, rc, rrc);
+		if (ret == -EAGAIN) {
+			cond_resched();
+			if (fatal_signal_pending(current))
+				break;
+			continue;
+		}
+		if (ret)
+			break;
+		addr += PAGE_SIZE;
+		offset += PAGE_SIZE;
+	}
+	if (!ret)
+		KVM_UV_EVENT(kvm, 3, "%s", "PROTVIRT VM UNPACK: successful");
+	return ret;
+}
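For orientation, the variable storage sizing in kvm_s390_pv_alloc_vm() scales
with the guest size in HPAGE_SIZE (1 MB) steps. A worked sketch with purely
illustrative numbers, kernel context assumed; the real lengths come from the
UV query call and differ per machine:

/* Assumed query values, for illustration only. */
unsigned long virt = 512;		/* guest_virt_var_stor_len */
unsigned long vbase = 0x20000;		/* guest_virt_base_stor_len */
unsigned long npages = 0x100000;	/* one slot covering 4 GB */
unsigned long vlen;

/* 4 GB guest -> 4096 HPAGE chunks * 512 bytes = 2 MB, page aligned */
vlen = ALIGN(virt * ((npages * PAGE_SIZE) / HPAGE_SIZE), PAGE_SIZE);
vlen += vbase;				/* plus the fixed base part */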
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 4b95f9a31a2f..ad69817f7792 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1478,6 +1478,37 @@  struct kvm_enc_region {
 #define KVM_S390_NORMAL_RESET	_IO(KVMIO,   0xc3)
 #define KVM_S390_CLEAR_RESET	_IO(KVMIO,   0xc4)
 
+struct kvm_s390_pv_sec_parm {
+	__u64 origin;
+	__u64 length;
+};
+
+struct kvm_s390_pv_unp {
+	__u64 addr;
+	__u64 size;
+	__u64 tweak;
+};
+
+enum pv_cmd_id {
+	KVM_PV_ENABLE,
+	KVM_PV_DISABLE,
+	KVM_PV_SET_SEC_PARMS,
+	KVM_PV_UNPACK,
+	KVM_PV_VERIFY,
+};
+
+struct kvm_pv_cmd {
+	__u32 cmd;	/* Command to be executed */
+	__u16 rc;	/* Ultravisor return code */
+	__u16 rrc;	/* Ultravisor return reason code */
+	__u64 data;	/* Data or address */
+	__u32 flags;    /* flags for future extensions. Must be 0 for now */
+	__u32 reserved[3];
+};
+
+/* Available with KVM_CAP_S390_PROTECTED */
+#define KVM_S390_PV_COMMAND		_IOWR(KVMIO, 0xc5, struct kvm_pv_cmd)
+
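To make the new ioctl concrete, here is a hedged user-space sketch of the
expected boot flow; vm_fd, the secure execution image fields (sec_header,
comp_*) and all sizes are assumptions, and only the first error is reported:

#include <err.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Provided by the (hypothetical) image loader. */
extern void *sec_header;
extern __u64 sec_header_len, comp_gaddr, comp_len, comp_tweak;

static void pv_cmd(int vm_fd, struct kvm_pv_cmd *cmd)
{
	if (ioctl(vm_fd, KVM_S390_PV_COMMAND, cmd))
		errx(1, "PV cmd %u failed: rc 0x%x rrc 0x%x",
		     cmd->cmd, cmd->rc, cmd->rrc);
}

static void pv_boot(int vm_fd)
{
	struct kvm_s390_pv_sec_parm parm = {
		.origin = (__u64)sec_header,	/* from the SE image */
		.length = sec_header_len,
	};
	struct kvm_s390_pv_unp unp = {
		.addr  = comp_gaddr,		/* page aligned */
		.size  = comp_len,		/* page aligned */
		.tweak = comp_tweak,
	};
	struct kvm_pv_cmd cmd = { .cmd = KVM_PV_ENABLE };

	pv_cmd(vm_fd, &cmd);			/* create SE config + CPUs */
	cmd = (struct kvm_pv_cmd){ .cmd = KVM_PV_SET_SEC_PARMS,
				   .data = (__u64)&parm };
	pv_cmd(vm_fd, &cmd);
	cmd = (struct kvm_pv_cmd){ .cmd = KVM_PV_UNPACK,
				   .data = (__u64)&unp };
	pv_cmd(vm_fd, &cmd);			/* once per image component */
	cmd = (struct kvm_pv_cmd){ .cmd = KVM_PV_VERIFY };
	pv_cmd(vm_fd, &cmd);
}
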
 /* Secure Encrypted Virtualization command */
 enum sev_cmd_id {
 	/* Guest initialization commands */