diff mbox series

[v2,06/11] KVM: arm64: Add support for SYSTEM_SUSPEND PSCI call

Message ID 20210923191610.3814698-7-oupton@google.com (mailing list archive)
State New, archived
Headers show
Series KVM: arm64: Implement PSCI SYSTEM_SUSPEND support | expand

Commit Message

Oliver Upton Sept. 23, 2021, 7:16 p.m. UTC
ARM DEN0022D 5.19 "SYSTEM_SUSPEND" describes a PSCI call that may be
used to request a system be suspended. This is optional for PSCI v1.0
and to date KVM has elected to not implement the call. However, a
VMM/operator may wish to provide their guests with the ability to
suspend/resume, necessitating this PSCI call.

Implement support for SYSTEM_SUSPEND according to the prescribed
behavior in the specification. Add a new system event exit type,
KVM_SYSTEM_EVENT_SUSPEND, to notify userspace when a VM has requested a
system suspend. Make KVM_MP_STATE_HALTED a valid state on arm64.
Userspace can set this to request an in-kernel emulation of the suspend.

Signed-off-by: Oliver Upton <oupton@google.com>
---
 Documentation/virt/kvm/api.rst    |  6 ++++
 arch/arm64/include/asm/kvm_host.h |  3 ++
 arch/arm64/kvm/arm.c              |  8 +++++
 arch/arm64/kvm/psci.c             | 60 +++++++++++++++++++++++++++++++
 include/uapi/linux/kvm.h          |  2 ++
 5 files changed, 79 insertions(+)

Comments

Marc Zyngier Sept. 30, 2021, 12:29 p.m. UTC | #1
Hi Oliver,

On Thu, 23 Sep 2021 20:16:05 +0100,
Oliver Upton <oupton@google.com> wrote:
> 
> ARM DEN0022D 5.19 "SYSTEM_SUSPEND" describes a PSCI call that may be
> used to request a system be suspended. This is optional for PSCI v1.0
> and to date KVM has elected to not implement the call. However, a
> VMM/operator may wish to provide their guests with the ability to
> suspend/resume, necessitating this PSCI call.
> 
> Implement support for SYSTEM_SUSPEND according to the prescribed
> behavior in the specification. Add a new system event exit type,
> KVM_SYSTEM_EVENT_SUSPEND, to notify userspace when a VM has requested a
> system suspend. Make KVM_MP_STATE_HALTED a valid state on arm64.

KVM_MP_STATE_HALTED is a per-CPU state on x86 (it denotes HLT). Does
it make really sense to hijack this for something that is more of a
VM-wide state? I can see that it is tempting to do so as we're using
the WFI semantics (which are close to HLT's, in a twisted kind of
way), but I'm also painfully aware that gluing x86 expectations on
arm64 rarely leads to a palatable result.

> Userspace can set this to request an in-kernel emulation of the suspend.
> 
> Signed-off-by: Oliver Upton <oupton@google.com>
> ---
>  Documentation/virt/kvm/api.rst    |  6 ++++
>  arch/arm64/include/asm/kvm_host.h |  3 ++
>  arch/arm64/kvm/arm.c              |  8 +++++
>  arch/arm64/kvm/psci.c             | 60 +++++++++++++++++++++++++++++++
>  include/uapi/linux/kvm.h          |  2 ++
>  5 files changed, 79 insertions(+)

This patch needs splitting. PSCI interface in one, mpstate stuff in
another, userspace management last.

> 
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index a6729c8cf063..361a57061b8f 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -5656,6 +5656,7 @@ should put the acknowledged interrupt vector into the 'epr' field.
>    #define KVM_SYSTEM_EVENT_SHUTDOWN       1
>    #define KVM_SYSTEM_EVENT_RESET          2
>    #define KVM_SYSTEM_EVENT_CRASH          3
> +  #define KVM_SYSTEM_EVENT_SUSPEND        4
>  			__u32 type;
>  			__u64 flags;
>  		} system_event;
> @@ -5680,6 +5681,11 @@ Valid values for 'type' are:
>     has requested a crash condition maintenance. Userspace can choose
>     to ignore the request, or to gather VM memory core dump and/or
>     reset/shutdown of the VM.
> + - KVM_SYSTEM_EVENT_SUSPEND -- the guest has requested that the VM
> +   suspends. Userspace is not obliged to honor this, and may call KVM_RUN
> +   again. Doing so will cause the guest to resume at its requested entry
> +   point. For ARM64, userspace can request in-kernel suspend emulation
> +   by setting the vCPU's MP state to KVM_MP_STATE_HALTED.
>  
>  ::
>  
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 1beda1189a15..441eb6fa7adc 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -137,6 +137,9 @@ struct kvm_arch {
>  
>  	/* Memory Tagging Extension enabled for the guest */
>  	bool mte_enabled;
> +
> +	/* PSCI SYSTEM_SUSPEND call enabled for the guest */
> +	bool suspend_enabled;
>  };
>  
>  struct kvm_vcpu_fault_info {
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index f1a375648e25..d875d3bcf3c5 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -101,6 +101,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
>  		}
>  		mutex_unlock(&kvm->lock);
>  		break;
> +	case KVM_CAP_ARM_SYSTEM_SUSPEND:
> +		r = 0;
> +		kvm->arch.suspend_enabled = true;

I don't really fancy adding another control here. Given that we now
have the PSCI version being controlled by a firmware pseudo-register,
I'd rather have that there.

And if we do that, I wonder whether there would be any benefit in
bumping the PSCI version to 1.1.

> +		break;
>  	default:
>  		r = -EINVAL;
>  		break;
> @@ -215,6 +219,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>  	case KVM_CAP_SET_GUEST_DEBUG:
>  	case KVM_CAP_VCPU_ATTRIBUTES:
>  	case KVM_CAP_PTP_KVM:
> +	case KVM_CAP_ARM_SYSTEM_SUSPEND:
>  		r = 1;
>  		break;
>  	case KVM_CAP_SET_GUEST_DEBUG2:
> @@ -470,6 +475,9 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
>  	int ret = 0;
>  
>  	switch (mp_state->mp_state) {
> +	case KVM_MP_STATE_HALTED:
> +		kvm_make_request(KVM_REQ_SUSPEND, vcpu);
> +		fallthrough;
>  	case KVM_MP_STATE_RUNNABLE:
>  		vcpu->arch.power_off = false;
>  		break;

> diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c
> index d453666ddb83..cf869f1f8615 100644
> --- a/arch/arm64/kvm/psci.c
> +++ b/arch/arm64/kvm/psci.c
> @@ -203,6 +203,46 @@ static void kvm_psci_system_reset(struct kvm_vcpu *vcpu)
>  	kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET);
>  }
>  
> +static int kvm_psci_system_suspend(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long entry_addr, context_id;
> +	struct kvm *kvm = vcpu->kvm;
> +	unsigned long psci_ret = 0;
> +	struct kvm_vcpu *tmp;
> +	int ret = 0;
> +	int i;
> +
> +	/*
> +	 * The SYSTEM_SUSPEND PSCI call requires that all vCPUs (except the
> +	 * calling vCPU) be in an OFF state, as determined by the
> +	 * implementation.
> +	 *
> +	 * See ARM DEN0022D, 5.19 "SYSTEM_SUSPEND" for more details.
> +	 */
> +	mutex_lock(&kvm->lock);
> +	kvm_for_each_vcpu(i, tmp, kvm) {
> +		if (tmp != vcpu && !tmp->arch.power_off) {
> +			psci_ret = PSCI_RET_DENIED;
> +			ret = 1;
> +			goto out;
> +		}
> +	}
> +
> +	entry_addr = smccc_get_arg1(vcpu);
> +	context_id = smccc_get_arg2(vcpu);
> +
> +	kvm_psci_vcpu_request_reset(vcpu, entry_addr, context_id,
> +				    kvm_vcpu_is_be(vcpu));

So even if userspace doesn't want to honor the suspend request, the
CPU ends up resetting? That's not wrong, but maybe a bit surprising.

> +
> +	memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
> +	vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SUSPEND;
> +	vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;

Consider spinning out a helper common to this and kvm_prepare_system_event().

> +out:
> +	mutex_unlock(&kvm->lock);
> +	smccc_set_retval(vcpu, psci_ret, 0, 0, 0);
> +	return ret;
> +}
> +
>  static void kvm_psci_narrow_to_32bit(struct kvm_vcpu *vcpu)
>  {
>  	int i;
> @@ -223,6 +263,14 @@ static unsigned long kvm_psci_check_allowed_function(struct kvm_vcpu *vcpu, u32
>  	if ((fn & PSCI_0_2_64BIT) && vcpu_mode_is_32bit(vcpu))
>  		return PSCI_RET_NOT_SUPPORTED;
>  
> +	switch (fn) {
> +	case PSCI_1_0_FN_SYSTEM_SUSPEND:
> +	case PSCI_1_0_FN64_SYSTEM_SUSPEND:
> +		if (!vcpu->kvm->arch.suspend_enabled)
> +			return PSCI_RET_NOT_SUPPORTED;
> +		break;
> +	}
> +
>  	return 0;
>  }
>  
> @@ -316,6 +364,10 @@ static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu)
>  	unsigned long val;
>  	int ret = 1;
>  
> +	val = kvm_psci_check_allowed_function(vcpu, psci_fn);
> +	if (val)
> +		goto out;
> +
>  	switch(psci_fn) {
>  	case PSCI_0_2_FN_PSCI_VERSION:
>  		val = KVM_ARM_PSCI_1_0;
> @@ -339,6 +391,8 @@ static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu)
>  		case PSCI_0_2_FN_SYSTEM_OFF:
>  		case PSCI_0_2_FN_SYSTEM_RESET:
>  		case PSCI_1_0_FN_PSCI_FEATURES:
> +		case PSCI_1_0_FN_SYSTEM_SUSPEND:
> +		case PSCI_1_0_FN64_SYSTEM_SUSPEND:
>  		case ARM_SMCCC_VERSION_FUNC_ID:
>  			val = 0;
>  			break;
> @@ -347,10 +401,16 @@ static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu)
>  			break;
>  		}
>  		break;
> +	case PSCI_1_0_FN_SYSTEM_SUSPEND:
> +		kvm_psci_narrow_to_32bit(vcpu);
> +		fallthrough;
> +	case PSCI_1_0_FN64_SYSTEM_SUSPEND:
> +		return kvm_psci_system_suspend(vcpu);
>  	default:
>  		return kvm_psci_0_2_call(vcpu);
>  	}
>  
> +out:
>  	smccc_set_retval(vcpu, val, 0, 0, 0);
>  	return ret;
>  }
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index a067410ebea5..052b0e717b08 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -433,6 +433,7 @@ struct kvm_run {
>  #define KVM_SYSTEM_EVENT_SHUTDOWN       1
>  #define KVM_SYSTEM_EVENT_RESET          2
>  #define KVM_SYSTEM_EVENT_CRASH          3
> +#define KVM_SYSTEM_EVENT_SUSPEND        4
>  			__u32 type;
>  			__u64 flags;
>  		} system_event;
> @@ -1112,6 +1113,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_BINARY_STATS_FD 203
>  #define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204
>  #define KVM_CAP_ARM_MTE 205
> +#define KVM_CAP_ARM_SYSTEM_SUSPEND 206
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  

I think there is an additional feature we need, which is to give
control back to userspace every time a wake-up condition occurs (I did
touch on that in [1]). This would give the opportunity to the VMM to
do whatever it needs to perform.

A typical use case would be to implement wake-up from certain
interrupts only (mask non-wake-up IRQs on suspend request, return to
the guest for WFI, wake-up returns to the VMM to restore the previous
interrupt configuration, resume).

Userspace would be free to clear the suspend state and resume the
guest, or to reenter WFI.

Thanks,

	M.

[1] https://lkml.kernel.org/kvm/87k0jauurx.wl-maz@kernel.org/
Sean Christopherson Sept. 30, 2021, 5:19 p.m. UTC | #2
On Thu, Sep 30, 2021, Marc Zyngier wrote:
> Hi Oliver,
> 
> On Thu, 23 Sep 2021 20:16:05 +0100,
> Oliver Upton <oupton@google.com> wrote:
> > 
> > ARM DEN0022D 5.19 "SYSTEM_SUSPEND" describes a PSCI call that may be
> > used to request a system be suspended. This is optional for PSCI v1.0
> > and to date KVM has elected to not implement the call. However, a
> > VMM/operator may wish to provide their guests with the ability to
> > suspend/resume, necessitating this PSCI call.
> > 
> > Implement support for SYSTEM_SUSPEND according to the prescribed
> > behavior in the specification. Add a new system event exit type,
> > KVM_SYSTEM_EVENT_SUSPEND, to notify userspace when a VM has requested a
> > system suspend. Make KVM_MP_STATE_HALTED a valid state on arm64.
> 
> KVM_MP_STATE_HALTED is a per-CPU state on x86 (it denotes HLT). Does
> it make really sense to hijack this for something that is more of a
> VM-wide state? I can see that it is tempting to do so as we're using
> the WFI semantics (which are close to HLT's, in a twisted kind of
> way), but I'm also painfully aware that gluing x86 expectations on
> arm64 rarely leads to a palatable result.

Agreed, we literally have billions of possible KVM_MP_STATE_* values, and I'm pretty
sure all of the existing states are arch-specific.  Some are common to multiple
architectures, but I don't think _any_ are common to all architectures.
Oliver Upton Sept. 30, 2021, 5:35 p.m. UTC | #3
On Thu, Sep 30, 2021 at 10:19 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Thu, Sep 30, 2021, Marc Zyngier wrote:
> > Hi Oliver,
> >
> > On Thu, 23 Sep 2021 20:16:05 +0100,
> > Oliver Upton <oupton@google.com> wrote:
> > >
> > > ARM DEN0022D 5.19 "SYSTEM_SUSPEND" describes a PSCI call that may be
> > > used to request a system be suspended. This is optional for PSCI v1.0
> > > and to date KVM has elected to not implement the call. However, a
> > > VMM/operator may wish to provide their guests with the ability to
> > > suspend/resume, necessitating this PSCI call.
> > >
> > > Implement support for SYSTEM_SUSPEND according to the prescribed
> > > behavior in the specification. Add a new system event exit type,
> > > KVM_SYSTEM_EVENT_SUSPEND, to notify userspace when a VM has requested a
> > > system suspend. Make KVM_MP_STATE_HALTED a valid state on arm64.
> >
> > KVM_MP_STATE_HALTED is a per-CPU state on x86 (it denotes HLT). Does
> > it make really sense to hijack this for something that is more of a
> > VM-wide state? I can see that it is tempting to do so as we're using
> > the WFI semantics (which are close to HLT's, in a twisted kind of
> > way), but I'm also painfully aware that gluing x86 expectations on
> > arm64 rarely leads to a palatable result.
>
> Agreed, we literally have billions of possible KVM_MP_STATE_* values, and I'm pretty
> sure all of the existing states are arch-specific.  Some are common to multiple
> architectures, but I don't think _any_ are common to all architectures.

Yeah, I was debating this as well when cooking up the series. No need
to overload the x86-ism when we can have a precisely named state for
ARM.
Oliver Upton Sept. 30, 2021, 5:40 p.m. UTC | #4
On Thu, Sep 30, 2021 at 5:29 AM Marc Zyngier <maz@kernel.org> wrote:
>
> Hi Oliver,
>
> On Thu, 23 Sep 2021 20:16:05 +0100,
> Oliver Upton <oupton@google.com> wrote:
> >
> > ARM DEN0022D 5.19 "SYSTEM_SUSPEND" describes a PSCI call that may be
> > used to request a system be suspended. This is optional for PSCI v1.0
> > and to date KVM has elected to not implement the call. However, a
> > VMM/operator may wish to provide their guests with the ability to
> > suspend/resume, necessitating this PSCI call.
> >
> > Implement support for SYSTEM_SUSPEND according to the prescribed
> > behavior in the specification. Add a new system event exit type,
> > KVM_SYSTEM_EVENT_SUSPEND, to notify userspace when a VM has requested a
> > system suspend. Make KVM_MP_STATE_HALTED a valid state on arm64.
>
> KVM_MP_STATE_HALTED is a per-CPU state on x86 (it denotes HLT). Does
> it make really sense to hijack this for something that is more of a
> VM-wide state? I can see that it is tempting to do so as we're using
> the WFI semantics (which are close to HLT's, in a twisted kind of
> way), but I'm also painfully aware that gluing x86 expectations on
> arm64 rarely leads to a palatable result.
>
> > Userspace can set this to request an in-kernel emulation of the suspend.
> >
> > Signed-off-by: Oliver Upton <oupton@google.com>
> > ---
> >  Documentation/virt/kvm/api.rst    |  6 ++++
> >  arch/arm64/include/asm/kvm_host.h |  3 ++
> >  arch/arm64/kvm/arm.c              |  8 +++++
> >  arch/arm64/kvm/psci.c             | 60 +++++++++++++++++++++++++++++++
> >  include/uapi/linux/kvm.h          |  2 ++
> >  5 files changed, 79 insertions(+)
>
> This patch needs splitting. PSCI interface in one, mpstate stuff in
> another, userspace management last.
>
> >
> > diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> > index a6729c8cf063..361a57061b8f 100644
> > --- a/Documentation/virt/kvm/api.rst
> > +++ b/Documentation/virt/kvm/api.rst
> > @@ -5656,6 +5656,7 @@ should put the acknowledged interrupt vector into the 'epr' field.
> >    #define KVM_SYSTEM_EVENT_SHUTDOWN       1
> >    #define KVM_SYSTEM_EVENT_RESET          2
> >    #define KVM_SYSTEM_EVENT_CRASH          3
> > +  #define KVM_SYSTEM_EVENT_SUSPEND        4
> >                       __u32 type;
> >                       __u64 flags;
> >               } system_event;
> > @@ -5680,6 +5681,11 @@ Valid values for 'type' are:
> >     has requested a crash condition maintenance. Userspace can choose
> >     to ignore the request, or to gather VM memory core dump and/or
> >     reset/shutdown of the VM.
> > + - KVM_SYSTEM_EVENT_SUSPEND -- the guest has requested that the VM
> > +   suspends. Userspace is not obliged to honor this, and may call KVM_RUN
> > +   again. Doing so will cause the guest to resume at its requested entry
> > +   point. For ARM64, userspace can request in-kernel suspend emulation
> > +   by setting the vCPU's MP state to KVM_MP_STATE_HALTED.
> >
> >  ::
> >
> > diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> > index 1beda1189a15..441eb6fa7adc 100644
> > --- a/arch/arm64/include/asm/kvm_host.h
> > +++ b/arch/arm64/include/asm/kvm_host.h
> > @@ -137,6 +137,9 @@ struct kvm_arch {
> >
> >       /* Memory Tagging Extension enabled for the guest */
> >       bool mte_enabled;
> > +
> > +     /* PSCI SYSTEM_SUSPEND call enabled for the guest */
> > +     bool suspend_enabled;
> >  };
> >
> >  struct kvm_vcpu_fault_info {
> > diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> > index f1a375648e25..d875d3bcf3c5 100644
> > --- a/arch/arm64/kvm/arm.c
> > +++ b/arch/arm64/kvm/arm.c
> > @@ -101,6 +101,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
> >               }
> >               mutex_unlock(&kvm->lock);
> >               break;
> > +     case KVM_CAP_ARM_SYSTEM_SUSPEND:
> > +             r = 0;
> > +             kvm->arch.suspend_enabled = true;
>
> I don't really fancy adding another control here. Given that we now
> have the PSCI version being controlled by a firmware pseudo-register,
> I'd rather have that there.
>
> And if we do that, I wonder whether there would be any benefit in
> bumping the PSCI version to 1.1.
>
> > +             break;
> >       default:
> >               r = -EINVAL;
> >               break;
> > @@ -215,6 +219,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
> >       case KVM_CAP_SET_GUEST_DEBUG:
> >       case KVM_CAP_VCPU_ATTRIBUTES:
> >       case KVM_CAP_PTP_KVM:
> > +     case KVM_CAP_ARM_SYSTEM_SUSPEND:
> >               r = 1;
> >               break;
> >       case KVM_CAP_SET_GUEST_DEBUG2:
> > @@ -470,6 +475,9 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
> >       int ret = 0;
> >
> >       switch (mp_state->mp_state) {
> > +     case KVM_MP_STATE_HALTED:
> > +             kvm_make_request(KVM_REQ_SUSPEND, vcpu);
> > +             fallthrough;
> >       case KVM_MP_STATE_RUNNABLE:
> >               vcpu->arch.power_off = false;
> >               break;
>
> > diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c
> > index d453666ddb83..cf869f1f8615 100644
> > --- a/arch/arm64/kvm/psci.c
> > +++ b/arch/arm64/kvm/psci.c
> > @@ -203,6 +203,46 @@ static void kvm_psci_system_reset(struct kvm_vcpu *vcpu)
> >       kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET);
> >  }
> >
> > +static int kvm_psci_system_suspend(struct kvm_vcpu *vcpu)
> > +{
> > +     unsigned long entry_addr, context_id;
> > +     struct kvm *kvm = vcpu->kvm;
> > +     unsigned long psci_ret = 0;
> > +     struct kvm_vcpu *tmp;
> > +     int ret = 0;
> > +     int i;
> > +
> > +     /*
> > +      * The SYSTEM_SUSPEND PSCI call requires that all vCPUs (except the
> > +      * calling vCPU) be in an OFF state, as determined by the
> > +      * implementation.
> > +      *
> > +      * See ARM DEN0022D, 5.19 "SYSTEM_SUSPEND" for more details.
> > +      */
> > +     mutex_lock(&kvm->lock);
> > +     kvm_for_each_vcpu(i, tmp, kvm) {
> > +             if (tmp != vcpu && !tmp->arch.power_off) {
> > +                     psci_ret = PSCI_RET_DENIED;
> > +                     ret = 1;
> > +                     goto out;
> > +             }
> > +     }
> > +
> > +     entry_addr = smccc_get_arg1(vcpu);
> > +     context_id = smccc_get_arg2(vcpu);
> > +
> > +     kvm_psci_vcpu_request_reset(vcpu, entry_addr, context_id,
> > +                                 kvm_vcpu_is_be(vcpu));
>
> So even if userspace doesn't want to honor the suspend request, the
> CPU ends up resetting? That's not wrong, but maybe a bit surprising.
>
> > +
> > +     memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
> > +     vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SUSPEND;
> > +     vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
>
> Consider spinning out a helper common to this and kvm_prepare_system_event().
>
> > +out:
> > +     mutex_unlock(&kvm->lock);
> > +     smccc_set_retval(vcpu, psci_ret, 0, 0, 0);
> > +     return ret;
> > +}
> > +
> >  static void kvm_psci_narrow_to_32bit(struct kvm_vcpu *vcpu)
> >  {
> >       int i;
> > @@ -223,6 +263,14 @@ static unsigned long kvm_psci_check_allowed_function(struct kvm_vcpu *vcpu, u32
> >       if ((fn & PSCI_0_2_64BIT) && vcpu_mode_is_32bit(vcpu))
> >               return PSCI_RET_NOT_SUPPORTED;
> >
> > +     switch (fn) {
> > +     case PSCI_1_0_FN_SYSTEM_SUSPEND:
> > +     case PSCI_1_0_FN64_SYSTEM_SUSPEND:
> > +             if (!vcpu->kvm->arch.suspend_enabled)
> > +                     return PSCI_RET_NOT_SUPPORTED;
> > +             break;
> > +     }
> > +
> >       return 0;
> >  }
> >
> > @@ -316,6 +364,10 @@ static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu)
> >       unsigned long val;
> >       int ret = 1;
> >
> > +     val = kvm_psci_check_allowed_function(vcpu, psci_fn);
> > +     if (val)
> > +             goto out;
> > +
> >       switch(psci_fn) {
> >       case PSCI_0_2_FN_PSCI_VERSION:
> >               val = KVM_ARM_PSCI_1_0;
> > @@ -339,6 +391,8 @@ static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu)
> >               case PSCI_0_2_FN_SYSTEM_OFF:
> >               case PSCI_0_2_FN_SYSTEM_RESET:
> >               case PSCI_1_0_FN_PSCI_FEATURES:
> > +             case PSCI_1_0_FN_SYSTEM_SUSPEND:
> > +             case PSCI_1_0_FN64_SYSTEM_SUSPEND:
> >               case ARM_SMCCC_VERSION_FUNC_ID:
> >                       val = 0;
> >                       break;
> > @@ -347,10 +401,16 @@ static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu)
> >                       break;
> >               }
> >               break;
> > +     case PSCI_1_0_FN_SYSTEM_SUSPEND:
> > +             kvm_psci_narrow_to_32bit(vcpu);
> > +             fallthrough;
> > +     case PSCI_1_0_FN64_SYSTEM_SUSPEND:
> > +             return kvm_psci_system_suspend(vcpu);
> >       default:
> >               return kvm_psci_0_2_call(vcpu);
> >       }
> >
> > +out:
> >       smccc_set_retval(vcpu, val, 0, 0, 0);
> >       return ret;
> >  }
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index a067410ebea5..052b0e717b08 100644
> > --- a/include/uapi/linux/kvm.h
> > +++ b/include/uapi/linux/kvm.h
> > @@ -433,6 +433,7 @@ struct kvm_run {
> >  #define KVM_SYSTEM_EVENT_SHUTDOWN       1
> >  #define KVM_SYSTEM_EVENT_RESET          2
> >  #define KVM_SYSTEM_EVENT_CRASH          3
> > +#define KVM_SYSTEM_EVENT_SUSPEND        4
> >                       __u32 type;
> >                       __u64 flags;
> >               } system_event;
> > @@ -1112,6 +1113,7 @@ struct kvm_ppc_resize_hpt {
> >  #define KVM_CAP_BINARY_STATS_FD 203
> >  #define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204
> >  #define KVM_CAP_ARM_MTE 205
> > +#define KVM_CAP_ARM_SYSTEM_SUSPEND 206
> >
> >  #ifdef KVM_CAP_IRQ_ROUTING
> >
>
> I think there is an additional feature we need, which is to give
> control back to userspace every time a wake-up condition occurs (I did
> touch on that in [1]). This would give the opportunity to the VMM to
> do whatever it needs to perform.
>
> A typical use case would be to implement wake-up from certain
> interrupts only (mask non-wake-up IRQs on suspend request, return to
> the guest for WFI, wake-up returns to the VMM to restore the previous
> interrupt configuration, resume).
>
> Userspace would be free to clear the suspend state and resume the
> guest, or to reenter WFI.

Yeah, this is definitely needed for the series.

Just to make sure it's clear, what should the default behavior be if
userspace doesn't touch state at all and it calls KVM_RUN again? It
would seem to me that we should resume the guest by default, much like
we would do for the SUSPEND event exit.

--
Thanks,
Oliver
Marc Zyngier Oct. 1, 2021, 2:02 p.m. UTC | #5
On Thu, 30 Sep 2021 18:40:43 +0100,
Oliver Upton <oupton@google.com> wrote:
> 
> On Thu, Sep 30, 2021 at 5:29 AM Marc Zyngier <maz@kernel.org> wrote:
> >
> > I think there is an additional feature we need, which is to give
> > control back to userspace every time a wake-up condition occurs (I did
> > touch on that in [1]). This would give the opportunity to the VMM to
> > do whatever it needs to perform.
> >
> > A typical use case would be to implement wake-up from certain
> > interrupts only (mask non-wake-up IRQs on suspend request, return to
> > the guest for WFI, wake-up returns to the VMM to restore the previous
> > interrupt configuration, resume).
> >
> > Userspace would be free to clear the suspend state and resume the
> > guest, or to reenter WFI.
> 
> Yeah, this is definitely needed for the series.
> 
> Just to make sure it's clear, what should the default behavior be if
> userspace doesn't touch state at all and it calls KVM_RUN again? It
> would seem to me that we should resume the guest by default, much like
> we would do for the SUSPEND event exit.

My mental model is that the suspend state is "sticky". Once set, it
stays and the guest doesn't execute any instruction until cleared by
the VMM.

It has the advantage that the VMM always knows the state the vcpu is
in, because that's what it asked for, and the vcpu can't change state
on its own.

It means that the VMM would have to set the vcpu state to 'RUNNABLE'
once it is satisfied that it can be run.

Thanks,

	M.
Oliver Upton Oct. 5, 2021, 4:02 p.m. UTC | #6
Hey Marc,

On Thu, Sep 30, 2021 at 5:29 AM Marc Zyngier <maz@kernel.org> wrote:
>
> Hi Oliver,
>
> On Thu, 23 Sep 2021 20:16:05 +0100,
> Oliver Upton <oupton@google.com> wrote:
> >
> > ARM DEN0022D 5.19 "SYSTEM_SUSPEND" describes a PSCI call that may be
> > used to request a system be suspended. This is optional for PSCI v1.0
> > and to date KVM has elected to not implement the call. However, a
> > VMM/operator may wish to provide their guests with the ability to
> > suspend/resume, necessitating this PSCI call.
> >
> > Implement support for SYSTEM_SUSPEND according to the prescribed
> > behavior in the specification. Add a new system event exit type,
> > KVM_SYSTEM_EVENT_SUSPEND, to notify userspace when a VM has requested a
> > system suspend. Make KVM_MP_STATE_HALTED a valid state on arm64.
>
> KVM_MP_STATE_HALTED is a per-CPU state on x86 (it denotes HLT). Does
> it make really sense to hijack this for something that is more of a
> VM-wide state? I can see that it is tempting to do so as we're using
> the WFI semantics (which are close to HLT's, in a twisted kind of
> way), but I'm also painfully aware that gluing x86 expectations on
> arm64 rarely leads to a palatable result.
>
> > Userspace can set this to request an in-kernel emulation of the suspend.
> >
> > Signed-off-by: Oliver Upton <oupton@google.com>
> > ---
> >  Documentation/virt/kvm/api.rst    |  6 ++++
> >  arch/arm64/include/asm/kvm_host.h |  3 ++
> >  arch/arm64/kvm/arm.c              |  8 +++++
> >  arch/arm64/kvm/psci.c             | 60 +++++++++++++++++++++++++++++++
> >  include/uapi/linux/kvm.h          |  2 ++
> >  5 files changed, 79 insertions(+)
>
> This patch needs splitting. PSCI interface in one, mpstate stuff in
> another, userspace management last.
>

I agree that the MP state could be spun off into a new patch, but I
think the PSCI interface and UAPI are tightly bound. Explanation
below.

<snip>

> > diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> > index f1a375648e25..d875d3bcf3c5 100644
> > --- a/arch/arm64/kvm/arm.c
> > +++ b/arch/arm64/kvm/arm.c
> > @@ -101,6 +101,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
> >               }
> >               mutex_unlock(&kvm->lock);
> >               break;
> > +     case KVM_CAP_ARM_SYSTEM_SUSPEND:
> > +             r = 0;
> > +             kvm->arch.suspend_enabled = true;
>
> I don't really fancy adding another control here. Given that we now
> have the PSCI version being controlled by a firmware pseudo-register,
> I'd rather have that there.
>

I would generally agree, but I believe we need a default-off switch
for this new UAPI. Otherwise, I can see this change blowing up old
VMMs that are clueless about KVM_EXIT_SYSTEM_SUSPEND.

Parallel to this, it is my understanding that firmware
pseudo-registers should default to the maximum value, such that
userspace can discover what features are available on KVM and old VMMs
can provide new KVM features to its guests.

> And if we do that, I wonder whether there would be any benefit in
> bumping the PSCI version to 1.1.
>

I mean, we already do implement PSCI v1.1, we just don't pick up any
of the optional functions added to the spec. IMO, the reported PSCI
version should have some relation to the spirit of a particular
revision (such as the optional functions).

To that end, since the SYSTEM_SUSPEND PSCI call is 1.0, I didn't think
bumping the FW register was the right call.

> > +             break;
> >       default:
> >               r = -EINVAL;
> >               break;
> > @@ -215,6 +219,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
> >       case KVM_CAP_SET_GUEST_DEBUG:
> >       case KVM_CAP_VCPU_ATTRIBUTES:
> >       case KVM_CAP_PTP_KVM:
> > +     case KVM_CAP_ARM_SYSTEM_SUSPEND:
> >               r = 1;
> >               break;
> >       case KVM_CAP_SET_GUEST_DEBUG2:
> > @@ -470,6 +475,9 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
> >       int ret = 0;
> >
> >       switch (mp_state->mp_state) {
> > +     case KVM_MP_STATE_HALTED:
> > +             kvm_make_request(KVM_REQ_SUSPEND, vcpu);
> > +             fallthrough;
> >       case KVM_MP_STATE_RUNNABLE:
> >               vcpu->arch.power_off = false;
> >               break;
>
> > diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c
> > index d453666ddb83..cf869f1f8615 100644
> > --- a/arch/arm64/kvm/psci.c
> > +++ b/arch/arm64/kvm/psci.c
> > @@ -203,6 +203,46 @@ static void kvm_psci_system_reset(struct kvm_vcpu *vcpu)
> >       kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET);
> >  }
> >
> > +static int kvm_psci_system_suspend(struct kvm_vcpu *vcpu)
> > +{
> > +     unsigned long entry_addr, context_id;
> > +     struct kvm *kvm = vcpu->kvm;
> > +     unsigned long psci_ret = 0;
> > +     struct kvm_vcpu *tmp;
> > +     int ret = 0;
> > +     int i;
> > +
> > +     /*
> > +      * The SYSTEM_SUSPEND PSCI call requires that all vCPUs (except the
> > +      * calling vCPU) be in an OFF state, as determined by the
> > +      * implementation.
> > +      *
> > +      * See ARM DEN0022D, 5.19 "SYSTEM_SUSPEND" for more details.
> > +      */
> > +     mutex_lock(&kvm->lock);
> > +     kvm_for_each_vcpu(i, tmp, kvm) {
> > +             if (tmp != vcpu && !tmp->arch.power_off) {
> > +                     psci_ret = PSCI_RET_DENIED;
> > +                     ret = 1;
> > +                     goto out;
> > +             }
> > +     }
> > +
> > +     entry_addr = smccc_get_arg1(vcpu);
> > +     context_id = smccc_get_arg2(vcpu);
> > +
> > +     kvm_psci_vcpu_request_reset(vcpu, entry_addr, context_id,
> > +                                 kvm_vcpu_is_be(vcpu));
>
> So even if userspace doesn't want to honor the suspend request, the
> CPU ends up resetting? That's not wrong, but maybe a bit surprising.
>

I think it's the only consistent state that we can really put the vCPU
in. If, by default, we refuse the guest's request (returning
INTERNAL_ERROR), then we must also stash the reset context for later
use on the next KVM_RUN if userspace honors the guest. Then we are in
the weird state where vcpu->arch.reset_state is valid, but
KVM_REQ_RESET is not set. In this case, if userspace were to save the
vCPU, the reset context is lost, missing the check from commit
6826c6849b46 ("KVM: arm64: Handle PSCI resets before userspace touches
vCPU state").

Sorry if this is a bit too drawn out, but wanted to share the thought
process behind it in case you had any other ideas.

--
Thanks,
Oliver
diff mbox series

Patch

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index a6729c8cf063..361a57061b8f 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -5656,6 +5656,7 @@  should put the acknowledged interrupt vector into the 'epr' field.
   #define KVM_SYSTEM_EVENT_SHUTDOWN       1
   #define KVM_SYSTEM_EVENT_RESET          2
   #define KVM_SYSTEM_EVENT_CRASH          3
+  #define KVM_SYSTEM_EVENT_SUSPEND        4
 			__u32 type;
 			__u64 flags;
 		} system_event;
@@ -5680,6 +5681,11 @@  Valid values for 'type' are:
    has requested a crash condition maintenance. Userspace can choose
    to ignore the request, or to gather VM memory core dump and/or
    reset/shutdown of the VM.
+ - KVM_SYSTEM_EVENT_SUSPEND -- the guest has requested that the VM
+   suspends. Userspace is not obliged to honor this, and may call KVM_RUN
+   again. Doing so will cause the guest to resume at its requested entry
+   point. For ARM64, userspace can request in-kernel suspend emulation
+   by setting the vCPU's MP state to KVM_MP_STATE_HALTED.
 
 ::
 
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 1beda1189a15..441eb6fa7adc 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -137,6 +137,9 @@  struct kvm_arch {
 
 	/* Memory Tagging Extension enabled for the guest */
 	bool mte_enabled;
+
+	/* PSCI SYSTEM_SUSPEND call enabled for the guest */
+	bool suspend_enabled;
 };
 
 struct kvm_vcpu_fault_info {
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index f1a375648e25..d875d3bcf3c5 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -101,6 +101,10 @@  int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 		}
 		mutex_unlock(&kvm->lock);
 		break;
+	case KVM_CAP_ARM_SYSTEM_SUSPEND:
+		r = 0;
+		kvm->arch.suspend_enabled = true;
+		break;
 	default:
 		r = -EINVAL;
 		break;
@@ -215,6 +219,7 @@  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_SET_GUEST_DEBUG:
 	case KVM_CAP_VCPU_ATTRIBUTES:
 	case KVM_CAP_PTP_KVM:
+	case KVM_CAP_ARM_SYSTEM_SUSPEND:
 		r = 1;
 		break;
 	case KVM_CAP_SET_GUEST_DEBUG2:
@@ -470,6 +475,9 @@  int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 	int ret = 0;
 
 	switch (mp_state->mp_state) {
+	case KVM_MP_STATE_HALTED:
+		kvm_make_request(KVM_REQ_SUSPEND, vcpu);
+		fallthrough;
 	case KVM_MP_STATE_RUNNABLE:
 		vcpu->arch.power_off = false;
 		break;
diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c
index d453666ddb83..cf869f1f8615 100644
--- a/arch/arm64/kvm/psci.c
+++ b/arch/arm64/kvm/psci.c
@@ -203,6 +203,46 @@  static void kvm_psci_system_reset(struct kvm_vcpu *vcpu)
 	kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET);
 }
 
+static int kvm_psci_system_suspend(struct kvm_vcpu *vcpu)
+{
+	unsigned long entry_addr, context_id;
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long psci_ret = 0;
+	struct kvm_vcpu *tmp;
+	int ret = 0;
+	int i;
+
+	/*
+	 * The SYSTEM_SUSPEND PSCI call requires that all vCPUs (except the
+	 * calling vCPU) be in an OFF state, as determined by the
+	 * implementation.
+	 *
+	 * See ARM DEN0022D, 5.19 "SYSTEM_SUSPEND" for more details.
+	 */
+	mutex_lock(&kvm->lock);
+	kvm_for_each_vcpu(i, tmp, kvm) {
+		if (tmp != vcpu && !tmp->arch.power_off) {
+			psci_ret = PSCI_RET_DENIED;
+			ret = 1;
+			goto out;
+		}
+	}
+
+	entry_addr = smccc_get_arg1(vcpu);
+	context_id = smccc_get_arg2(vcpu);
+
+	kvm_psci_vcpu_request_reset(vcpu, entry_addr, context_id,
+				    kvm_vcpu_is_be(vcpu));
+
+	memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
+	vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SUSPEND;
+	vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+out:
+	mutex_unlock(&kvm->lock);
+	smccc_set_retval(vcpu, psci_ret, 0, 0, 0);
+	return ret;
+}
+
 static void kvm_psci_narrow_to_32bit(struct kvm_vcpu *vcpu)
 {
 	int i;
@@ -223,6 +263,14 @@  static unsigned long kvm_psci_check_allowed_function(struct kvm_vcpu *vcpu, u32
 	if ((fn & PSCI_0_2_64BIT) && vcpu_mode_is_32bit(vcpu))
 		return PSCI_RET_NOT_SUPPORTED;
 
+	switch (fn) {
+	case PSCI_1_0_FN_SYSTEM_SUSPEND:
+	case PSCI_1_0_FN64_SYSTEM_SUSPEND:
+		if (!vcpu->kvm->arch.suspend_enabled)
+			return PSCI_RET_NOT_SUPPORTED;
+		break;
+	}
+
 	return 0;
 }
 
@@ -316,6 +364,10 @@  static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu)
 	unsigned long val;
 	int ret = 1;
 
+	val = kvm_psci_check_allowed_function(vcpu, psci_fn);
+	if (val)
+		goto out;
+
 	switch(psci_fn) {
 	case PSCI_0_2_FN_PSCI_VERSION:
 		val = KVM_ARM_PSCI_1_0;
@@ -339,6 +391,8 @@  static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu)
 		case PSCI_0_2_FN_SYSTEM_OFF:
 		case PSCI_0_2_FN_SYSTEM_RESET:
 		case PSCI_1_0_FN_PSCI_FEATURES:
+		case PSCI_1_0_FN_SYSTEM_SUSPEND:
+		case PSCI_1_0_FN64_SYSTEM_SUSPEND:
 		case ARM_SMCCC_VERSION_FUNC_ID:
 			val = 0;
 			break;
@@ -347,10 +401,16 @@  static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu)
 			break;
 		}
 		break;
+	case PSCI_1_0_FN_SYSTEM_SUSPEND:
+		kvm_psci_narrow_to_32bit(vcpu);
+		fallthrough;
+	case PSCI_1_0_FN64_SYSTEM_SUSPEND:
+		return kvm_psci_system_suspend(vcpu);
 	default:
 		return kvm_psci_0_2_call(vcpu);
 	}
 
+out:
 	smccc_set_retval(vcpu, val, 0, 0, 0);
 	return ret;
 }
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index a067410ebea5..052b0e717b08 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -433,6 +433,7 @@  struct kvm_run {
 #define KVM_SYSTEM_EVENT_SHUTDOWN       1
 #define KVM_SYSTEM_EVENT_RESET          2
 #define KVM_SYSTEM_EVENT_CRASH          3
+#define KVM_SYSTEM_EVENT_SUSPEND        4
 			__u32 type;
 			__u64 flags;
 		} system_event;
@@ -1112,6 +1113,7 @@  struct kvm_ppc_resize_hpt {
 #define KVM_CAP_BINARY_STATS_FD 203
 #define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204
 #define KVM_CAP_ARM_MTE 205
+#define KVM_CAP_ARM_SYSTEM_SUSPEND 206
 
 #ifdef KVM_CAP_IRQ_ROUTING