[PATCHv2,4/4] arm64: add host pv-vcpu-state support

Message ID 20210709043713.887098-5-senozhatsky@chromium.org (mailing list archive)
State New, archived
Series arm64:kvm: teach guest sched that VCPUs can be preempted

Commit Message

Sergey Senozhatsky July 9, 2021, 4:37 a.m. UTC
Add PV-vcpu-state support bits to the host. The host uses the guest's
per-CPU PV-state pointers to update the VCPU state each time it
kvm_arch_vcpu_load()s or kvm_arch_vcpu_put()s the VCPU, so that the
guest scheduler becomes aware that not all VCPUs are always available.
Currently the guest scheduler on arm64 always assumes that all CPUs
are available, because vcpu_is_preempted() is not implemented on arm64.
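
For illustration, a minimal sketch of the guest-side consumer this host
support enables (the guest-side patches live elsewhere in this series;
the struct layout and names below are assumptions, not the actual ABI).
Each guest CPU would register the address of its structure with the
host via the ARM_SMCCC_HV_PV_VCPU_STATE_INIT hypercall added here:

/* Hypothetical shared per-CPU state; the real layout is defined by the series. */
struct vcpu_state {
	__u8 preempted;
	__u8 reserved[63];
};

static DEFINE_PER_CPU(struct vcpu_state, vcpu_states);

/*
 * The host writes .preempted on kvm_arch_vcpu_load()/kvm_arch_vcpu_put();
 * the guest only ever reads it, so a plain READ_ONCE() is enough.
 */
static bool pv_vcpu_is_preempted(int cpu)
{
	return !!READ_ONCE(per_cpu(vcpu_states, cpu).preempted);
}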

- schbench -t 3 -m 3 -p 4096

  Latency percentiles (usec)
        BASE
================================================
        50.0th: 1 (3556427 samples)
        75.0th: 13 (879210 samples)
        90.0th: 15 (893311 samples)
        95.0th: 18 (159594 samples)
        *99.0th: 118 (224187 samples)
        99.5th: 691 (28555 samples)
        99.9th: 7384 (23076 samples)
        min=1, max=104218
avg worker transfer: 25192.00 ops/sec 98.41MB/s

        PATCHED
================================================
        50.0th: 1 (3507010 samples)
        75.0th: 13 (1635775 samples)
        90.0th: 16 (901271 samples)
        95.0th: 24 (281051 samples)
        *99.0th: 114 (255581 samples)
        99.5th: 382 (33051 samples)
        99.9th: 6392 (26592 samples)
        min=1, max=83877
avg worker transfer: 28613.39 ops/sec 111.77MB/s

- perf bench sched all

  ops/sec
        BASE                 PATCHED
================================================
        33452		     36485
        33541		     39405
        33365		     36858
        33455		     38047
        33449		     37866
        33616		     34922
        33479		     34388
        33594		     37203
        33458		     35363
        33704		     35180

Student's T-test

         N           Min           Max        Median           Avg        Stddev
base     10         33365         33704         33479       33511.3     100.92467
patched  10         34388         39405         36858       36571.7      1607.454
Difference at 95.0% confidence
	3060.4 +/- 1070.09
	9.13244% +/- 3.19321%
	(Student's t, pooled s = 1138.88)

Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
---
 arch/arm64/include/asm/kvm_host.h | 23 +++++++++++
 arch/arm64/kvm/Makefile           |  3 +-
 arch/arm64/kvm/arm.c              |  3 ++
 arch/arm64/kvm/hypercalls.c       | 11 ++++++
 arch/arm64/kvm/pv-vcpu-state.c    | 64 +++++++++++++++++++++++++++++++
 5 files changed, 103 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/kvm/pv-vcpu-state.c

Comments

Marc Zyngier July 12, 2021, 4:24 p.m. UTC | #1
On Fri, 09 Jul 2021 05:37:13 +0100,
Sergey Senozhatsky <senozhatsky@chromium.org> wrote:
> 
> Add PV-vcpu-state support bits to the host. Host uses the guest
> PV-state per-CPU pointers to update the VCPU state each time
> it kvm_arch_vcpu_load() or kvm_arch_vcpu_put() the VCPU, so
> that guest scheduler can become aware of the fact that not
> all VCPUs are always available. Currently guest scheduler
> on amr64 always assumes that all CPUs are available because

arm64

> vcpu_is_preempted() is not implemented on arm64.
> 
> - schbench -t 3 -m 3 -p 4096
> 
>   Latency percentiles (usec)
>         BASE
> ================================================
>         50.0th: 1 (3556427 samples)
>         75.0th: 13 (879210 samples)
>         90.0th: 15 (893311 samples)
>         95.0th: 18 (159594 samples)
>         *99.0th: 118 (224187 samples)
>         99.5th: 691 (28555 samples)
>         99.9th: 7384 (23076 samples)
>         min=1, max=104218
> avg worker transfer: 25192.00 ops/sec 98.41MB/s
> 
>         PATCHED
> ================================================
>         50.0th: 1 (3507010 samples)
>         75.0th: 13 (1635775 samples)
>         90.0th: 16 (901271 samples)
>         95.0th: 24 (281051 samples)
>         *99.0th: 114 (255581 samples)
>         99.5th: 382 (33051 samples)
>         99.9th: 6392 (26592 samples)
>         min=1, max=83877
> avg worker transfer: 28613.39 ops/sec 111.77MB/s
> 
> - perf bench sched all
> 
>   ops/sec
>         BASE                 PATCHED
> ================================================
>         33452		     36485
>         33541		     39405
>         33365		     36858
>         33455		     38047
>         33449		     37866
>         33616		     34922
>         33479		     34388
>         33594		     37203
>         33458		     35363
>         33704		     35180
> 
> Student's T-test
> 
>          N           Min           Max        Median           Avg        Stddev
> base     10         33365         33704         33479       33511.3     100.92467
> patched  10         34388         39405         36858       36571.7      1607.454
> Difference at 95.0% confidence
> 	3060.4 +/- 1070.09
> 	9.13244% +/- 3.19321%
> 	(Student's t, pooled s = 1138.88)
> 
> Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> ---
>  arch/arm64/include/asm/kvm_host.h | 23 +++++++++++
>  arch/arm64/kvm/Makefile           |  3 +-
>  arch/arm64/kvm/arm.c              |  3 ++
>  arch/arm64/kvm/hypercalls.c       | 11 ++++++
>  arch/arm64/kvm/pv-vcpu-state.c    | 64 +++++++++++++++++++++++++++++++
>  5 files changed, 103 insertions(+), 1 deletion(-)
>  create mode 100644 arch/arm64/kvm/pv-vcpu-state.c
> 
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 41911585ae0c..e782f4d0c916 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -381,6 +381,12 @@ struct kvm_vcpu_arch {
>  		u64 last_steal;
>  		gpa_t base;
>  	} steal;
> +
> +	/* PV state of the VCPU */
> +	struct {
> +		gpa_t base;
> +		struct gfn_to_hva_cache ghc;

Please stay consistent with what we use for steal time accounting
(i.e. don't use gfn_to_hva_cache, but directly use a gpa with
kvm_put_guest()). If you can demonstrate that there is an unacceptable
overhead in doing so, then both steal time and preemption will be
updated at the same time.


> +	} vcpu_state;
>  };
>  
>  /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
> @@ -695,6 +701,23 @@ static inline bool kvm_arm_is_pvtime_enabled(struct kvm_vcpu_arch *vcpu_arch)
>  	return (vcpu_arch->steal.base != GPA_INVALID);
>  }
>  
> +int kvm_init_vcpu_state(struct kvm_vcpu *vcpu, gfn_t addr);
> +int kvm_release_vcpu_state(struct kvm_vcpu *vcpu);
> +
> +static inline void kvm_arm_vcpu_state_init(struct kvm_vcpu_arch *vcpu_arch)
> +{
> +	vcpu_arch->vcpu_state.base = GPA_INVALID;
> +	memset(&vcpu_arch->vcpu_state.ghc, 0, sizeof(struct gfn_to_hva_cache));

Does it really need to be inlined?

> +}
> +
> +static inline bool
> +kvm_arm_is_vcpu_state_enabled(struct kvm_vcpu_arch *vcpu_arch)
> +{
> +	return (vcpu_arch->vcpu_state.base != GPA_INVALID);
> +}
> +
> +void kvm_update_vcpu_preempted(struct kvm_vcpu *vcpu, bool preempted);
> +
>  void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome);
>  
>  struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
> diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
> index 989bb5dad2c8..2a3ee82c6d90 100644
> --- a/arch/arm64/kvm/Makefile
> +++ b/arch/arm64/kvm/Makefile
> @@ -12,7 +12,8 @@ obj-$(CONFIG_KVM) += hyp/
>  
>  kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \
>  	 $(KVM)/vfio.o $(KVM)/irqchip.o $(KVM)/binary_stats.o \
> -	 arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \
> +	 arm.o mmu.o mmio.o psci.o perf.o hypercalls.o \
> +	 pvtime.o pv-vcpu-state.o \
>  	 inject_fault.o va_layout.o handle_exit.o \
>  	 guest.o debug.o reset.o sys_regs.o \
>  	 vgic-sys-reg-v3.o fpsimd.o pmu.o \
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index e9a2b8f27792..43e995c9fddb 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -332,6 +332,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
>  	kvm_arm_reset_debug_ptr(vcpu);
>  
>  	kvm_arm_pvtime_vcpu_init(&vcpu->arch);
> +	kvm_arm_vcpu_state_init(&vcpu->arch);
>  
>  	vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
>  
> @@ -429,10 +430,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>  	if (vcpu_has_ptrauth(vcpu))
>  		vcpu_ptrauth_disable(vcpu);
>  	kvm_arch_vcpu_load_debug_state_flags(vcpu);
> +	kvm_update_vcpu_preempted(vcpu, false);
>  }
>  
>  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>  {
> +	kvm_update_vcpu_preempted(vcpu, true);

This doesn't look right. With this, you are now telling the guest that
a vcpu that is blocked on WFI is preempted. This really isn't the
case, as it has voluntarily entered a low-power mode while waiting for
an interrupt. Indeed, the vcpu isn't running. A physical CPU wouldn't
be running either.

>  	kvm_arch_vcpu_put_debug_state_flags(vcpu);
>  	kvm_arch_vcpu_put_fp(vcpu);
>  	if (has_vhe())
> diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
> index 30da78f72b3b..95bcf86e0b6f 100644
> --- a/arch/arm64/kvm/hypercalls.c
> +++ b/arch/arm64/kvm/hypercalls.c
> @@ -110,6 +110,9 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
>  		case ARM_SMCCC_HV_PV_TIME_FEATURES:
>  			val[0] = SMCCC_RET_SUCCESS;
>  			break;
> +		case ARM_SMCCC_HV_PV_VCPU_STATE_FEATURES:
> +			val[0] = SMCCC_RET_SUCCESS;
> +			break;
>  		}
>  		break;
>  	case ARM_SMCCC_HV_PV_TIME_FEATURES:
> @@ -139,6 +142,14 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
>  	case ARM_SMCCC_TRNG_RND32:
>  	case ARM_SMCCC_TRNG_RND64:
>  		return kvm_trng_call(vcpu);
> +	case ARM_SMCCC_HV_PV_VCPU_STATE_INIT:
> +		if (kvm_init_vcpu_state(vcpu, smccc_get_arg1(vcpu)) == 0)
> +			val[0] = SMCCC_RET_SUCCESS;
> +		break;
> +	case ARM_SMCCC_HV_PV_VCPU_STATE_RELEASE:
> +		if (kvm_release_vcpu_state(vcpu) == 0)
> +			val[0] = SMCCC_RET_SUCCESS;
> +		break;
>  	default:
>  		return kvm_psci_call(vcpu);
>  	}
> diff --git a/arch/arm64/kvm/pv-vcpu-state.c b/arch/arm64/kvm/pv-vcpu-state.c
> new file mode 100644
> index 000000000000..8496bb2a5966
> --- /dev/null
> +++ b/arch/arm64/kvm/pv-vcpu-state.c
> @@ -0,0 +1,64 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +
> +#include <linux/arm-smccc.h>
> +#include <linux/kvm_host.h>
> +
> +#include <asm/kvm_mmu.h>
> +#include <asm/paravirt.h>
> +
> +#include <kvm/arm_psci.h>

Why this include?

> +
> +int kvm_init_vcpu_state(struct kvm_vcpu *vcpu, gpa_t addr)
> +{
> +	struct kvm *kvm = vcpu->kvm;
> +	int ret;
> +	u64 idx;
> +
> +	if (kvm_arm_is_vcpu_state_enabled(&vcpu->arch))
> +		return 0;
> +
> +	idx = srcu_read_lock(&kvm->srcu);
> +	ret = kvm_gfn_to_hva_cache_init(vcpu->kvm,
> +					&vcpu->arch.vcpu_state.ghc,
> +					addr,
> +					sizeof(struct vcpu_state));
> +	srcu_read_unlock(&kvm->srcu, idx);
> +
> +	if (!ret)
> +		vcpu->arch.vcpu_state.base = addr;
> +	return ret;
> +}
> +
> +int kvm_release_vcpu_state(struct kvm_vcpu *vcpu)
> +{
> +	if (!kvm_arm_is_vcpu_state_enabled(&vcpu->arch))
> +		return 0;
> +
> +	kvm_arm_vcpu_state_init(&vcpu->arch);
> +	return 0;
> +}
> +
> +void kvm_update_vcpu_preempted(struct kvm_vcpu *vcpu, bool preempted)
> +{
> +	struct kvm *kvm = vcpu->kvm;
> +	u64 idx;
> +
> +	if (!kvm_arm_is_vcpu_state_enabled(&vcpu->arch))
> +		return;
> +
> +	/*
> +	 * This function is called from atomic context, so we need to
> +	 * disable page faults. kvm_write_guest_cached() will call
> +	 * might_fault().
> +	 */
> +	pagefault_disable();
> +	/*
> +	 * Need to take the SRCU lock because kvm_write_guest_offset_cached()
> +	 * calls kvm_memslots();
> +	 */
> +	idx = srcu_read_lock(&kvm->srcu);
> +	kvm_write_guest_cached(kvm, &vcpu->arch.vcpu_state.ghc,
> +			       &preempted, sizeof(bool));
> +	srcu_read_unlock(&kvm->srcu, idx);
> +	pagefault_enable();
> +}

Before rushing into an implementation, this really could do with some
specification:

- where is the memory allocated?

- what is the lifetime of the memory?

- what is its expected memory type?

- what is the coherency model (what if the guest runs with caching
  disabled, for example)?

- is the functionality preserved across a VM reset?

- what is the strategy w.r.t live migration?

- can userspace detect that the feature is implemented?

- can userspace prevent the feature from being used?

- where is the documentation?

Thanks,

	M.
Joel Fernandes July 20, 2021, 6:44 p.m. UTC | #2
On Mon, Jul 12, 2021 at 12:24 PM Marc Zyngier <maz@kernel.org> wrote:
>
[...]
> > +}
> > +
> > +static inline bool
> > +kvm_arm_is_vcpu_state_enabled(struct kvm_vcpu_arch *vcpu_arch)
> > +{
> > +     return (vcpu_arch->vcpu_state.base != GPA_INVALID);
> > +}
> > +
> > +void kvm_update_vcpu_preempted(struct kvm_vcpu *vcpu, bool preempted);
> > +
> >  void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome);
> >
> >  struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
> > diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
> > index 989bb5dad2c8..2a3ee82c6d90 100644
> > --- a/arch/arm64/kvm/Makefile
> > +++ b/arch/arm64/kvm/Makefile
> > @@ -12,7 +12,8 @@ obj-$(CONFIG_KVM) += hyp/
> >
> >  kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \
> >        $(KVM)/vfio.o $(KVM)/irqchip.o $(KVM)/binary_stats.o \
> > -      arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \
> > +      arm.o mmu.o mmio.o psci.o perf.o hypercalls.o \
> > +      pvtime.o pv-vcpu-state.o \
> >        inject_fault.o va_layout.o handle_exit.o \
> >        guest.o debug.o reset.o sys_regs.o \
> >        vgic-sys-reg-v3.o fpsimd.o pmu.o \
> > diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> > index e9a2b8f27792..43e995c9fddb 100644
> > --- a/arch/arm64/kvm/arm.c
> > +++ b/arch/arm64/kvm/arm.c
> > @@ -332,6 +332,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
> >       kvm_arm_reset_debug_ptr(vcpu);
> >
> >       kvm_arm_pvtime_vcpu_init(&vcpu->arch);
> > +     kvm_arm_vcpu_state_init(&vcpu->arch);
> >
> >       vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
> >
> > @@ -429,10 +430,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> >       if (vcpu_has_ptrauth(vcpu))
> >               vcpu_ptrauth_disable(vcpu);
> >       kvm_arch_vcpu_load_debug_state_flags(vcpu);
> > +     kvm_update_vcpu_preempted(vcpu, false);
> >  }
> >
> >  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
> >  {
> > +     kvm_update_vcpu_preempted(vcpu, true);
>
> This doesn't look right. With this, you are now telling the guest that
> a vcpu that is blocked on WFI is preempted. This really isn't the
> case, as it has voluntarily entered a low-power mode while waiting for
> an interrupt. Indeed, the vcpu isn't running. A physical CPU wouldn't
> be running either.

Can that be cured by just checking vcpu->preempted before calling
kvm_update_vcpu_preempted() ?

- Joel
Sergey Senozhatsky July 21, 2021, 1:15 a.m. UTC | #3
On (21/07/12 17:24), Marc Zyngier wrote:
> >  
> >  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
> >  {
> > +	kvm_update_vcpu_preempted(vcpu, true);
> 
> This doesn't look right. With this, you are now telling the guest that
> a vcpu that is blocked on WFI is preempted. This really isn't the
> case, as it has voluntarily entered a low-power mode while waiting for
> an interrupt. Indeed, the vcpu isn't running. A physical CPU wouldn't
> be running either.

I suppose you are talking about kvm_vcpu_block(). Well, it checks
kvm_vcpu_check_block() but then it simply schedule() out the vcpu
process, which does look like "the vcpu is preempted". Once we
sched_in() that vcpu process again we mark it as non-preempted,
even though it remains in kvm wfx handler. Why isn't it right?

Another call path is iret:

<iret>
__schedule()
 context_switch()
  prepare_task_switch()
   fire_sched_out_preempt_notifiers()
    kvm_sched_out()
     kvm_arch_vcpu_put()
Marc Zyngier July 21, 2021, 8:40 a.m. UTC | #4
On Tue, 20 Jul 2021 19:44:53 +0100,
Joel Fernandes <joelaf@google.com> wrote:
> 
> On Mon, Jul 12, 2021 at 12:24 PM Marc Zyngier <maz@kernel.org> wrote:
> >
> [...]
> > >  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
> > >  {
> > > +     kvm_update_vcpu_preempted(vcpu, true);
> >
> > This doesn't look right. With this, you are now telling the guest that
> > a vcpu that is blocked on WFI is preempted. This really isn't the
> > case, as it has voluntarily entered a low-power mode while waiting for
> > an interrupt. Indeed, the vcpu isn't running. A physical CPU wouldn't
> > be running either.
> 
> Can that be cured by just checking vcpu->preempted before calling
> kvm_update_vcpu_preempted() ?

It isn't obvious to me that this is the right thing to do.
vcpu->preempted is always updated on sched-out from the preempt
notifier if the vcpu was on the run-queue, so my guess is that it will
always be set when switching to another task.

What you probably want is to check whether the vcpu is blocked by
introspecting the wait-queue with:

	rcuwait_active(kvm_arch_vcpu_get_wait(vcpu))

which will tell you whether you are blocking or not. We are already
using a similar construct for arming a background timer in this case.

	M.
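
A hedged sketch of the check described above, gating the update in
kvm_arch_vcpu_put() on the vcpu wait object so that a vcpu blocked on
WFI/WFE is not reported as preempted; whether a respin of the series
takes exactly this shape is an open question:

void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
	/*
	 * A vcpu parked in WFI/WFE is idle, not preempted: only report
	 * "preempted" when there is no waiter on the vcpu's wait object.
	 */
	if (!rcuwait_active(kvm_arch_vcpu_get_wait(vcpu)))
		kvm_update_vcpu_preempted(vcpu, true);

	/* ... rest of kvm_arch_vcpu_put() unchanged ... */
	kvm_arch_vcpu_put_debug_state_flags(vcpu);
}
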
Marc Zyngier July 21, 2021, 9:10 a.m. UTC | #5
On Wed, 21 Jul 2021 02:15:47 +0100,
Sergey Senozhatsky <senozhatsky@chromium.org> wrote:
> 
> On (21/07/12 17:24), Marc Zyngier wrote:
> > >  
> > >  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
> > >  {
> > > +	kvm_update_vcpu_preempted(vcpu, true);
> > 
> > This doesn't look right. With this, you are now telling the guest that
> > a vcpu that is blocked on WFI is preempted. This really isn't the
> > case, as it has voluntarily entered a low-power mode while waiting for
> > an interrupt. Indeed, the vcpu isn't running. A physical CPU wouldn't
> > be running either.
> 
> I suppose you are talking about kvm_vcpu_block().

kvm_vcpu_block() is how things are implemented. WFI is the instruction
I'm concerned about.

> Well, it checks kvm_vcpu_check_block() but then it simply schedule()
> out the vcpu process, which does look like "the vcpu is
> preempted". Once we sched_in() that vcpu process again we mark it as
> non-preempted, even though it remains in kvm wfx handler. Why isn't
> it right?

Because the vcpu hasn't been "preempted". It has *voluntarily* gone
into a low-power mode, and how KVM implements this "low-power mode" is
none of the guest's business. This is exactly the same behaviour that
you will have on bare metal. From a Linux guest perspective, the vcpu
is *idle*, not doing anything, and only waiting for an interrupt to
start executing again.

This is a fundamentally different concept from preempting a vcpu
because its time-slice is up. In this second case, you can indeed
mitigate things by exposing steal time and preemption status as you
break the illusion of a machine that is completely controlled by the
guest.

If the "resched on interrupt delivery while blocked on WFI" is too slow
for you, then *that* is the thing that needs addressing. Feeding extra
state to the guest doesn't help.

> Another call path is iret:
> 
> <iret>
> __schedule()
>  context_switch()
>   prepare_task_switch()
>    fire_sched_out_preempt_notifiers()
>     kvm_sched_out()
>      kvm_arch_vcpu_put()

I'm not sure how an x86 concept is relevant here.

Thanks,

	M.
Sergey Senozhatsky July 21, 2021, 10:38 a.m. UTC | #6
On (21/07/21 09:40), Marc Zyngier wrote:
> > 
> > Can that be cured by just checking vcpu->preempted before calling
> > kvm_update_vcpu_preempted() ?
> 
> It isn't obvious to me that this is the right thing to do.
> vcpu->preempted is always updated on sched-out from the preempt
> notifier if the vcpu was on the run-queue, so my guess is that it will
> always be set when switching to another task.
> 
> What you probably want is to check whether the vcpu is blocked by
> introspecting the wait-queue with:
> 
> 	rcuwait_active(kvm_arch_vcpu_get_wait(vcpu))
> 
> which will tell you whether you are blocking or not. We are already
> using a similar construct for arming a background timer in this case.

Can we examine if vcpu->run->exit_reason == WFE/WFI and avoid setting
preempted state if so?
Marc Zyngier July 21, 2021, 11:08 a.m. UTC | #7
On Wed, 21 Jul 2021 11:38:40 +0100,
Sergey Senozhatsky <senozhatsky@chromium.org> wrote:
> 
> On (21/07/21 09:40), Marc Zyngier wrote:
> > > 
> > > Can that be cured by just checking vcpu->preempted before calling
> > > kvm_update_vcpu_preempted() ?
> > 
> > It isn't obvious to me that this is the right thing to do.
> > vcpu->preempted is always updated on sched-out from the preempt
> > notifier if the vcpu was on the run-queue, so my guess is that it will
> > always be set when switching to another task.
> > 
> > What you probably want is to check whether the vcpu is blocked by
> > introspecting the wait-queue with:
> > 
> > 	rcuwait_active(kvm_arch_vcpu_get_wait(vcpu))
> > 
> > which will tell you whether you are blocking or not. We are already
> > using a similar construct for arming a background timer in this case.
> 
> Can we examine if vcpu->run->exit_reason == WFE/WFI and avoid setting
> preempted state if so?

We never go back to userspace for WFI/WFE, so no reason to populate
the run structure.

Checking for the blocked state is the right thing to do, and we
already have the primitive for this. Just use it.

	M.

Patch

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 41911585ae0c..e782f4d0c916 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -381,6 +381,12 @@  struct kvm_vcpu_arch {
 		u64 last_steal;
 		gpa_t base;
 	} steal;
+
+	/* PV state of the VCPU */
+	struct {
+		gpa_t base;
+		struct gfn_to_hva_cache ghc;
+	} vcpu_state;
 };
 
 /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
@@ -695,6 +701,23 @@  static inline bool kvm_arm_is_pvtime_enabled(struct kvm_vcpu_arch *vcpu_arch)
 	return (vcpu_arch->steal.base != GPA_INVALID);
 }
 
+int kvm_init_vcpu_state(struct kvm_vcpu *vcpu, gfn_t addr);
+int kvm_release_vcpu_state(struct kvm_vcpu *vcpu);
+
+static inline void kvm_arm_vcpu_state_init(struct kvm_vcpu_arch *vcpu_arch)
+{
+	vcpu_arch->vcpu_state.base = GPA_INVALID;
+	memset(&vcpu_arch->vcpu_state.ghc, 0, sizeof(struct gfn_to_hva_cache));
+}
+
+static inline bool
+kvm_arm_is_vcpu_state_enabled(struct kvm_vcpu_arch *vcpu_arch)
+{
+	return (vcpu_arch->vcpu_state.base != GPA_INVALID);
+}
+
+void kvm_update_vcpu_preempted(struct kvm_vcpu *vcpu, bool preempted);
+
 void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome);
 
 struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 989bb5dad2c8..2a3ee82c6d90 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -12,7 +12,8 @@  obj-$(CONFIG_KVM) += hyp/
 
 kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \
 	 $(KVM)/vfio.o $(KVM)/irqchip.o $(KVM)/binary_stats.o \
-	 arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \
+	 arm.o mmu.o mmio.o psci.o perf.o hypercalls.o \
+	 pvtime.o pv-vcpu-state.o \
 	 inject_fault.o va_layout.o handle_exit.o \
 	 guest.o debug.o reset.o sys_regs.o \
 	 vgic-sys-reg-v3.o fpsimd.o pmu.o \
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index e9a2b8f27792..43e995c9fddb 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -332,6 +332,7 @@  int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 	kvm_arm_reset_debug_ptr(vcpu);
 
 	kvm_arm_pvtime_vcpu_init(&vcpu->arch);
+	kvm_arm_vcpu_state_init(&vcpu->arch);
 
 	vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
 
@@ -429,10 +430,12 @@  void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	if (vcpu_has_ptrauth(vcpu))
 		vcpu_ptrauth_disable(vcpu);
 	kvm_arch_vcpu_load_debug_state_flags(vcpu);
+	kvm_update_vcpu_preempted(vcpu, false);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
+	kvm_update_vcpu_preempted(vcpu, true);
 	kvm_arch_vcpu_put_debug_state_flags(vcpu);
 	kvm_arch_vcpu_put_fp(vcpu);
 	if (has_vhe())
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index 30da78f72b3b..95bcf86e0b6f 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -110,6 +110,9 @@  int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
 		case ARM_SMCCC_HV_PV_TIME_FEATURES:
 			val[0] = SMCCC_RET_SUCCESS;
 			break;
+		case ARM_SMCCC_HV_PV_VCPU_STATE_FEATURES:
+			val[0] = SMCCC_RET_SUCCESS;
+			break;
 		}
 		break;
 	case ARM_SMCCC_HV_PV_TIME_FEATURES:
@@ -139,6 +142,14 @@  int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
 	case ARM_SMCCC_TRNG_RND32:
 	case ARM_SMCCC_TRNG_RND64:
 		return kvm_trng_call(vcpu);
+	case ARM_SMCCC_HV_PV_VCPU_STATE_INIT:
+		if (kvm_init_vcpu_state(vcpu, smccc_get_arg1(vcpu)) == 0)
+			val[0] = SMCCC_RET_SUCCESS;
+		break;
+	case ARM_SMCCC_HV_PV_VCPU_STATE_RELEASE:
+		if (kvm_release_vcpu_state(vcpu) == 0)
+			val[0] = SMCCC_RET_SUCCESS;
+		break;
 	default:
 		return kvm_psci_call(vcpu);
 	}
diff --git a/arch/arm64/kvm/pv-vcpu-state.c b/arch/arm64/kvm/pv-vcpu-state.c
new file mode 100644
index 000000000000..8496bb2a5966
--- /dev/null
+++ b/arch/arm64/kvm/pv-vcpu-state.c
@@ -0,0 +1,64 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/arm-smccc.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_mmu.h>
+#include <asm/paravirt.h>
+
+#include <kvm/arm_psci.h>
+
+int kvm_init_vcpu_state(struct kvm_vcpu *vcpu, gpa_t addr)
+{
+	struct kvm *kvm = vcpu->kvm;
+	int ret;
+	u64 idx;
+
+	if (kvm_arm_is_vcpu_state_enabled(&vcpu->arch))
+		return 0;
+
+	idx = srcu_read_lock(&kvm->srcu);
+	ret = kvm_gfn_to_hva_cache_init(vcpu->kvm,
+					&vcpu->arch.vcpu_state.ghc,
+					addr,
+					sizeof(struct vcpu_state));
+	srcu_read_unlock(&kvm->srcu, idx);
+
+	if (!ret)
+		vcpu->arch.vcpu_state.base = addr;
+	return ret;
+}
+
+int kvm_release_vcpu_state(struct kvm_vcpu *vcpu)
+{
+	if (!kvm_arm_is_vcpu_state_enabled(&vcpu->arch))
+		return 0;
+
+	kvm_arm_vcpu_state_init(&vcpu->arch);
+	return 0;
+}
+
+void kvm_update_vcpu_preempted(struct kvm_vcpu *vcpu, bool preempted)
+{
+	struct kvm *kvm = vcpu->kvm;
+	u64 idx;
+
+	if (!kvm_arm_is_vcpu_state_enabled(&vcpu->arch))
+		return;
+
+	/*
+	 * This function is called from atomic context, so we need to
+	 * disable page faults. kvm_write_guest_cached() will call
+	 * might_fault().
+	 */
+	pagefault_disable();
+	/*
+	 * Need to take the SRCU lock because kvm_write_guest_offset_cached()
+	 * calls kvm_memslots();
+	 */
+	idx = srcu_read_lock(&kvm->srcu);
+	kvm_write_guest_cached(kvm, &vcpu->arch.vcpu_state.ghc,
+			       &preempted, sizeof(bool));
+	srcu_read_unlock(&kvm->srcu, idx);
+	pagefault_enable();
+}