
[v2,09/18] arm64: KVM: enable conditional save/restore full SPE profiling buffer controls

Message ID 20191220143025.33853-10-andrew.murray@arm.com (mailing list archive)
State New, archived
Series arm64: KVM: add SPE profiling support

Commit Message

Andrew Murray Dec. 20, 2019, 2:30 p.m. UTC
From: Sudeep Holla <sudeep.holla@arm.com>

Now that we can save/restore the full SPE controls, we can enable it
if SPE is set up and ready to use in KVM. It's supported in KVM only if
all the CPUs in the system support SPE.

However, to support heterogeneous systems, we need to move the check for
whether the host supports SPE and do a partial save/restore.

Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Andrew Murray <andrew.murray@arm.com>
---
 arch/arm64/kvm/hyp/debug-sr.c | 33 ++++++++++++++++-----------------
 include/kvm/arm_spe.h         |  6 ++++++
 2 files changed, 22 insertions(+), 17 deletions(-)

Comments

Mark Rutland Dec. 20, 2019, 6:06 p.m. UTC | #1
On Fri, Dec 20, 2019 at 02:30:16PM +0000, Andrew Murray wrote:
> From: Sudeep Holla <sudeep.holla@arm.com>
> 
> Now that we can save/restore the full SPE controls, we can enable it
> if SPE is setup and ready to use in KVM. It's supported in KVM only if
> all the CPUs in the system supports SPE.
> 
> However to support heterogenous systems, we need to move the check if
> host supports SPE and do a partial save/restore.

I don't think that it makes sense to support this for heterogeneous
systems, given their SPE capabilities and IMP DEF details will differ.

Is there some way we can limit this to homogeneous systems?

Thanks,
Mark.

> 
> Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
> Signed-off-by: Andrew Murray <andrew.murray@arm.com>
> ---
>  arch/arm64/kvm/hyp/debug-sr.c | 33 ++++++++++++++++-----------------
>  include/kvm/arm_spe.h         |  6 ++++++
>  2 files changed, 22 insertions(+), 17 deletions(-)
> 
> diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c
> index 12429b212a3a..d8d857067e6d 100644
> --- a/arch/arm64/kvm/hyp/debug-sr.c
> +++ b/arch/arm64/kvm/hyp/debug-sr.c
> @@ -86,18 +86,13 @@
>  	}
>  
>  static void __hyp_text
> -__debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
> +__debug_save_spe_context(struct kvm_cpu_context *ctxt, bool full_ctxt)
>  {
>  	u64 reg;
>  
>  	/* Clear pmscr in case of early return */
>  	ctxt->sys_regs[PMSCR_EL1] = 0;
>  
> -	/* SPE present on this CPU? */
> -	if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
> -						  ID_AA64DFR0_PMSVER_SHIFT))
> -		return;
> -
>  	/* Yes; is it owned by higher EL? */
>  	reg = read_sysreg_s(SYS_PMBIDR_EL1);
>  	if (reg & BIT(SYS_PMBIDR_EL1_P_SHIFT))
> @@ -142,7 +137,7 @@ __debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
>  }
>  
>  static void __hyp_text
> -__debug_restore_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
> +__debug_restore_spe_context(struct kvm_cpu_context *ctxt, bool full_ctxt)
>  {
>  	if (!ctxt->sys_regs[PMSCR_EL1])
>  		return;
> @@ -210,11 +205,14 @@ void __hyp_text __debug_restore_guest_context(struct kvm_vcpu *vcpu)
>  	struct kvm_guest_debug_arch *host_dbg;
>  	struct kvm_guest_debug_arch *guest_dbg;
>  
> +	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> +	guest_ctxt = &vcpu->arch.ctxt;
> +
> +	__debug_restore_spe_context(guest_ctxt, kvm_arm_spe_v1_ready(vcpu));
> +
>  	if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
>  		return;
>  
> -	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> -	guest_ctxt = &vcpu->arch.ctxt;
>  	host_dbg = &vcpu->arch.host_debug_state.regs;
>  	guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
>  
> @@ -232,8 +230,7 @@ void __hyp_text __debug_restore_host_context(struct kvm_vcpu *vcpu)
>  	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
>  	guest_ctxt = &vcpu->arch.ctxt;
>  
> -	if (!has_vhe())
> -		__debug_restore_spe_nvhe(host_ctxt, false);
> +	__debug_restore_spe_context(host_ctxt, kvm_arm_spe_v1_ready(vcpu));
>  
>  	if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
>  		return;
> @@ -249,19 +246,21 @@ void __hyp_text __debug_restore_host_context(struct kvm_vcpu *vcpu)
>  
>  void __hyp_text __debug_save_host_context(struct kvm_vcpu *vcpu)
>  {
> -	/*
> -	 * Non-VHE: Disable and flush SPE data generation
> -	 * VHE: The vcpu can run, but it can't hide.
> -	 */
>  	struct kvm_cpu_context *host_ctxt;
>  
>  	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> -	if (!has_vhe())
> -		__debug_save_spe_nvhe(host_ctxt, false);
> +	if (cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
> +						 ID_AA64DFR0_PMSVER_SHIFT))
> +		__debug_save_spe_context(host_ctxt, kvm_arm_spe_v1_ready(vcpu));
>  }
>  
>  void __hyp_text __debug_save_guest_context(struct kvm_vcpu *vcpu)
>  {
> +	bool kvm_spe_ready = kvm_arm_spe_v1_ready(vcpu);
> +
> +	/* SPE present on this vCPU? */
> +	if (kvm_spe_ready)
> +		__debug_save_spe_context(&vcpu->arch.ctxt, kvm_spe_ready);
>  }
>  
>  u32 __hyp_text __kvm_get_mdcr_el2(void)
> diff --git a/include/kvm/arm_spe.h b/include/kvm/arm_spe.h
> index 48d118fdb174..30c40b1bc385 100644
> --- a/include/kvm/arm_spe.h
> +++ b/include/kvm/arm_spe.h
> @@ -16,4 +16,10 @@ struct kvm_spe {
>  	bool irq_level;
>  };
>  
> +#ifdef CONFIG_KVM_ARM_SPE
> +#define kvm_arm_spe_v1_ready(v)		((v)->arch.spe.ready)
> +#else
> +#define kvm_arm_spe_v1_ready(v)		(false)
> +#endif /* CONFIG_KVM_ARM_SPE */
> +
>  #endif /* __ASM_ARM_KVM_SPE_H */
> -- 
> 2.21.0
>
Marc Zyngier Dec. 21, 2019, 2:13 p.m. UTC | #2
On Fri, 20 Dec 2019 14:30:16 +0000
Andrew Murray <andrew.murray@arm.com> wrote:

[somehow managed not to do a reply all, re-sending]

> From: Sudeep Holla <sudeep.holla@arm.com>
> 
> Now that we can save/restore the full SPE controls, we can enable it
> if SPE is setup and ready to use in KVM. It's supported in KVM only if
> all the CPUs in the system supports SPE.
> 
> However to support heterogenous systems, we need to move the check if
> host supports SPE and do a partial save/restore.

No. Let's just not go down that path. For now, KVM on heterogeneous
systems does not get SPE. If SPE has been enabled on a guest and a CPU
comes up without SPE, this CPU should fail to boot (same as exposing a
feature to userspace).

> 
> Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
> Signed-off-by: Andrew Murray <andrew.murray@arm.com>
> ---
>  arch/arm64/kvm/hyp/debug-sr.c | 33 ++++++++++++++++-----------------
>  include/kvm/arm_spe.h         |  6 ++++++
>  2 files changed, 22 insertions(+), 17 deletions(-)
> 
> diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c
> index 12429b212a3a..d8d857067e6d 100644
> --- a/arch/arm64/kvm/hyp/debug-sr.c
> +++ b/arch/arm64/kvm/hyp/debug-sr.c
> @@ -86,18 +86,13 @@
>  	}
>  
>  static void __hyp_text
> -__debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
> +__debug_save_spe_context(struct kvm_cpu_context *ctxt, bool full_ctxt)
>  {
>  	u64 reg;
>  
>  	/* Clear pmscr in case of early return */
>  	ctxt->sys_regs[PMSCR_EL1] = 0;
>  
> -	/* SPE present on this CPU? */
> -	if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
> -						  ID_AA64DFR0_PMSVER_SHIFT))
> -		return;
> -
>  	/* Yes; is it owned by higher EL? */
>  	reg = read_sysreg_s(SYS_PMBIDR_EL1);
>  	if (reg & BIT(SYS_PMBIDR_EL1_P_SHIFT))
> @@ -142,7 +137,7 @@ __debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
>  }
>  
>  static void __hyp_text
> -__debug_restore_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
> +__debug_restore_spe_context(struct kvm_cpu_context *ctxt, bool full_ctxt)
>  {
>  	if (!ctxt->sys_regs[PMSCR_EL1])
>  		return;
> @@ -210,11 +205,14 @@ void __hyp_text __debug_restore_guest_context(struct kvm_vcpu *vcpu)
>  	struct kvm_guest_debug_arch *host_dbg;
>  	struct kvm_guest_debug_arch *guest_dbg;
>  
> +	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> +	guest_ctxt = &vcpu->arch.ctxt;
> +
> +	__debug_restore_spe_context(guest_ctxt, kvm_arm_spe_v1_ready(vcpu));
> +
>  	if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
>  		return;
>  
> -	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> -	guest_ctxt = &vcpu->arch.ctxt;
>  	host_dbg = &vcpu->arch.host_debug_state.regs;
>  	guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
>  
> @@ -232,8 +230,7 @@ void __hyp_text __debug_restore_host_context(struct kvm_vcpu *vcpu)
>  	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
>  	guest_ctxt = &vcpu->arch.ctxt;
>  
> -	if (!has_vhe())
> -		__debug_restore_spe_nvhe(host_ctxt, false);
> +	__debug_restore_spe_context(host_ctxt, kvm_arm_spe_v1_ready(vcpu));

So you now do an unconditional save/restore on the exit path for VHE as
well? Even if the host isn't using the SPE HW? That's not acceptable
as, in most cases, only the host /or/ the guest will use SPE. Here, you
put a measurable overhead on each exit.

If the host is not using SPE, then the restore/save should happen in
vcpu_load/vcpu_put. Only if the host is using SPE should you do
something in the run loop. Of course, this only applies to VHE and
non-VHE must switch eagerly.

>  
>  	if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
>  		return;
> @@ -249,19 +246,21 @@ void __hyp_text __debug_restore_host_context(struct kvm_vcpu *vcpu)
>  
>  void __hyp_text __debug_save_host_context(struct kvm_vcpu *vcpu)
>  {
> -	/*
> -	 * Non-VHE: Disable and flush SPE data generation
> -	 * VHE: The vcpu can run, but it can't hide.
> -	 */
>  	struct kvm_cpu_context *host_ctxt;
>  
>  	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> -	if (!has_vhe())
> -		__debug_save_spe_nvhe(host_ctxt, false);
> +	if (cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
> +						 ID_AA64DFR0_PMSVER_SHIFT))
> +		__debug_save_spe_context(host_ctxt, kvm_arm_spe_v1_ready(vcpu));
>  }
>  
>  void __hyp_text __debug_save_guest_context(struct kvm_vcpu *vcpu)
>  {
> +	bool kvm_spe_ready = kvm_arm_spe_v1_ready(vcpu);
> +
> +	/* SPE present on this vCPU? */
> +	if (kvm_spe_ready)
> +		__debug_save_spe_context(&vcpu->arch.ctxt, kvm_spe_ready);
>  }
>  
>  u32 __hyp_text __kvm_get_mdcr_el2(void)
> diff --git a/include/kvm/arm_spe.h b/include/kvm/arm_spe.h
> index 48d118fdb174..30c40b1bc385 100644
> --- a/include/kvm/arm_spe.h
> +++ b/include/kvm/arm_spe.h
> @@ -16,4 +16,10 @@ struct kvm_spe {
>  	bool irq_level;
>  };
>  
> +#ifdef CONFIG_KVM_ARM_SPE
> +#define kvm_arm_spe_v1_ready(v)		((v)->arch.spe.ready)
> +#else
> +#define kvm_arm_spe_v1_ready(v)		(false)
> +#endif /* CONFIG_KVM_ARM_SPE */
> +
>  #endif /* __ASM_ARM_KVM_SPE_H */

Thanks,

	M.
Andrew Murray Dec. 24, 2019, 12:15 p.m. UTC | #3
On Fri, Dec 20, 2019 at 06:06:58PM +0000, Mark Rutland wrote:
> On Fri, Dec 20, 2019 at 02:30:16PM +0000, Andrew Murray wrote:
> > From: Sudeep Holla <sudeep.holla@arm.com>
> > 
> > Now that we can save/restore the full SPE controls, we can enable it
> > if SPE is setup and ready to use in KVM. It's supported in KVM only if
> > all the CPUs in the system supports SPE.
> > 
> > However to support heterogenous systems, we need to move the check if
> > host supports SPE and do a partial save/restore.
> 
> I don't think that it makes sense to support this for heterogeneous
> systems, given their SPE capabilities and IMP DEF details will differ.
> 
> Is there some way we can limit this to homogeneous systems?

No problem, I'll see how to limit this.

Thanks,

Andrew Murray

> 
> Thanks,
> Mark.
> 
> > 
> > Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
> > Signed-off-by: Andrew Murray <andrew.murray@arm.com>
> > ---
> >  arch/arm64/kvm/hyp/debug-sr.c | 33 ++++++++++++++++-----------------
> >  include/kvm/arm_spe.h         |  6 ++++++
> >  2 files changed, 22 insertions(+), 17 deletions(-)
> > 
> > diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c
> > index 12429b212a3a..d8d857067e6d 100644
> > --- a/arch/arm64/kvm/hyp/debug-sr.c
> > +++ b/arch/arm64/kvm/hyp/debug-sr.c
> > @@ -86,18 +86,13 @@
> >  	}
> >  
> >  static void __hyp_text
> > -__debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
> > +__debug_save_spe_context(struct kvm_cpu_context *ctxt, bool full_ctxt)
> >  {
> >  	u64 reg;
> >  
> >  	/* Clear pmscr in case of early return */
> >  	ctxt->sys_regs[PMSCR_EL1] = 0;
> >  
> > -	/* SPE present on this CPU? */
> > -	if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
> > -						  ID_AA64DFR0_PMSVER_SHIFT))
> > -		return;
> > -
> >  	/* Yes; is it owned by higher EL? */
> >  	reg = read_sysreg_s(SYS_PMBIDR_EL1);
> >  	if (reg & BIT(SYS_PMBIDR_EL1_P_SHIFT))
> > @@ -142,7 +137,7 @@ __debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
> >  }
> >  
> >  static void __hyp_text
> > -__debug_restore_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
> > +__debug_restore_spe_context(struct kvm_cpu_context *ctxt, bool full_ctxt)
> >  {
> >  	if (!ctxt->sys_regs[PMSCR_EL1])
> >  		return;
> > @@ -210,11 +205,14 @@ void __hyp_text __debug_restore_guest_context(struct kvm_vcpu *vcpu)
> >  	struct kvm_guest_debug_arch *host_dbg;
> >  	struct kvm_guest_debug_arch *guest_dbg;
> >  
> > +	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> > +	guest_ctxt = &vcpu->arch.ctxt;
> > +
> > +	__debug_restore_spe_context(guest_ctxt, kvm_arm_spe_v1_ready(vcpu));
> > +
> >  	if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
> >  		return;
> >  
> > -	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> > -	guest_ctxt = &vcpu->arch.ctxt;
> >  	host_dbg = &vcpu->arch.host_debug_state.regs;
> >  	guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
> >  
> > @@ -232,8 +230,7 @@ void __hyp_text __debug_restore_host_context(struct kvm_vcpu *vcpu)
> >  	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> >  	guest_ctxt = &vcpu->arch.ctxt;
> >  
> > -	if (!has_vhe())
> > -		__debug_restore_spe_nvhe(host_ctxt, false);
> > +	__debug_restore_spe_context(host_ctxt, kvm_arm_spe_v1_ready(vcpu));
> >  
> >  	if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
> >  		return;
> > @@ -249,19 +246,21 @@ void __hyp_text __debug_restore_host_context(struct kvm_vcpu *vcpu)
> >  
> >  void __hyp_text __debug_save_host_context(struct kvm_vcpu *vcpu)
> >  {
> > -	/*
> > -	 * Non-VHE: Disable and flush SPE data generation
> > -	 * VHE: The vcpu can run, but it can't hide.
> > -	 */
> >  	struct kvm_cpu_context *host_ctxt;
> >  
> >  	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> > -	if (!has_vhe())
> > -		__debug_save_spe_nvhe(host_ctxt, false);
> > +	if (cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
> > +						 ID_AA64DFR0_PMSVER_SHIFT))
> > +		__debug_save_spe_context(host_ctxt, kvm_arm_spe_v1_ready(vcpu));
> >  }
> >  
> >  void __hyp_text __debug_save_guest_context(struct kvm_vcpu *vcpu)
> >  {
> > +	bool kvm_spe_ready = kvm_arm_spe_v1_ready(vcpu);
> > +
> > +	/* SPE present on this vCPU? */
> > +	if (kvm_spe_ready)
> > +		__debug_save_spe_context(&vcpu->arch.ctxt, kvm_spe_ready);
> >  }
> >  
> >  u32 __hyp_text __kvm_get_mdcr_el2(void)
> > diff --git a/include/kvm/arm_spe.h b/include/kvm/arm_spe.h
> > index 48d118fdb174..30c40b1bc385 100644
> > --- a/include/kvm/arm_spe.h
> > +++ b/include/kvm/arm_spe.h
> > @@ -16,4 +16,10 @@ struct kvm_spe {
> >  	bool irq_level;
> >  };
> >  
> > +#ifdef CONFIG_KVM_ARM_SPE
> > +#define kvm_arm_spe_v1_ready(v)		((v)->arch.spe.ready)
> > +#else
> > +#define kvm_arm_spe_v1_ready(v)		(false)
> > +#endif /* CONFIG_KVM_ARM_SPE */
> > +
> >  #endif /* __ASM_ARM_KVM_SPE_H */
> > -- 
> > 2.21.0
> >
Andrew Murray Jan. 7, 2020, 3:13 p.m. UTC | #4
On Sat, Dec 21, 2019 at 02:13:25PM +0000, Marc Zyngier wrote:
> On Fri, 20 Dec 2019 14:30:16 +0000
> Andrew Murray <andrew.murray@arm.com> wrote:
> 
> [somehow managed not to do a reply all, re-sending]
> 
> > From: Sudeep Holla <sudeep.holla@arm.com>
> > 
> > Now that we can save/restore the full SPE controls, we can enable it
> > if SPE is setup and ready to use in KVM. It's supported in KVM only if
> > all the CPUs in the system supports SPE.
> > 
> > However to support heterogenous systems, we need to move the check if
> > host supports SPE and do a partial save/restore.
> 
> No. Let's just not go down that path. For now, KVM on heterogeneous
> systems do not get SPE.

At present these patches only offer the SPE feature to VCPUs where the
sanitised AA64DFR0 register indicates that all CPUs have this support
(kvm_arm_support_spe_v1) at the time of setting the attribute
(KVM_SET_DEVICE_ATTR).
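
Roughly speaking, that check amounts to something like the sketch below,
i.e. a test against the system-wide sanitised value rather than the current
CPU (the body here is illustrative, not the series' exact definition):

/* Illustrative only: gate SPE support on the sanitised system value */
static inline bool kvm_arm_support_spe_v1(void)
{
	u64 dfr0 = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1);

	return !!cpuid_feature_extract_unsigned_field(dfr0,
						      ID_AA64DFR0_PMSVER_SHIFT);
}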

Therefore if a new CPU comes online without SPE support, and an
existing VCPU is scheduled onto it, then bad things happen - which I guess
must have been the intention behind this patch.


> If SPE has been enabled on a guest and a CPU
> comes up without SPE, this CPU should fail to boot (same as exposing a
> feature to userspace).

I'm unclear as to how to prevent this. We can set the FTR_STRICT flag on
the sanitised register - thus tainting the kernel if such a non-SPE CPU
comes online - though that doesn't prevent KVM from blowing up. Though
I don't believe we can prevent a CPU coming up. At the moment this is
my preferred approach.
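
For illustration, making the field strict would amount to an entry along
these lines - the flags and the standalone table below are assumptions; in
practice the existing ftr_id_aa64dfr0[] entry in
arch/arm64/kernel/cpufeature.c would be adjusted instead:

/*
 * Sketch only: a strict, lower-safe description of the PMSVer field, so
 * that a late CPU disagreeing with the boot-time value taints the kernel
 * rather than being silently accepted.
 */
static const struct arm64_ftr_bits ftr_id_aa64dfr0_spe_strict[] = {
	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE,
		       ID_AA64DFR0_PMSVER_SHIFT, 4, 0),
	ARM64_FTR_END,
};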

Looking at the vcpu_load and related code, I don't see a way of saying
'don't schedule this VCPU on this CPU' or bailing in any way.

One solution could be to allow scheduling onto non-SPE CPUs but wrap the
SPE save/restore code in a macro (much like kvm_arm_spe_v1_ready) that
reads the non-sanitised feature register. Therefore we don't go bang, but
we also increase the size of any black-holes in SPE capturing. Though this
feels like something that will cause grief down the line.

Is there something else that can be done?

Thanks,

Andrew Murray

> 
> > 
> > Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
> > Signed-off-by: Andrew Murray <andrew.murray@arm.com>
> > ---
> >  arch/arm64/kvm/hyp/debug-sr.c | 33 ++++++++++++++++-----------------
> >  include/kvm/arm_spe.h         |  6 ++++++
> >  2 files changed, 22 insertions(+), 17 deletions(-)
> > 
> > diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c
> > index 12429b212a3a..d8d857067e6d 100644
> > --- a/arch/arm64/kvm/hyp/debug-sr.c
> > +++ b/arch/arm64/kvm/hyp/debug-sr.c
> > @@ -86,18 +86,13 @@
> >  	}
> >  
> >  static void __hyp_text
> > -__debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
> > +__debug_save_spe_context(struct kvm_cpu_context *ctxt, bool full_ctxt)
> >  {
> >  	u64 reg;
> >  
> >  	/* Clear pmscr in case of early return */
> >  	ctxt->sys_regs[PMSCR_EL1] = 0;
> >  
> > -	/* SPE present on this CPU? */
> > -	if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
> > -						  ID_AA64DFR0_PMSVER_SHIFT))
> > -		return;
> > -
> >  	/* Yes; is it owned by higher EL? */
> >  	reg = read_sysreg_s(SYS_PMBIDR_EL1);
> >  	if (reg & BIT(SYS_PMBIDR_EL1_P_SHIFT))
> > @@ -142,7 +137,7 @@ __debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
> >  }
> >  
> >  static void __hyp_text
> > -__debug_restore_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
> > +__debug_restore_spe_context(struct kvm_cpu_context *ctxt, bool full_ctxt)
> >  {
> >  	if (!ctxt->sys_regs[PMSCR_EL1])
> >  		return;
> > @@ -210,11 +205,14 @@ void __hyp_text __debug_restore_guest_context(struct kvm_vcpu *vcpu)
> >  	struct kvm_guest_debug_arch *host_dbg;
> >  	struct kvm_guest_debug_arch *guest_dbg;
> >  
> > +	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> > +	guest_ctxt = &vcpu->arch.ctxt;
> > +
> > +	__debug_restore_spe_context(guest_ctxt, kvm_arm_spe_v1_ready(vcpu));
> > +
> >  	if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
> >  		return;
> >  
> > -	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> > -	guest_ctxt = &vcpu->arch.ctxt;
> >  	host_dbg = &vcpu->arch.host_debug_state.regs;
> >  	guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
> >  
> > @@ -232,8 +230,7 @@ void __hyp_text __debug_restore_host_context(struct kvm_vcpu *vcpu)
> >  	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> >  	guest_ctxt = &vcpu->arch.ctxt;
> >  
> > -	if (!has_vhe())
> > -		__debug_restore_spe_nvhe(host_ctxt, false);
> > +	__debug_restore_spe_context(host_ctxt, kvm_arm_spe_v1_ready(vcpu));
> 
> So you now do an unconditional save/restore on the exit path for VHE as
> well? Even if the host isn't using the SPE HW? That's not acceptable
> as, in most cases, only the host /or/ the guest will use SPE. Here, you
> put a measurable overhead on each exit.
> 
> If the host is not using SPE, then the restore/save should happen in
> vcpu_load/vcpu_put. Only if the host is using SPE should you do
> something in the run loop. Of course, this only applies to VHE and
> non-VHE must switch eagerly.
> 
> >  
> >  	if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
> >  		return;
> > @@ -249,19 +246,21 @@ void __hyp_text __debug_restore_host_context(struct kvm_vcpu *vcpu)
> >  
> >  void __hyp_text __debug_save_host_context(struct kvm_vcpu *vcpu)
> >  {
> > -	/*
> > -	 * Non-VHE: Disable and flush SPE data generation
> > -	 * VHE: The vcpu can run, but it can't hide.
> > -	 */
> >  	struct kvm_cpu_context *host_ctxt;
> >  
> >  	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> > -	if (!has_vhe())
> > -		__debug_save_spe_nvhe(host_ctxt, false);
> > +	if (cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
> > +						 ID_AA64DFR0_PMSVER_SHIFT))
> > +		__debug_save_spe_context(host_ctxt, kvm_arm_spe_v1_ready(vcpu));
> >  }
> >  
> >  void __hyp_text __debug_save_guest_context(struct kvm_vcpu *vcpu)
> >  {
> > +	bool kvm_spe_ready = kvm_arm_spe_v1_ready(vcpu);
> > +
> > +	/* SPE present on this vCPU? */
> > +	if (kvm_spe_ready)
> > +		__debug_save_spe_context(&vcpu->arch.ctxt, kvm_spe_ready);
> >  }
> >  
> >  u32 __hyp_text __kvm_get_mdcr_el2(void)
> > diff --git a/include/kvm/arm_spe.h b/include/kvm/arm_spe.h
> > index 48d118fdb174..30c40b1bc385 100644
> > --- a/include/kvm/arm_spe.h
> > +++ b/include/kvm/arm_spe.h
> > @@ -16,4 +16,10 @@ struct kvm_spe {
> >  	bool irq_level;
> >  };
> >  
> > +#ifdef CONFIG_KVM_ARM_SPE
> > +#define kvm_arm_spe_v1_ready(v)		((v)->arch.spe.ready)
> > +#else
> > +#define kvm_arm_spe_v1_ready(v)		(false)
> > +#endif /* CONFIG_KVM_ARM_SPE */
> > +
> >  #endif /* __ASM_ARM_KVM_SPE_H */
> 
> Thanks,
> 
> 	M.
> -- 
> Jazz is not dead. It just smells funny...
Marc Zyngier Jan. 8, 2020, 11:17 a.m. UTC | #5
On 2020-01-07 15:13, Andrew Murray wrote:
> On Sat, Dec 21, 2019 at 02:13:25PM +0000, Marc Zyngier wrote:
>> On Fri, 20 Dec 2019 14:30:16 +0000
>> Andrew Murray <andrew.murray@arm.com> wrote:
>> 
>> [somehow managed not to do a reply all, re-sending]
>> 
>> > From: Sudeep Holla <sudeep.holla@arm.com>
>> >
>> > Now that we can save/restore the full SPE controls, we can enable it
>> > if SPE is setup and ready to use in KVM. It's supported in KVM only if
>> > all the CPUs in the system supports SPE.
>> >
>> > However to support heterogenous systems, we need to move the check if
>> > host supports SPE and do a partial save/restore.
>> 
>> No. Let's just not go down that path. For now, KVM on heterogeneous
>> systems do not get SPE.
> 
> At present these patches only offer the SPE feature to VCPU's where the
> sanitised AA64DFR0 register indicates that all CPUs have this support
> (kvm_arm_support_spe_v1) at the time of setting the attribute
> (KVM_SET_DEVICE_ATTR).
> 
> Therefore if a new CPU comes online without SPE support, and an
> existing VCPU is scheduled onto it, then bad things happen - which I 
> guess
> must have been the intention behind this patch.

I guess that was the intent.

>> If SPE has been enabled on a guest and a CPU
>> comes up without SPE, this CPU should fail to boot (same as exposing a
>> feature to userspace).
> 
> I'm unclear as how to prevent this. We can set the FTR_STRICT flag on
> the sanitised register - thus tainting the kernel if such a non-SPE CPU
> comes online - thought that doesn't prevent KVM from blowing up. Though
> I don't believe we can prevent a CPU coming up. At the moment this is
> my preferred approach.

I'd be OK with this as a stop-gap measure. Do we know of any existing
design where only half of the CPUs have SPE?

> Looking at the vcpu_load and related code, I don't see a way of saying
> 'don't schedule this VCPU on this CPU' or bailing in any way.

That would actually be pretty easy to implement. In vcpu_load(), check
that the physical CPU has SPE. If not, raise a request for that vcpu.
In the run loop, check for that request and abort if raised, returning
to userspace.
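
A rough sketch of that flow - the request number, helper names and exit
reason below are illustrative only, not existing code:

#define KVM_REQ_SPE_EXIT	KVM_ARCH_REQ(4)	/* assumed free slot */

/* Called from kvm_arch_vcpu_load(): flag a vcpu loaded on a non-SPE CPU */
static void kvm_spe_check_cpu(struct kvm_vcpu *vcpu)
{
	if (kvm_arm_spe_v1_ready(vcpu) &&
	    !cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
						  ID_AA64DFR0_PMSVER_SHIFT))
		kvm_make_request(KVM_REQ_SPE_EXIT, vcpu);
}

/* ... and checked in the run loop, before entering the guest */
static bool kvm_spe_abort_run(struct kvm_vcpu *vcpu)
{
	if (kvm_check_request(KVM_REQ_SPE_EXIT, vcpu)) {
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		return true;	/* bail out to userspace */
	}

	return false;
}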

Userspace can always check /sys/devices/arm_spe_0/cpumask and work out
where to run that particular vcpu.

> 
> One solution could be to allow scheduling onto non-SPE VCPUs but wrap 
> the
> SPE save/restore code in a macro (much like kvm_arm_spe_v1_ready) that
> reads the non-sanitised feature register. Therefore we don't go bang, 
> but
> we also increase the size of any black-holes in SPE capturing. Though 
> this
> feels like something that will cause grief down the line.
> 
> Is there something else that can be done?

How does userspace deal with this? When SPE is only available on half of
the CPUs, how does perf work in these conditions?

Thanks,

         M.
Will Deacon Jan. 8, 2020, 11:58 a.m. UTC | #6
On Wed, Jan 08, 2020 at 11:17:16AM +0000, Marc Zyngier wrote:
> On 2020-01-07 15:13, Andrew Murray wrote:
> > On Sat, Dec 21, 2019 at 02:13:25PM +0000, Marc Zyngier wrote:
> > > On Fri, 20 Dec 2019 14:30:16 +0000
> > > Andrew Murray <andrew.murray@arm.com> wrote:
> > > 
> > > [somehow managed not to do a reply all, re-sending]
> > > 
> > > > From: Sudeep Holla <sudeep.holla@arm.com>
> > > >
> > > > Now that we can save/restore the full SPE controls, we can enable it
> > > > if SPE is setup and ready to use in KVM. It's supported in KVM only if
> > > > all the CPUs in the system supports SPE.
> > > >
> > > > However to support heterogenous systems, we need to move the check if
> > > > host supports SPE and do a partial save/restore.
> > > 
> > > No. Let's just not go down that path. For now, KVM on heterogeneous
> > > systems do not get SPE.
> > 
> > At present these patches only offer the SPE feature to VCPU's where the
> > sanitised AA64DFR0 register indicates that all CPUs have this support
> > (kvm_arm_support_spe_v1) at the time of setting the attribute
> > (KVM_SET_DEVICE_ATTR).
> > 
> > Therefore if a new CPU comes online without SPE support, and an
> > existing VCPU is scheduled onto it, then bad things happen - which I
> > guess
> > must have been the intention behind this patch.
> 
> I guess that was the intent.
> 
> > > If SPE has been enabled on a guest and a CPU
> > > comes up without SPE, this CPU should fail to boot (same as exposing a
> > > feature to userspace).
> > 
> > I'm unclear as how to prevent this. We can set the FTR_STRICT flag on
> > the sanitised register - thus tainting the kernel if such a non-SPE CPU
> > comes online - thought that doesn't prevent KVM from blowing up. Though
> > I don't believe we can prevent a CPU coming up. At the moment this is
> > my preferred approach.
> 
> I'd be OK with this as a stop-gap measure. Do we know of any existing
> design where only half of the CPUs have SPE?

No, but given how few CPUs implement SPE I'd say that this configuration
is inevitable. I certainly went out of my way to support it in the driver.

> > Looking at the vcpu_load and related code, I don't see a way of saying
> > 'don't schedule this VCPU on this CPU' or bailing in any way.
> 
> That would actually be pretty easy to implement. In vcpu_load(), check
> that that the CPU physical has SPE. If not, raise a request for that vcpu.
> In the run loop, check for that request and abort if raised, returning
> to userspace.
> 
> Userspace can always check /sys/devices/arm_spe_0/cpumask and work out
> where to run that particular vcpu.

It's also worth considering systems where there are multiple implementations
of SPE in play. Assuming we don't want to expose this to a guest, then the
right interface here is probably for userspace to pick one SPE
implementation and expose that to the guest. That fits with your idea above,
where you basically get an immediate exit if we try to schedule a vCPU onto
a CPU that isn't part of the SPE mask.

> > One solution could be to allow scheduling onto non-SPE VCPUs but wrap
> > the
> > SPE save/restore code in a macro (much like kvm_arm_spe_v1_ready) that
> > reads the non-sanitised feature register. Therefore we don't go bang,
> > but
> > we also increase the size of any black-holes in SPE capturing. Though
> > this
> > feels like something that will cause grief down the line.
> > 
> > Is there something else that can be done?
> 
> How does userspace deal with this? When SPE is only available on half of
> the CPUs, how does perf work in these conditions?

Not sure about userspace, but the kernel driver works by instantiating an
SPE PMU instance only for the CPUs that have it and then that instance
profiles for only those CPUs. You also need to do something similar if
you had two CPU types with SPE, since the SPE configuration is likely to be
different between them.

Will
Marc Zyngier Jan. 8, 2020, 12:36 p.m. UTC | #7
On 2020-01-08 11:58, Will Deacon wrote:
> On Wed, Jan 08, 2020 at 11:17:16AM +0000, Marc Zyngier wrote:
>> On 2020-01-07 15:13, Andrew Murray wrote:
>> > On Sat, Dec 21, 2019 at 02:13:25PM +0000, Marc Zyngier wrote:
>> > > On Fri, 20 Dec 2019 14:30:16 +0000
>> > > Andrew Murray <andrew.murray@arm.com> wrote:
>> > >
>> > > [somehow managed not to do a reply all, re-sending]
>> > >
>> > > > From: Sudeep Holla <sudeep.holla@arm.com>
>> > > >
>> > > > Now that we can save/restore the full SPE controls, we can enable it
>> > > > if SPE is setup and ready to use in KVM. It's supported in KVM only if
>> > > > all the CPUs in the system supports SPE.
>> > > >
>> > > > However to support heterogenous systems, we need to move the check if
>> > > > host supports SPE and do a partial save/restore.
>> > >
>> > > No. Let's just not go down that path. For now, KVM on heterogeneous
>> > > systems do not get SPE.
>> >
>> > At present these patches only offer the SPE feature to VCPU's where the
>> > sanitised AA64DFR0 register indicates that all CPUs have this support
>> > (kvm_arm_support_spe_v1) at the time of setting the attribute
>> > (KVM_SET_DEVICE_ATTR).
>> >
>> > Therefore if a new CPU comes online without SPE support, and an
>> > existing VCPU is scheduled onto it, then bad things happen - which I
>> > guess
>> > must have been the intention behind this patch.
>> 
>> I guess that was the intent.
>> 
>> > > If SPE has been enabled on a guest and a CPU
>> > > comes up without SPE, this CPU should fail to boot (same as exposing a
>> > > feature to userspace).
>> >
>> > I'm unclear as how to prevent this. We can set the FTR_STRICT flag on
>> > the sanitised register - thus tainting the kernel if such a non-SPE CPU
>> > comes online - thought that doesn't prevent KVM from blowing up. Though
>> > I don't believe we can prevent a CPU coming up. At the moment this is
>> > my preferred approach.
>> 
>> I'd be OK with this as a stop-gap measure. Do we know of any existing
>> design where only half of the CPUs have SPE?
> 
> No, but given how few CPUs implement SPE I'd say that this 
> configuration
> is inevitable. I certainly went out of my way to support it in the 
> driver.
> 
>> > Looking at the vcpu_load and related code, I don't see a way of saying
>> > 'don't schedule this VCPU on this CPU' or bailing in any way.
>> 
>> That would actually be pretty easy to implement. In vcpu_load(), check
>> that that the CPU physical has SPE. If not, raise a request for that 
>> vcpu.
>> In the run loop, check for that request and abort if raised, returning
>> to userspace.
>> 
>> Userspace can always check /sys/devices/arm_spe_0/cpumask and work out
>> where to run that particular vcpu.
> 
> It's also worth considering systems where there are multiple 
> implementations
> of SPE in play. Assuming we don't want to expose this to a guest, then 
> the
> right interface here is probably for userspace to pick one SPE
> implementation and expose that to the guest. That fits with your idea 
> above,
> where you basically get an immediate exit if we try to schedule a vCPU 
> onto
> a CPU that isn't part of the SPE mask.

Then it means that the VM should be configured with a mask indicating
which CPUs it is intended to run on, and setting such a mask is mandatory
for SPE.

> 
>> > One solution could be to allow scheduling onto non-SPE VCPUs but wrap
>> > the
>> > SPE save/restore code in a macro (much like kvm_arm_spe_v1_ready) that
>> > reads the non-sanitised feature register. Therefore we don't go bang,
>> > but
>> > we also increase the size of any black-holes in SPE capturing. Though
>> > this
>> > feels like something that will cause grief down the line.
>> >
>> > Is there something else that can be done?
>> 
>> How does userspace deal with this? When SPE is only available on half 
>> of
>> the CPUs, how does perf work in these conditions?
> 
> Not sure about userspace, but the kernel driver works by instantiating 
> an
> SPE PMU instance only for the CPUs that have it and then that instance
> profiles for only those CPUs. You also need to do something similar if
> you had two CPU types with SPE, since the SPE configuration is likely 
> to be
> different between them.

So that's closer to what Andrew was suggesting above (running a guest on a
non-SPE CPU creates a profiling black hole). Except that we can't really
run a SPE-enabled guest on a non-SPE CPU, as the SPE sysregs will UNDEF
at EL1.

Conclusion: we need a mix of a cpumask to indicate which CPUs we want to
run on (generic, not-SPE related), and a check for SPE-capable CPUs.
If either of these conditions is not satisfied, the vcpu exits for userspace
to sort out the affinity.

I hate heterogeneous systems.

         M.
Will Deacon Jan. 8, 2020, 1:10 p.m. UTC | #8
On Wed, Jan 08, 2020 at 12:36:11PM +0000, Marc Zyngier wrote:
> On 2020-01-08 11:58, Will Deacon wrote:
> > On Wed, Jan 08, 2020 at 11:17:16AM +0000, Marc Zyngier wrote:
> > > On 2020-01-07 15:13, Andrew Murray wrote:
> > > > Looking at the vcpu_load and related code, I don't see a way of saying
> > > > 'don't schedule this VCPU on this CPU' or bailing in any way.
> > > 
> > > That would actually be pretty easy to implement. In vcpu_load(), check
> > > that that the CPU physical has SPE. If not, raise a request for that
> > > vcpu.
> > > In the run loop, check for that request and abort if raised, returning
> > > to userspace.
> > > 
> > > Userspace can always check /sys/devices/arm_spe_0/cpumask and work out
> > > where to run that particular vcpu.
> > 
> > It's also worth considering systems where there are multiple
> > implementations
> > of SPE in play. Assuming we don't want to expose this to a guest, then
> > the
> > right interface here is probably for userspace to pick one SPE
> > implementation and expose that to the guest. That fits with your idea
> > above,
> > where you basically get an immediate exit if we try to schedule a vCPU
> > onto
> > a CPU that isn't part of the SPE mask.
> 
> Then it means that the VM should be configured with a mask indicating
> which CPUs it is intended to run on, and setting such a mask is mandatory
> for SPE.

Yeah, and this could probably all be wrapped up by userspace so you just
pass the SPE PMU name or something and it grabs the corresponding cpumask
for you.

> > > > One solution could be to allow scheduling onto non-SPE VCPUs but wrap
> > > > the
> > > > SPE save/restore code in a macro (much like kvm_arm_spe_v1_ready) that
> > > > reads the non-sanitised feature register. Therefore we don't go bang,
> > > > but
> > > > we also increase the size of any black-holes in SPE capturing. Though
> > > > this
> > > > feels like something that will cause grief down the line.
> > > >
> > > > Is there something else that can be done?
> > > 
> > > How does userspace deal with this? When SPE is only available on
> > > half of
> > > the CPUs, how does perf work in these conditions?
> > 
> > Not sure about userspace, but the kernel driver works by instantiating
> > an
> > SPE PMU instance only for the CPUs that have it and then that instance
> > profiles for only those CPUs. You also need to do something similar if
> > you had two CPU types with SPE, since the SPE configuration is likely to
> > be
> > different between them.
> 
> So that's closer to what Andrew was suggesting above (running a guest on a
> non-SPE CPU creates a profiling black hole). Except that we can't really
> run a SPE-enabled guest on a non-SPE CPU, as the SPE sysregs will UNDEF
> at EL1.

Right. I wouldn't suggest the "black hole" approach for VMs, but it works
for userspace so that's why the driver does it that way.

> Conclusion: we need a mix of a cpumask to indicate which CPUs we want to
> run on (generic, not-SPE related), and a check for SPE-capable CPUs.
> If any of these condition is not satisfied, the vcpu exits for userspace
> to sort out the affinity.
> 
> I hate heterogeneous systems.

They hate you too ;)

Will
Andrew Murray Jan. 9, 2020, 11:23 a.m. UTC | #9
On Wed, Jan 08, 2020 at 01:10:21PM +0000, Will Deacon wrote:
> On Wed, Jan 08, 2020 at 12:36:11PM +0000, Marc Zyngier wrote:
> > On 2020-01-08 11:58, Will Deacon wrote:
> > > On Wed, Jan 08, 2020 at 11:17:16AM +0000, Marc Zyngier wrote:
> > > > On 2020-01-07 15:13, Andrew Murray wrote:
> > > > > Looking at the vcpu_load and related code, I don't see a way of saying
> > > > > 'don't schedule this VCPU on this CPU' or bailing in any way.
> > > > 
> > > > That would actually be pretty easy to implement. In vcpu_load(), check
> > > > that that the CPU physical has SPE. If not, raise a request for that
> > > > vcpu.
> > > > In the run loop, check for that request and abort if raised, returning
> > > > to userspace.

I hadn't really noticed the kvm_make_request mechanism - however it's now
clear how this could be implemented.

This approach gives userspace responsibility for deciding which CPUs should
be used, and if userspace gets it wrong then the KVM_RUN ioctl won't do very much.


> > > > 
> > > > Userspace can always check /sys/devices/arm_spe_0/cpumask and work out
> > > > where to run that particular vcpu.
> > > 
> > > It's also worth considering systems where there are multiple
> > > implementations
> > > of SPE in play. Assuming we don't want to expose this to a guest, then
> > > the
> > > right interface here is probably for userspace to pick one SPE
> > > implementation and expose that to the guest.

If I understand correctly then this implies the following:

 - If the host userspace indicates it wants support for SPE in the guest (via 
   KVM_SET_DEVICE_ATTR at start of day) - then we should check in vcpu_load that
   the minimum version of SPE is present on the current CPU. 'minimum' because
   we don't know why userspace has selected the given cpumask.

 - Userspace can get it wrong, i.e. it can create a CPU mask with CPUs that
   have SPE with differing versions. If it does, and all CPUs have some form of
   SPE then errors may occur in the guest. Perhaps this is OK and userspace
   shouldn't get it wrong?


> > >  That fits with your idea
> > > above,
> > > where you basically get an immediate exit if we try to schedule a vCPU
> > > onto
> > > a CPU that isn't part of the SPE mask.
> > 
> > Then it means that the VM should be configured with a mask indicating
> > which CPUs it is intended to run on, and setting such a mask is mandatory
> > for SPE.
> 
> Yeah, and this could probably all be wrapped up by userspace so you just
> pass the SPE PMU name or something and it grabs the corresponding cpumask
> for you.
> 
> > > > > One solution could be to allow scheduling onto non-SPE VCPUs but wrap
> > > > > the
> > > > > SPE save/restore code in a macro (much like kvm_arm_spe_v1_ready) that
> > > > > reads the non-sanitised feature register. Therefore we don't go bang,
> > > > > but
> > > > > we also increase the size of any black-holes in SPE capturing. Though
> > > > > this
> > > > > feels like something that will cause grief down the line.
> > > > >
> > > > > Is there something else that can be done?
> > > > 
> > > > How does userspace deal with this? When SPE is only available on
> > > > half of
> > > > the CPUs, how does perf work in these conditions?
> > > 
> > > Not sure about userspace, but the kernel driver works by instantiating
> > > an
> > > SPE PMU instance only for the CPUs that have it and then that instance
> > > profiles for only those CPUs. You also need to do something similar if
> > > you had two CPU types with SPE, since the SPE configuration is likely to
> > > be
> > > different between them.
> > 
> > So that's closer to what Andrew was suggesting above (running a guest on a
> > non-SPE CPU creates a profiling black hole). Except that we can't really
> > run a SPE-enabled guest on a non-SPE CPU, as the SPE sysregs will UNDEF
> > at EL1.
> 
> Right. I wouldn't suggest the "black hole" approach for VMs, but it works
> for userspace so that's why the driver does it that way.
> 
> > Conclusion: we need a mix of a cpumask to indicate which CPUs we want to
> > run on (generic, not-SPE related), 

If I understand correctly this mask isn't exposed to KVM (in the kernel) and
KVM (in the kernel) is unaware of how the CPUs that have KVM_RUN called are
selected.

Thus this implies the cpumask is a feature of KVM tool or QEMU that would
need to be added there. (E.g. kvm_cmd_run_work would set some affinity when
creating pthreads - based on a CPU mask triggered by setting the --spe flag)?
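
As a rough userspace-side sketch (the helper below is hypothetical, not
kvmtool code - only the sysfs path and its list format come from this
thread):

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Pin a vcpu thread to the CPUs advertised by the SPE PMU's cpumask */
static int pin_vcpu_to_spe_cpus(pthread_t vcpu_thread)
{
	FILE *f = fopen("/sys/devices/arm_spe_0/cpumask", "r");
	char buf[256], *tok;
	cpu_set_t set;

	if (!f)
		return -1;
	if (!fgets(buf, sizeof(buf), f)) {
		fclose(f);
		return -1;
	}
	fclose(f);

	CPU_ZERO(&set);
	/* The mask is printed as a list, e.g. "0-3,6" */
	for (tok = strtok(buf, ",\n"); tok; tok = strtok(NULL, ",\n")) {
		int lo, hi, cpu;

		if (sscanf(tok, "%d-%d", &lo, &hi) < 2)
			lo = hi = atoi(tok);
		for (cpu = lo; cpu <= hi; cpu++)
			CPU_SET(cpu, &set);
	}

	return pthread_setaffinity_np(vcpu_thread, sizeof(set), &set);
}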

Thanks,

Andrew Murray

> and a check for SPE-capable CPUs.
> > If any of these condition is not satisfied, the vcpu exits for userspace
> > to sort out the affinity.
> > 
> > I hate heterogeneous systems.
> 
> They hate you too ;)
> 
> Will
Andrew Murray Jan. 9, 2020, 11:25 a.m. UTC | #10
On Thu, Jan 09, 2020 at 11:23:37AM +0000, Andrew Murray wrote:
> On Wed, Jan 08, 2020 at 01:10:21PM +0000, Will Deacon wrote:
> > On Wed, Jan 08, 2020 at 12:36:11PM +0000, Marc Zyngier wrote:
> > > On 2020-01-08 11:58, Will Deacon wrote:
> > > > On Wed, Jan 08, 2020 at 11:17:16AM +0000, Marc Zyngier wrote:
> > > > > On 2020-01-07 15:13, Andrew Murray wrote:
> > > > > > Looking at the vcpu_load and related code, I don't see a way of saying
> > > > > > 'don't schedule this VCPU on this CPU' or bailing in any way.
> > > > > 
> > > > > That would actually be pretty easy to implement. In vcpu_load(), check
> > > > > that that the CPU physical has SPE. If not, raise a request for that
> > > > > vcpu.
> > > > > In the run loop, check for that request and abort if raised, returning
> > > > > to userspace.
> 
> I hadn't really noticed the kvm_make_request mechanism - however it's now
> clear how this could be implemented.
> 
> This approach gives responsibility for which CPUs should be used to userspace
> and if userspace gets it wrong then the KVM_RUN ioctl won't do very much.
> 
> 
> > > > > 
> > > > > Userspace can always check /sys/devices/arm_spe_0/cpumask and work out
> > > > > where to run that particular vcpu.
> > > > 
> > > > It's also worth considering systems where there are multiple
> > > > implementations
> > > > of SPE in play. Assuming we don't want to expose this to a guest, then
> > > > the
> > > > right interface here is probably for userspace to pick one SPE
> > > > implementation and expose that to the guest.
> 
> If I understand correctly then this implies the following:
> 
>  - If the host userspace indicates it wants support for SPE in the guest (via 
>    KVM_SET_DEVICE_ATTR at start of day) - then we should check in vcpu_load that
>    the minimum version of SPE is present on the current CPU. 'minimum' because
>    we don't know why userspace has selected the given cpumask.
> 
>  - Userspace can get it wrong, i.e. it can create a CPU mask with CPUs that
>    have SPE with differing versions. If it does, and all CPUs have some form of
>    SPE then errors may occur in the guest. Perhaps this is OK and userspace
>    shouldn't get it wrong?

Actually this could be guarded against by emulating ID_AA64DFR0_EL1 so as to
cap the version to the minimum SPE version - if absolutely required.
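
A sketch of what that capping could look like in the ID register emulation
- kvm_spe_min_pmsver() and the placement are assumptions, not part of this
series:

/* Clamp the advertised PMSVer to the lowest version the VM may run on */
static u64 kvm_spe_clamp_dfr0(struct kvm_vcpu *vcpu, u64 val)
{
	u64 pmsver = cpuid_feature_extract_unsigned_field(val,
						ID_AA64DFR0_PMSVER_SHIFT);
	u64 min = kvm_spe_min_pmsver(vcpu->kvm);	/* hypothetical helper */

	if (pmsver > min) {
		val &= ~(0xfULL << ID_AA64DFR0_PMSVER_SHIFT);
		val |= min << ID_AA64DFR0_PMSVER_SHIFT;
	}

	return val;
}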

Thanks,

Andrew Murray

> 
> 
> > > >  That fits with your idea
> > > > above,
> > > > where you basically get an immediate exit if we try to schedule a vCPU
> > > > onto
> > > > a CPU that isn't part of the SPE mask.
> > > 
> > > Then it means that the VM should be configured with a mask indicating
> > > which CPUs it is intended to run on, and setting such a mask is mandatory
> > > for SPE.
> > 
> > Yeah, and this could probably all be wrapped up by userspace so you just
> > pass the SPE PMU name or something and it grabs the corresponding cpumask
> > for you.
> > 
> > > > > > One solution could be to allow scheduling onto non-SPE VCPUs but wrap
> > > > > > the
> > > > > > SPE save/restore code in a macro (much like kvm_arm_spe_v1_ready) that
> > > > > > reads the non-sanitised feature register. Therefore we don't go bang,
> > > > > > but
> > > > > > we also increase the size of any black-holes in SPE capturing. Though
> > > > > > this
> > > > > > feels like something that will cause grief down the line.
> > > > > >
> > > > > > Is there something else that can be done?
> > > > > 
> > > > > How does userspace deal with this? When SPE is only available on
> > > > > half of
> > > > > the CPUs, how does perf work in these conditions?
> > > > 
> > > > Not sure about userspace, but the kernel driver works by instantiating
> > > > an
> > > > SPE PMU instance only for the CPUs that have it and then that instance
> > > > profiles for only those CPUs. You also need to do something similar if
> > > > you had two CPU types with SPE, since the SPE configuration is likely to
> > > > be
> > > > different between them.
> > > 
> > > So that's closer to what Andrew was suggesting above (running a guest on a
> > > non-SPE CPU creates a profiling black hole). Except that we can't really
> > > run a SPE-enabled guest on a non-SPE CPU, as the SPE sysregs will UNDEF
> > > at EL1.
> > 
> > Right. I wouldn't suggest the "black hole" approach for VMs, but it works
> > for userspace so that's why the driver does it that way.
> > 
> > > Conclusion: we need a mix of a cpumask to indicate which CPUs we want to
> > > run on (generic, not-SPE related), 
> 
> If I understand correctly this mask isn't exposed to KVM (in the kernel) and
> KVM (in the kernel) is unware of how the CPUs that have KVM_RUN called are
> selected.
> 
> Thus this implies the cpumask is a feature of KVM tool or QEMU that would
> need to be added there. (E.g. kvm_cmd_run_work would set some affinity when
> creating pthreads - based on a CPU mask triggered by setting the --spe flag)?
> 
> Thanks,
> 
> Andrew Murray
> 
> > and a check for SPE-capable CPUs.
> > > If any of these condition is not satisfied, the vcpu exits for userspace
> > > to sort out the affinity.
> > > 
> > > I hate heterogeneous systems.
> > 
> > They hate you too ;)
> > 
> > Will
Will Deacon Jan. 9, 2020, 12:01 p.m. UTC | #11
On Thu, Jan 09, 2020 at 11:25:04AM +0000, Andrew Murray wrote:
> On Thu, Jan 09, 2020 at 11:23:37AM +0000, Andrew Murray wrote:
> > On Wed, Jan 08, 2020 at 01:10:21PM +0000, Will Deacon wrote:
> > > On Wed, Jan 08, 2020 at 12:36:11PM +0000, Marc Zyngier wrote:
> > > > On 2020-01-08 11:58, Will Deacon wrote:
> > > > > On Wed, Jan 08, 2020 at 11:17:16AM +0000, Marc Zyngier wrote:
> > > > > > On 2020-01-07 15:13, Andrew Murray wrote:
> > > > > > > Looking at the vcpu_load and related code, I don't see a way of saying
> > > > > > > 'don't schedule this VCPU on this CPU' or bailing in any way.
> > > > > > 
> > > > > > That would actually be pretty easy to implement. In vcpu_load(), check
> > > > > > that that the CPU physical has SPE. If not, raise a request for that
> > > > > > vcpu.
> > > > > > In the run loop, check for that request and abort if raised, returning
> > > > > > to userspace.
> > 
> > I hadn't really noticed the kvm_make_request mechanism - however it's now
> > clear how this could be implemented.
> > 
> > This approach gives responsibility for which CPUs should be used to userspace
> > and if userspace gets it wrong then the KVM_RUN ioctl won't do very much.
> > 
> > 
> > > > > > 
> > > > > > Userspace can always check /sys/devices/arm_spe_0/cpumask and work out
> > > > > > where to run that particular vcpu.
> > > > > 
> > > > > It's also worth considering systems where there are multiple
> > > > > implementations
> > > > > of SPE in play. Assuming we don't want to expose this to a guest, then
> > > > > the
> > > > > right interface here is probably for userspace to pick one SPE
> > > > > implementation and expose that to the guest.
> > 
> > If I understand correctly then this implies the following:
> > 
> >  - If the host userspace indicates it wants support for SPE in the guest (via 
> >    KVM_SET_DEVICE_ATTR at start of day) - then we should check in vcpu_load that
> >    the minimum version of SPE is present on the current CPU. 'minimum' because
> >    we don't know why userspace has selected the given cpumask.
> > 
> >  - Userspace can get it wrong, i.e. it can create a CPU mask with CPUs that
> >    have SPE with differing versions. If it does, and all CPUs have some form of
> >    SPE then errors may occur in the guest. Perhaps this is OK and userspace
> >    shouldn't get it wrong?
> 
> Actually this could be guarded against by emulating the ID_AA64DFR0_EL1 such to
> cap the version to the minimum SPE version - if absolutely required.

The problem is, it's not as simple as checking a version field. Instead,
you'd have to look at all of the ID registers for SPE so that you don't end
up with funny differences such as minimum sampling interval, or hardware RNG
support. Ultimately though, *much* of the trace is going to be describing
IMP DEF stuff because it's so micro-architectural, and there's very little
you can do to hide that.

Will
Andrew Murray Jan. 10, 2020, 10:54 a.m. UTC | #12
On Sat, Dec 21, 2019 at 02:13:25PM +0000, Marc Zyngier wrote:
> On Fri, 20 Dec 2019 14:30:16 +0000
> Andrew Murray <andrew.murray@arm.com> wrote:
> 
> [somehow managed not to do a reply all, re-sending]
> 
> > From: Sudeep Holla <sudeep.holla@arm.com>
> > 
> > Now that we can save/restore the full SPE controls, we can enable it
> > if SPE is setup and ready to use in KVM. It's supported in KVM only if
> > all the CPUs in the system supports SPE.
> > 
> > However to support heterogenous systems, we need to move the check if
> > host supports SPE and do a partial save/restore.
> 
> No. Let's just not go down that path. For now, KVM on heterogeneous
> systems do not get SPE. If SPE has been enabled on a guest and a CPU
> comes up without SPE, this CPU should fail to boot (same as exposing a
> feature to userspace).
> 
> > 
> > Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
> > Signed-off-by: Andrew Murray <andrew.murray@arm.com>
> > ---
> >  arch/arm64/kvm/hyp/debug-sr.c | 33 ++++++++++++++++-----------------
> >  include/kvm/arm_spe.h         |  6 ++++++
> >  2 files changed, 22 insertions(+), 17 deletions(-)
> > 
> > diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c
> > index 12429b212a3a..d8d857067e6d 100644
> > --- a/arch/arm64/kvm/hyp/debug-sr.c
> > +++ b/arch/arm64/kvm/hyp/debug-sr.c
> > @@ -86,18 +86,13 @@
> >  	}
> >  
> >  static void __hyp_text
> > -__debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
> > +__debug_save_spe_context(struct kvm_cpu_context *ctxt, bool full_ctxt)
> >  {
> >  	u64 reg;
> >  
> >  	/* Clear pmscr in case of early return */
> >  	ctxt->sys_regs[PMSCR_EL1] = 0;
> >  
> > -	/* SPE present on this CPU? */
> > -	if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
> > -						  ID_AA64DFR0_PMSVER_SHIFT))
> > -		return;
> > -
> >  	/* Yes; is it owned by higher EL? */
> >  	reg = read_sysreg_s(SYS_PMBIDR_EL1);
> >  	if (reg & BIT(SYS_PMBIDR_EL1_P_SHIFT))
> > @@ -142,7 +137,7 @@ __debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
> >  }
> >  
> >  static void __hyp_text
> > -__debug_restore_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
> > +__debug_restore_spe_context(struct kvm_cpu_context *ctxt, bool full_ctxt)
> >  {
> >  	if (!ctxt->sys_regs[PMSCR_EL1])
> >  		return;
> > @@ -210,11 +205,14 @@ void __hyp_text __debug_restore_guest_context(struct kvm_vcpu *vcpu)
> >  	struct kvm_guest_debug_arch *host_dbg;
> >  	struct kvm_guest_debug_arch *guest_dbg;
> >  
> > +	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> > +	guest_ctxt = &vcpu->arch.ctxt;
> > +
> > +	__debug_restore_spe_context(guest_ctxt, kvm_arm_spe_v1_ready(vcpu));
> > +
> >  	if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
> >  		return;
> >  
> > -	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> > -	guest_ctxt = &vcpu->arch.ctxt;
> >  	host_dbg = &vcpu->arch.host_debug_state.regs;
> >  	guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
> >  
> > @@ -232,8 +230,7 @@ void __hyp_text __debug_restore_host_context(struct kvm_vcpu *vcpu)
> >  	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> >  	guest_ctxt = &vcpu->arch.ctxt;
> >  
> > -	if (!has_vhe())
> > -		__debug_restore_spe_nvhe(host_ctxt, false);
> > +	__debug_restore_spe_context(host_ctxt, kvm_arm_spe_v1_ready(vcpu));
> 
> So you now do an unconditional save/restore on the exit path for VHE as
> well? Even if the host isn't using the SPE HW? That's not acceptable
> as, in most cases, only the host /or/ the guest will use SPE. Here, you
> put a measurable overhead on each exit.
> 
> If the host is not using SPE, then the restore/save should happen in
> vcpu_load/vcpu_put. Only if the host is using SPE should you do
> something in the run loop. Of course, this only applies to VHE and
> non-VHE must switch eagerly.
> 

On VHE where SPE is used in the guest only - we save/restore in vcpu_load/put.

On VHE where SPE is used in the host only - we save/restore in the run loop.

On VHE where SPE is used in guest and host - we save/restore in the run loop.

As the guest can't trace EL2 it doesn't matter if we restore guest SPE early
in the vcpu_load/put functions. (I assume it doesn't matter that we restore
an EL0/EL1 profiling buffer address at this point and enable tracing given
that there is nothing to trace until entering the guest).

However the reason for moving save/restore to vcpu_load/put when the host is
using SPE is to minimise the host EL2 black-out window.
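
As a sketch of the VHE hand-off described above (host_spe_in_use() is a
hypothetical helper and none of this is from the series):

static void kvm_spe_vcpu_load_vhe(struct kvm_vcpu *vcpu)
{
	if (!has_vhe() || !kvm_arm_spe_v1_ready(vcpu))
		return;

	/*
	 * Host not profiling: restore the guest context lazily here (with
	 * the mirror image in vcpu_put), keeping the world-switch path
	 * free of SPE work.
	 */
	if (!host_spe_in_use())
		__debug_restore_spe_context(&vcpu->arch.ctxt, true);

	/* Host profiling too: leave it to the run loop to switch eagerly */
}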


On nVHE we always save/restore in the run loop. For the SPE guest-use-only
use-case we can't save/restore in vcpu_load/put - because the guest runs at
the same ELx level as the host - and thus doing so would result in the guest
tracing part of the host.

Though if we determine that (for nVHE systems) the guest SPE is profiling only
EL0 - then we could also save/restore in vcpu_load/put where SPE is only being
used in the guest.

Does that make sense, are my reasons correct?

Thanks,

Andrew Murray


> >  
> >  	if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
> >  		return;
> > @@ -249,19 +246,21 @@ void __hyp_text __debug_restore_host_context(struct kvm_vcpu *vcpu)
> >  
> >  void __hyp_text __debug_save_host_context(struct kvm_vcpu *vcpu)
> >  {
> > -	/*
> > -	 * Non-VHE: Disable and flush SPE data generation
> > -	 * VHE: The vcpu can run, but it can't hide.
> > -	 */
> >  	struct kvm_cpu_context *host_ctxt;
> >  
> >  	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> > -	if (!has_vhe())
> > -		__debug_save_spe_nvhe(host_ctxt, false);
> > +	if (cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
> > +						 ID_AA64DFR0_PMSVER_SHIFT))
> > +		__debug_save_spe_context(host_ctxt, kvm_arm_spe_v1_ready(vcpu));
> >  }
> >  
> >  void __hyp_text __debug_save_guest_context(struct kvm_vcpu *vcpu)
> >  {
> > +	bool kvm_spe_ready = kvm_arm_spe_v1_ready(vcpu);
> > +
> > +	/* SPE present on this vCPU? */
> > +	if (kvm_spe_ready)
> > +		__debug_save_spe_context(&vcpu->arch.ctxt, kvm_spe_ready);
> >  }
> >  
> >  u32 __hyp_text __kvm_get_mdcr_el2(void)
> > diff --git a/include/kvm/arm_spe.h b/include/kvm/arm_spe.h
> > index 48d118fdb174..30c40b1bc385 100644
> > --- a/include/kvm/arm_spe.h
> > +++ b/include/kvm/arm_spe.h
> > @@ -16,4 +16,10 @@ struct kvm_spe {
> >  	bool irq_level;
> >  };
> >  
> > +#ifdef CONFIG_KVM_ARM_SPE
> > +#define kvm_arm_spe_v1_ready(v)		((v)->arch.spe.ready)
> > +#else
> > +#define kvm_arm_spe_v1_ready(v)		(false)
> > +#endif /* CONFIG_KVM_ARM_SPE */
> > +
> >  #endif /* __ASM_ARM_KVM_SPE_H */
> 
> Thanks,
> 
> 	M.
> -- 
> Jazz is not dead. It just smells funny...
Andrew Murray Jan. 10, 2020, 11:04 a.m. UTC | #13
On Fri, Jan 10, 2020 at 10:54:36AM +0000, Andrew Murray wrote:
> On Sat, Dec 21, 2019 at 02:13:25PM +0000, Marc Zyngier wrote:
> > On Fri, 20 Dec 2019 14:30:16 +0000
> > Andrew Murray <andrew.murray@arm.com> wrote:
> > 
> > [somehow managed not to do a reply all, re-sending]
> > 
> > > From: Sudeep Holla <sudeep.holla@arm.com>
> > > 
> > > Now that we can save/restore the full SPE controls, we can enable it
> > > if SPE is setup and ready to use in KVM. It's supported in KVM only if
> > > all the CPUs in the system supports SPE.
> > > 
> > > However to support heterogenous systems, we need to move the check if
> > > host supports SPE and do a partial save/restore.
> > 
> > No. Let's just not go down that path. For now, KVM on heterogeneous
> > systems do not get SPE. If SPE has been enabled on a guest and a CPU
> > comes up without SPE, this CPU should fail to boot (same as exposing a
> > feature to userspace).
> > 
> > > 
> > > Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
> > > Signed-off-by: Andrew Murray <andrew.murray@arm.com>
> > > ---
> > >  arch/arm64/kvm/hyp/debug-sr.c | 33 ++++++++++++++++-----------------
> > >  include/kvm/arm_spe.h         |  6 ++++++
> > >  2 files changed, 22 insertions(+), 17 deletions(-)
> > > 
> > > diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c
> > > index 12429b212a3a..d8d857067e6d 100644
> > > --- a/arch/arm64/kvm/hyp/debug-sr.c
> > > +++ b/arch/arm64/kvm/hyp/debug-sr.c
> > > @@ -86,18 +86,13 @@
> > >  	}
> > >  
> > >  static void __hyp_text
> > > -__debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
> > > +__debug_save_spe_context(struct kvm_cpu_context *ctxt, bool full_ctxt)
> > >  {
> > >  	u64 reg;
> > >  
> > >  	/* Clear pmscr in case of early return */
> > >  	ctxt->sys_regs[PMSCR_EL1] = 0;
> > >  
> > > -	/* SPE present on this CPU? */
> > > -	if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
> > > -						  ID_AA64DFR0_PMSVER_SHIFT))
> > > -		return;
> > > -
> > >  	/* Yes; is it owned by higher EL? */
> > >  	reg = read_sysreg_s(SYS_PMBIDR_EL1);
> > >  	if (reg & BIT(SYS_PMBIDR_EL1_P_SHIFT))
> > > @@ -142,7 +137,7 @@ __debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
> > >  }
> > >  
> > >  static void __hyp_text
> > > -__debug_restore_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
> > > +__debug_restore_spe_context(struct kvm_cpu_context *ctxt, bool full_ctxt)
> > >  {
> > >  	if (!ctxt->sys_regs[PMSCR_EL1])
> > >  		return;
> > > @@ -210,11 +205,14 @@ void __hyp_text __debug_restore_guest_context(struct kvm_vcpu *vcpu)
> > >  	struct kvm_guest_debug_arch *host_dbg;
> > >  	struct kvm_guest_debug_arch *guest_dbg;
> > >  
> > > +	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> > > +	guest_ctxt = &vcpu->arch.ctxt;
> > > +
> > > +	__debug_restore_spe_context(guest_ctxt, kvm_arm_spe_v1_ready(vcpu));
> > > +
> > >  	if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
> > >  		return;
> > >  
> > > -	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> > > -	guest_ctxt = &vcpu->arch.ctxt;
> > >  	host_dbg = &vcpu->arch.host_debug_state.regs;
> > >  	guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
> > >  
> > > @@ -232,8 +230,7 @@ void __hyp_text __debug_restore_host_context(struct kvm_vcpu *vcpu)
> > >  	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> > >  	guest_ctxt = &vcpu->arch.ctxt;
> > >  
> > > -	if (!has_vhe())
> > > -		__debug_restore_spe_nvhe(host_ctxt, false);
> > > +	__debug_restore_spe_context(host_ctxt, kvm_arm_spe_v1_ready(vcpu));
> > 
> > So you now do an unconditional save/restore on the exit path for VHE as
> > well? Even if the host isn't using the SPE HW? That's not acceptable
> > as, in most cases, only the host /or/ the guest will use SPE. Here, you
> > put a measurable overhead on each exit.
> > 
> > If the host is not using SPE, then the restore/save should happen in
> > vcpu_load/vcpu_put. Only if the host is using SPE should you do
> > something in the run loop. Of course, this only applies to VHE and
> > non-VHE must switch eagerly.
> > 
> 
> On VHE where SPE is used in the guest only - we save/restore in vcpu_load/put.
> 
> On VHE where SPE is used in the host only - we save/restore in the run loop.
> 
> On VHE where SPE is used in guest and host - we save/restore in the run loop.
> 
> As the guest can't trace EL2 it doesn't matter if we restore guest SPE early
> in the vcpu_load/put functions. (I assume it doesn't matter that we restore
> an EL0/EL1 profiling buffer address at this point and enable tracing given
> that there is nothing to trace until entering the guest).
> 
> However the reason for moving save/restore to vcpu_load/put when the host is
> using SPE is to minimise the host EL2 black-out window.
> 
> 
> On nVHE we always save/restore in the run loop. For the SPE guest-use-only
> use-case we can't save/restore in vcpu_load/put - because the guest runs at
> the same ELx level as the host - and thus doing so would result in the guest
> tracing part of the host.
> 
> Though if we determine that (for nVHE systems) the guest SPE is profiling only
> EL0 - then we could also save/restore in vcpu_load/put where SPE is only being
> used in the guest.
> 
> Does that make sense, are my reasons correct?

Also I'm making the following assumptions:

 - We determine whether the host or guest is using SPE by checking whether
   profiling (e.g. PMSCR_EL1) is enabled. That should determine *when* we
   restore, as per my previous email.

 - I'm less sure on this: we should determine *what* we restore based on the
   availability of the SPE feature and not on whether it is being used - so for
   the guest this is whether the feature is set on the vcpu, and for the host
   it is based on the CPU feature registers.

   The downside of this is that if SPE support is present on both guest and
   host but neither is using it, then we still save/restore upon entering/
   leaving a guest. The reason I feel this is needed is to prevent the issue
   where the host starts programming the SPE registers but is preempted by
   KVM entering a guest before it could enable host SPE. If we only save a
   partial set of registers in that case, then on return the host SPE carries
   on from where it left off and enables profiling - yet because we didn't
   restore all the programmed registers it doesn't work (sketched below).
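
To make that preemption scenario concrete, here is a minimal sketch of the
host-side save path I'm arguing for. host_has_spe() and __save_spe_regs() are
made-up helpers, not functions from this series:

static void __hyp_text __debug_save_host_spe(struct kvm_cpu_context *host_ctxt)
{
	/* Presence, not use: assumed to check ID_AA64DFR0_EL1.PMSVer. */
	if (!host_has_spe())
		return;

	/*
	 * Save the full register file even when PMSCR_EL1 == 0: the host may
	 * already have programmed PMBPTR_EL1/PMBLIMITR_EL1/PMSFCR_EL1 and
	 * only been preempted before it could set PMSCR_EL1.
	 */
	__save_spe_regs(host_ctxt);
}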

Thanks,

Andrew Murray

> 
> Thanks,
> 
> Andrew Murray
> 
> 
> > >  
> > >  	if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
> > >  		return;
> > > @@ -249,19 +246,21 @@ void __hyp_text __debug_restore_host_context(struct kvm_vcpu *vcpu)
> > >  
> > >  void __hyp_text __debug_save_host_context(struct kvm_vcpu *vcpu)
> > >  {
> > > -	/*
> > > -	 * Non-VHE: Disable and flush SPE data generation
> > > -	 * VHE: The vcpu can run, but it can't hide.
> > > -	 */
> > >  	struct kvm_cpu_context *host_ctxt;
> > >  
> > >  	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> > > -	if (!has_vhe())
> > > -		__debug_save_spe_nvhe(host_ctxt, false);
> > > +	if (cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
> > > +						 ID_AA64DFR0_PMSVER_SHIFT))
> > > +		__debug_save_spe_context(host_ctxt, kvm_arm_spe_v1_ready(vcpu));
> > >  }
> > >  
> > >  void __hyp_text __debug_save_guest_context(struct kvm_vcpu *vcpu)
> > >  {
> > > +	bool kvm_spe_ready = kvm_arm_spe_v1_ready(vcpu);
> > > +
> > > +	/* SPE present on this vCPU? */
> > > +	if (kvm_spe_ready)
> > > +		__debug_save_spe_context(&vcpu->arch.ctxt, kvm_spe_ready);
> > >  }
> > >  
> > >  u32 __hyp_text __kvm_get_mdcr_el2(void)
> > > diff --git a/include/kvm/arm_spe.h b/include/kvm/arm_spe.h
> > > index 48d118fdb174..30c40b1bc385 100644
> > > --- a/include/kvm/arm_spe.h
> > > +++ b/include/kvm/arm_spe.h
> > > @@ -16,4 +16,10 @@ struct kvm_spe {
> > >  	bool irq_level;
> > >  };
> > >  
> > > +#ifdef CONFIG_KVM_ARM_SPE
> > > +#define kvm_arm_spe_v1_ready(v)		((v)->arch.spe.ready)
> > > +#else
> > > +#define kvm_arm_spe_v1_ready(v)		(false)
> > > +#endif /* CONFIG_KVM_ARM_SPE */
> > > +
> > >  #endif /* __ASM_ARM_KVM_SPE_H */
> > 
> > Thanks,
> > 
> > 	M.
> > -- 
> > Jazz is not dead. It just smells funny...
> _______________________________________________
> kvmarm mailing list
> kvmarm@lists.cs.columbia.edu
> https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
Marc Zyngier Jan. 10, 2020, 11:18 a.m. UTC | #14
On 2020-01-10 10:54, Andrew Murray wrote:
> On Sat, Dec 21, 2019 at 02:13:25PM +0000, Marc Zyngier wrote:
>> On Fri, 20 Dec 2019 14:30:16 +0000
>> Andrew Murray <andrew.murray@arm.com> wrote:
>> 
>> [somehow managed not to do a reply all, re-sending]
>> 
>> > From: Sudeep Holla <sudeep.holla@arm.com>
>> >
>> > Now that we can save/restore the full SPE controls, we can enable it
>> > if SPE is setup and ready to use in KVM. It's supported in KVM only if
>> > all the CPUs in the system supports SPE.
>> >
>> > However to support heterogenous systems, we need to move the check if
>> > host supports SPE and do a partial save/restore.
>> 
>> No. Let's just not go down that path. For now, KVM on heterogeneous
>> systems do not get SPE. If SPE has been enabled on a guest and a CPU
>> comes up without SPE, this CPU should fail to boot (same as exposing a
>> feature to userspace).
>> 
>> >
>> > Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
>> > Signed-off-by: Andrew Murray <andrew.murray@arm.com>
>> > ---
>> >  arch/arm64/kvm/hyp/debug-sr.c | 33 ++++++++++++++++-----------------
>> >  include/kvm/arm_spe.h         |  6 ++++++
>> >  2 files changed, 22 insertions(+), 17 deletions(-)
>> >
>> > diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c
>> > index 12429b212a3a..d8d857067e6d 100644
>> > --- a/arch/arm64/kvm/hyp/debug-sr.c
>> > +++ b/arch/arm64/kvm/hyp/debug-sr.c
>> > @@ -86,18 +86,13 @@
>> >  	}
>> >
>> >  static void __hyp_text
>> > -__debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
>> > +__debug_save_spe_context(struct kvm_cpu_context *ctxt, bool full_ctxt)
>> >  {
>> >  	u64 reg;
>> >
>> >  	/* Clear pmscr in case of early return */
>> >  	ctxt->sys_regs[PMSCR_EL1] = 0;
>> >
>> > -	/* SPE present on this CPU? */
>> > -	if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
>> > -						  ID_AA64DFR0_PMSVER_SHIFT))
>> > -		return;
>> > -
>> >  	/* Yes; is it owned by higher EL? */
>> >  	reg = read_sysreg_s(SYS_PMBIDR_EL1);
>> >  	if (reg & BIT(SYS_PMBIDR_EL1_P_SHIFT))
>> > @@ -142,7 +137,7 @@ __debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
>> >  }
>> >
>> >  static void __hyp_text
>> > -__debug_restore_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
>> > +__debug_restore_spe_context(struct kvm_cpu_context *ctxt, bool full_ctxt)
>> >  {
>> >  	if (!ctxt->sys_regs[PMSCR_EL1])
>> >  		return;
>> > @@ -210,11 +205,14 @@ void __hyp_text __debug_restore_guest_context(struct kvm_vcpu *vcpu)
>> >  	struct kvm_guest_debug_arch *host_dbg;
>> >  	struct kvm_guest_debug_arch *guest_dbg;
>> >
>> > +	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
>> > +	guest_ctxt = &vcpu->arch.ctxt;
>> > +
>> > +	__debug_restore_spe_context(guest_ctxt, kvm_arm_spe_v1_ready(vcpu));
>> > +
>> >  	if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
>> >  		return;
>> >
>> > -	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
>> > -	guest_ctxt = &vcpu->arch.ctxt;
>> >  	host_dbg = &vcpu->arch.host_debug_state.regs;
>> >  	guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
>> >
>> > @@ -232,8 +230,7 @@ void __hyp_text __debug_restore_host_context(struct kvm_vcpu *vcpu)
>> >  	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
>> >  	guest_ctxt = &vcpu->arch.ctxt;
>> >
>> > -	if (!has_vhe())
>> > -		__debug_restore_spe_nvhe(host_ctxt, false);
>> > +	__debug_restore_spe_context(host_ctxt, kvm_arm_spe_v1_ready(vcpu));
>> 
>> So you now do an unconditional save/restore on the exit path for VHE 
>> as
>> well? Even if the host isn't using the SPE HW? That's not acceptable
>> as, in most cases, only the host /or/ the guest will use SPE. Here, 
>> you
>> put a measurable overhead on each exit.
>> 
>> If the host is not using SPE, then the restore/save should happen in
>> vcpu_load/vcpu_put. Only if the host is using SPE should you do
>> something in the run loop. Of course, this only applies to VHE and
>> non-VHE must switch eagerly.
>> 
> 
> On VHE where SPE is used in the guest only - we save/restore in 
> vcpu_load/put.

Yes.

> On VHE where SPE is used in the host only - we save/restore in the run 
> loop.

Why? If only the host is using SPE, why should we do *anything at all*?

> On VHE where SPE is used in guest and host - we save/restore in the run 
> loop.
> 
> As the guest can't trace EL2 it doesn't matter if we restore guest SPE 
> early
> in the vcpu_load/put functions. (I assume it doesn't matter that we 
> restore
> an EL0/EL1 profiling buffer address at this point and enable tracing 
> given
> that there is nothing to trace until entering the guest).

As long as you do it after the EL1 sysregs have been restored so that the
SPE HW has a valid context, we should be fine. Don't restore it before that
point though (you have no idea whether the SPE HW can do speculative memory
accesses that would use the wrong page tables).

> However the reason for moving save/restore to vcpu_load/put when the 
> host is
> using SPE is to minimise the host EL2 black-out window.

You should move it to *the run loop* when both host and guest are using SPE.

> On nVHE we always save/restore in the run loop. For the SPE 
> guest-use-only
> use-case we can't save/restore in vcpu_load/put - because the guest 
> runs at
> the same ELx level as the host - and thus doing so would result in the 
> guest
> tracing part of the host.

Not only. It would actively corrupt memory in the host by using the wrong
page tables.

> Though if we determine that (for nVHE systems) the guest SPE is 
> profiling only
> EL0 - then we could also save/restore in vcpu_load/put where SPE is 
> only being
> used in the guest.

Same as above: wrong MM context, speculation, potential memory corruption.

> Does that make sense, are my reasons correct?

Not entirely. I think you should use the following table:

VHE | Host-SPE | Guest-SPE | Switch location
  0  |     0    |     0     | none
  0  |     0    |     1     | run loop
  0  |     1    |     0     | run loop
  0  |     1    |     1     | run loop
  1  |     0    |     0     | none
  1  |     0    |     1     | load/put
  1  |     1    |     0     | none
  1  |     1    |     1     | run loop
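
For clarity, the same table as a minimal C sketch; vcpu_has_spe() and
host_is_using_spe() are hypothetical predicates standing in for whatever
checks the series ends up using:

enum spe_switch_location {
	SPE_SWITCH_NONE,
	SPE_SWITCH_LOAD_PUT,	/* save/restore in vcpu_load()/vcpu_put() */
	SPE_SWITCH_RUN_LOOP,	/* save/restore around each guest entry/exit */
};

static enum spe_switch_location spe_switch_location(struct kvm_vcpu *vcpu)
{
	bool guest_spe = vcpu_has_spe(vcpu);	/* hypothetical */
	bool host_spe  = host_is_using_spe();	/* hypothetical */

	if (!has_vhe())
		return (guest_spe || host_spe) ? SPE_SWITCH_RUN_LOOP
					       : SPE_SWITCH_NONE;

	if (guest_spe && host_spe)
		return SPE_SWITCH_RUN_LOOP;
	if (guest_spe)
		return SPE_SWITCH_LOAD_PUT;

	/* Host-only or idle: guest accesses are trapped, nothing to switch. */
	return SPE_SWITCH_NONE;
}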

Thanks,

         M.
Marc Zyngier Jan. 10, 2020, 11:51 a.m. UTC | #15
On 2020-01-10 11:04, Andrew Murray wrote:
> On Fri, Jan 10, 2020 at 10:54:36AM +0000, Andrew Murray wrote:
>> On Sat, Dec 21, 2019 at 02:13:25PM +0000, Marc Zyngier wrote:
>> > On Fri, 20 Dec 2019 14:30:16 +0000
>> > Andrew Murray <andrew.murray@arm.com> wrote:
>> >
>> > [somehow managed not to do a reply all, re-sending]
>> >
>> > > From: Sudeep Holla <sudeep.holla@arm.com>
>> > >
>> > > Now that we can save/restore the full SPE controls, we can enable it
>> > > if SPE is setup and ready to use in KVM. It's supported in KVM only if
>> > > all the CPUs in the system supports SPE.
>> > >
>> > > However to support heterogenous systems, we need to move the check if
>> > > host supports SPE and do a partial save/restore.
>> >
>> > No. Let's just not go down that path. For now, KVM on heterogeneous
>> > systems do not get SPE. If SPE has been enabled on a guest and a CPU
>> > comes up without SPE, this CPU should fail to boot (same as exposing a
>> > feature to userspace).
>> >
>> > >
>> > > Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
>> > > Signed-off-by: Andrew Murray <andrew.murray@arm.com>
>> > > ---
>> > >  arch/arm64/kvm/hyp/debug-sr.c | 33 ++++++++++++++++-----------------
>> > >  include/kvm/arm_spe.h         |  6 ++++++
>> > >  2 files changed, 22 insertions(+), 17 deletions(-)
>> > >
>> > > diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c
>> > > index 12429b212a3a..d8d857067e6d 100644
>> > > --- a/arch/arm64/kvm/hyp/debug-sr.c
>> > > +++ b/arch/arm64/kvm/hyp/debug-sr.c
>> > > @@ -86,18 +86,13 @@
>> > >  	}
>> > >
>> > >  static void __hyp_text
>> > > -__debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
>> > > +__debug_save_spe_context(struct kvm_cpu_context *ctxt, bool full_ctxt)
>> > >  {
>> > >  	u64 reg;
>> > >
>> > >  	/* Clear pmscr in case of early return */
>> > >  	ctxt->sys_regs[PMSCR_EL1] = 0;
>> > >
>> > > -	/* SPE present on this CPU? */
>> > > -	if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
>> > > -						  ID_AA64DFR0_PMSVER_SHIFT))
>> > > -		return;
>> > > -
>> > >  	/* Yes; is it owned by higher EL? */
>> > >  	reg = read_sysreg_s(SYS_PMBIDR_EL1);
>> > >  	if (reg & BIT(SYS_PMBIDR_EL1_P_SHIFT))
>> > > @@ -142,7 +137,7 @@ __debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
>> > >  }
>> > >
>> > >  static void __hyp_text
>> > > -__debug_restore_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
>> > > +__debug_restore_spe_context(struct kvm_cpu_context *ctxt, bool full_ctxt)
>> > >  {
>> > >  	if (!ctxt->sys_regs[PMSCR_EL1])
>> > >  		return;
>> > > @@ -210,11 +205,14 @@ void __hyp_text __debug_restore_guest_context(struct kvm_vcpu *vcpu)
>> > >  	struct kvm_guest_debug_arch *host_dbg;
>> > >  	struct kvm_guest_debug_arch *guest_dbg;
>> > >
>> > > +	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
>> > > +	guest_ctxt = &vcpu->arch.ctxt;
>> > > +
>> > > +	__debug_restore_spe_context(guest_ctxt, kvm_arm_spe_v1_ready(vcpu));
>> > > +
>> > >  	if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
>> > >  		return;
>> > >
>> > > -	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
>> > > -	guest_ctxt = &vcpu->arch.ctxt;
>> > >  	host_dbg = &vcpu->arch.host_debug_state.regs;
>> > >  	guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
>> > >
>> > > @@ -232,8 +230,7 @@ void __hyp_text __debug_restore_host_context(struct kvm_vcpu *vcpu)
>> > >  	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
>> > >  	guest_ctxt = &vcpu->arch.ctxt;
>> > >
>> > > -	if (!has_vhe())
>> > > -		__debug_restore_spe_nvhe(host_ctxt, false);
>> > > +	__debug_restore_spe_context(host_ctxt, kvm_arm_spe_v1_ready(vcpu));
>> >
>> > So you now do an unconditional save/restore on the exit path for VHE as
>> > well? Even if the host isn't using the SPE HW? That's not acceptable
>> > as, in most cases, only the host /or/ the guest will use SPE. Here, you
>> > put a measurable overhead on each exit.
>> >
>> > If the host is not using SPE, then the restore/save should happen in
>> > vcpu_load/vcpu_put. Only if the host is using SPE should you do
>> > something in the run loop. Of course, this only applies to VHE and
>> > non-VHE must switch eagerly.
>> >
>> 
>> On VHE where SPE is used in the guest only - we save/restore in 
>> vcpu_load/put.
>> 
>> On VHE where SPE is used in the host only - we save/restore in the run 
>> loop.
>> 
>> On VHE where SPE is used in guest and host - we save/restore in the 
>> run loop.
>> 
>> As the guest can't trace EL2 it doesn't matter if we restore guest SPE 
>> early
>> in the vcpu_load/put functions. (I assume it doesn't matter that we 
>> restore
>> an EL0/EL1 profiling buffer address at this point and enable tracing 
>> given
>> that there is nothing to trace until entering the guest).
>> 
>> However the reason for moving save/restore to vcpu_load/put when the 
>> host is
>> using SPE is to minimise the host EL2 black-out window.
>> 
>> 
>> On nVHE we always save/restore in the run loop. For the SPE 
>> guest-use-only
>> use-case we can't save/restore in vcpu_load/put - because the guest 
>> runs at
>> the same ELx level as the host - and thus doing so would result in the 
>> guest
>> tracing part of the host.
>> 
>> Though if we determine that (for nVHE systems) the guest SPE is 
>> profiling only
>> EL0 - then we could also save/restore in vcpu_load/put where SPE is 
>> only being
>> used in the guest.
>> 
>> Does that make sense, are my reasons correct?
> 
> Also I'm making the following assumptions:
> 
>  - We determine if the host or guest are using SPE by seeing if 
> profiling
>    (e.g. PMSCR_EL1) is enabled. That should determine *when* we restore 
> as per
>    my previous email.

Yes.

>  - I'm less sure on this: We should determine *what* we restore based 
> on the
>    availability of the SPE feature and not if it is being used - so for 
> guest
>    this is if the guest has the feature on the vcpu. For host this is 
> based on
>    the CPU feature registers.

As long as the guest's feature is conditioned on the HW being present *and*
on running on a CPU that has the HW.

>    The downshot of this is that if you have SPE support present on 
> guest and
>    host and they aren't being used, then you still save/restore upon 
> entering/
>    leaving a guest. The reason I feel this is needed is to prevent the 
> issue
>    where the host starts programming the SPE registers, but is 
> preempted by
>    KVM entering a guest, before it could enable host SPE. Thus when we 
> enter the
>    guest we don't save all the registers, we return to the host and the 
> host
>    SPE carries on from where it left of and enables it - yet because we 
> didn't
>    restore all the programmed registers it doesn't work.

Saving the host registers is never optional if they are shared with the 
guest.

         M.
Andrew Murray Jan. 10, 2020, 12:12 p.m. UTC | #16
On Fri, Jan 10, 2020 at 11:18:48AM +0000, Marc Zyngier wrote:
> On 2020-01-10 10:54, Andrew Murray wrote:
> > On Sat, Dec 21, 2019 at 02:13:25PM +0000, Marc Zyngier wrote:
> > > On Fri, 20 Dec 2019 14:30:16 +0000
> > > Andrew Murray <andrew.murray@arm.com> wrote:
> > > 
> > > [somehow managed not to do a reply all, re-sending]
> > > 
> > > > From: Sudeep Holla <sudeep.holla@arm.com>
> > > >
> > > > Now that we can save/restore the full SPE controls, we can enable it
> > > > if SPE is setup and ready to use in KVM. It's supported in KVM only if
> > > > all the CPUs in the system supports SPE.
> > > >
> > > > However to support heterogenous systems, we need to move the check if
> > > > host supports SPE and do a partial save/restore.
> > > 
> > > No. Let's just not go down that path. For now, KVM on heterogeneous
> > > systems do not get SPE. If SPE has been enabled on a guest and a CPU
> > > comes up without SPE, this CPU should fail to boot (same as exposing a
> > > feature to userspace).
> > > 
> > > >
> > > > Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
> > > > Signed-off-by: Andrew Murray <andrew.murray@arm.com>
> > > > ---
> > > >  arch/arm64/kvm/hyp/debug-sr.c | 33 ++++++++++++++++-----------------
> > > >  include/kvm/arm_spe.h         |  6 ++++++
> > > >  2 files changed, 22 insertions(+), 17 deletions(-)
> > > >
> > > > diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c
> > > > index 12429b212a3a..d8d857067e6d 100644
> > > > --- a/arch/arm64/kvm/hyp/debug-sr.c
> > > > +++ b/arch/arm64/kvm/hyp/debug-sr.c
> > > > @@ -86,18 +86,13 @@
> > > >  	}
> > > >
> > > >  static void __hyp_text
> > > > -__debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
> > > > +__debug_save_spe_context(struct kvm_cpu_context *ctxt, bool full_ctxt)
> > > >  {
> > > >  	u64 reg;
> > > >
> > > >  	/* Clear pmscr in case of early return */
> > > >  	ctxt->sys_regs[PMSCR_EL1] = 0;
> > > >
> > > > -	/* SPE present on this CPU? */
> > > > -	if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
> > > > -						  ID_AA64DFR0_PMSVER_SHIFT))
> > > > -		return;
> > > > -
> > > >  	/* Yes; is it owned by higher EL? */
> > > >  	reg = read_sysreg_s(SYS_PMBIDR_EL1);
> > > >  	if (reg & BIT(SYS_PMBIDR_EL1_P_SHIFT))
> > > > @@ -142,7 +137,7 @@ __debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
> > > >  }
> > > >
> > > >  static void __hyp_text
> > > > -__debug_restore_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
> > > > +__debug_restore_spe_context(struct kvm_cpu_context *ctxt, bool full_ctxt)
> > > >  {
> > > >  	if (!ctxt->sys_regs[PMSCR_EL1])
> > > >  		return;
> > > > @@ -210,11 +205,14 @@ void __hyp_text __debug_restore_guest_context(struct kvm_vcpu *vcpu)
> > > >  	struct kvm_guest_debug_arch *host_dbg;
> > > >  	struct kvm_guest_debug_arch *guest_dbg;
> > > >
> > > > +	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> > > > +	guest_ctxt = &vcpu->arch.ctxt;
> > > > +
> > > > +	__debug_restore_spe_context(guest_ctxt, kvm_arm_spe_v1_ready(vcpu));
> > > > +
> > > >  	if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
> > > >  		return;
> > > >
> > > > -	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> > > > -	guest_ctxt = &vcpu->arch.ctxt;
> > > >  	host_dbg = &vcpu->arch.host_debug_state.regs;
> > > >  	guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
> > > >
> > > > @@ -232,8 +230,7 @@ void __hyp_text __debug_restore_host_context(struct kvm_vcpu *vcpu)
> > > >  	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> > > >  	guest_ctxt = &vcpu->arch.ctxt;
> > > >
> > > > -	if (!has_vhe())
> > > > -		__debug_restore_spe_nvhe(host_ctxt, false);
> > > > +	__debug_restore_spe_context(host_ctxt, kvm_arm_spe_v1_ready(vcpu));
> > > 
> > > So you now do an unconditional save/restore on the exit path for VHE
> > > as
> > > well? Even if the host isn't using the SPE HW? That's not acceptable
> > > as, in most cases, only the host /or/ the guest will use SPE. Here,
> > > you
> > > put a measurable overhead on each exit.
> > > 
> > > If the host is not using SPE, then the restore/save should happen in
> > > vcpu_load/vcpu_put. Only if the host is using SPE should you do
> > > something in the run loop. Of course, this only applies to VHE and
> > > non-VHE must switch eagerly.
> > > 
> > 
> > On VHE where SPE is used in the guest only - we save/restore in
> > vcpu_load/put.
> 
> Yes.
> 
> > On VHE where SPE is used in the host only - we save/restore in the run
> > loop.
> 
> Why? If only the host is using SPE, why should we do *anything at all*?

Oh yeah of course, we trap them in this case.

(Do I understand correctly that we don't/can't trap them for nVHE? - and so
we should save/restore them for this use-case in nVHE)


> 
> > On VHE where SPE is used in guest and host - we save/restore in the run
> > loop.
> > 
> > As the guest can't trace EL2 it doesn't matter if we restore guest SPE
> > early
> > in the vcpu_load/put functions. (I assume it doesn't matter that we
> > restore
> > an EL0/EL1 profiling buffer address at this point and enable tracing
> > given
> > that there is nothing to trace until entering the guest).
> 
> As long as you do it after the EL1 sysregs have need restored so that the
> SPE
> HW has a valid context, we should be fine. Don't restore it before that
> point
> though (you have no idea whether the SPE HW can do speculative memory
> accesses
> that would use the wrong page tables).

Right, so don't enable tracing until SPE has a valid context. I understand
that to mean at least having the SPE buffer address registers (PMBPTR,
PMBLIMITR) in the right context with respect to the E2PB bits (translation
regime) and having those tables mapped in (which I think relates to the
__activateX, __sysreg_restore_guest_stateX type of calls in kvm_vcpu_run_X,
right?).

I think that means we can restore the registers no earlier than vcpu_load/put,
but we can't re-enable tracing (PMSCR) until just before
__set_guest_arch_workaround_state. I think that applies to both VHE and nVHE?
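
As a rough sketch of the ordering I have in mind (illustrative only - every
helper name below is made up, none are functions from this series):

static void kvm_vcpu_run_spe_sketch(struct kvm_vcpu *vcpu)
{
	/*
	 * EL1 sysregs (TTBR*_EL1, TCR_EL1, ...) first, so the SPE HW has a
	 * valid memory context before any buffer registers are programmed.
	 */
	__restore_guest_el1_sysregs(vcpu);

	/* Buffer pointer and limit can be programmed now... */
	__restore_guest_spe_buffer(vcpu);	/* PMBPTR_EL1, PMBLIMITR_EL1 */

	/* ...but profiling is only re-enabled immediately before entry. */
	__restore_guest_pmscr(vcpu);		/* PMSCR_EL1 */

	__enter_guest(vcpu);
}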

> 
> > However the reason for moving save/restore to vcpu_load/put when the
> > host is
> > using SPE is to minimise the host EL2 black-out window.
> 
> You should move it to *the run loop* when both host and guest are using SPE.
> 
> > On nVHE we always save/restore in the run loop. For the SPE
> > guest-use-only
> > use-case we can't save/restore in vcpu_load/put - because the guest runs
> > at
> > the same ELx level as the host - and thus doing so would result in the
> > guest
> > tracing part of the host.
> 
> Not only. It would actively corrupt memory in the host by using the wrong
> page tables.
> 
> > Though if we determine that (for nVHE systems) the guest SPE is
> > profiling only
> > EL0 - then we could also save/restore in vcpu_load/put where SPE is only
> > being
> > used in the guest.
> 
> Same as above: wrong MM context, speculation, potential memory corruption.
> 
> > Does that make sense, are my reasons correct?
> 
> Not entirely. I think you should use the following table:
> 
> VHE | Host-SPE | Guest-SPE | Switch location
>  0  |     0    |     0     | none
>  0  |     0    |     1     | run loop
>  0  |     1    |     0     | run loop
>  0  |     1    |     1     | run loop
>  1  |     0    |     0     | none
>  1  |     0    |     1     | load/put
>  1  |     1    |     0     | none
>  1  |     1    |     1     | run loop

Thanks,

Andrew Murray

> 
> Thanks,
> 
>         M.
> -- 
> Jazz is not dead. It just smells funny...
Andrew Murray Jan. 10, 2020, 12:12 p.m. UTC | #17
On Fri, Jan 10, 2020 at 11:51:39AM +0000, Marc Zyngier wrote:
> On 2020-01-10 11:04, Andrew Murray wrote:
> > On Fri, Jan 10, 2020 at 10:54:36AM +0000, Andrew Murray wrote:
> > > On Sat, Dec 21, 2019 at 02:13:25PM +0000, Marc Zyngier wrote:
> > > > On Fri, 20 Dec 2019 14:30:16 +0000
> > > > Andrew Murray <andrew.murray@arm.com> wrote:
> > > >
> > > > [somehow managed not to do a reply all, re-sending]
> > > >
> > > > > From: Sudeep Holla <sudeep.holla@arm.com>
> > > > >
> > > > > Now that we can save/restore the full SPE controls, we can enable it
> > > > > if SPE is setup and ready to use in KVM. It's supported in KVM only if
> > > > > all the CPUs in the system supports SPE.
> > > > >
> > > > > However to support heterogenous systems, we need to move the check if
> > > > > host supports SPE and do a partial save/restore.
> > > >
> > > > No. Let's just not go down that path. For now, KVM on heterogeneous
> > > > systems do not get SPE. If SPE has been enabled on a guest and a CPU
> > > > comes up without SPE, this CPU should fail to boot (same as exposing a
> > > > feature to userspace).
> > > >
> > > > >
> > > > > Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
> > > > > Signed-off-by: Andrew Murray <andrew.murray@arm.com>
> > > > > ---
> > > > >  arch/arm64/kvm/hyp/debug-sr.c | 33 ++++++++++++++++-----------------
> > > > >  include/kvm/arm_spe.h         |  6 ++++++
> > > > >  2 files changed, 22 insertions(+), 17 deletions(-)
> > > > >
> > > > > diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c
> > > > > index 12429b212a3a..d8d857067e6d 100644
> > > > > --- a/arch/arm64/kvm/hyp/debug-sr.c
> > > > > +++ b/arch/arm64/kvm/hyp/debug-sr.c
> > > > > @@ -86,18 +86,13 @@
> > > > >  	}
> > > > >
> > > > >  static void __hyp_text
> > > > > -__debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
> > > > > +__debug_save_spe_context(struct kvm_cpu_context *ctxt, bool full_ctxt)
> > > > >  {
> > > > >  	u64 reg;
> > > > >
> > > > >  	/* Clear pmscr in case of early return */
> > > > >  	ctxt->sys_regs[PMSCR_EL1] = 0;
> > > > >
> > > > > -	/* SPE present on this CPU? */
> > > > > -	if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
> > > > > -						  ID_AA64DFR0_PMSVER_SHIFT))
> > > > > -		return;
> > > > > -
> > > > >  	/* Yes; is it owned by higher EL? */
> > > > >  	reg = read_sysreg_s(SYS_PMBIDR_EL1);
> > > > >  	if (reg & BIT(SYS_PMBIDR_EL1_P_SHIFT))
> > > > > @@ -142,7 +137,7 @@ __debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
> > > > >  }
> > > > >
> > > > >  static void __hyp_text
> > > > > -__debug_restore_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
> > > > > +__debug_restore_spe_context(struct kvm_cpu_context *ctxt, bool full_ctxt)
> > > > >  {
> > > > >  	if (!ctxt->sys_regs[PMSCR_EL1])
> > > > >  		return;
> > > > > @@ -210,11 +205,14 @@ void __hyp_text __debug_restore_guest_context(struct kvm_vcpu *vcpu)
> > > > >  	struct kvm_guest_debug_arch *host_dbg;
> > > > >  	struct kvm_guest_debug_arch *guest_dbg;
> > > > >
> > > > > +	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> > > > > +	guest_ctxt = &vcpu->arch.ctxt;
> > > > > +
> > > > > +	__debug_restore_spe_context(guest_ctxt, kvm_arm_spe_v1_ready(vcpu));
> > > > > +
> > > > >  	if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
> > > > >  		return;
> > > > >
> > > > > -	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> > > > > -	guest_ctxt = &vcpu->arch.ctxt;
> > > > >  	host_dbg = &vcpu->arch.host_debug_state.regs;
> > > > >  	guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
> > > > >
> > > > > @@ -232,8 +230,7 @@ void __hyp_text __debug_restore_host_context(struct kvm_vcpu *vcpu)
> > > > >  	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
> > > > >  	guest_ctxt = &vcpu->arch.ctxt;
> > > > >
> > > > > -	if (!has_vhe())
> > > > > -		__debug_restore_spe_nvhe(host_ctxt, false);
> > > > > +	__debug_restore_spe_context(host_ctxt, kvm_arm_spe_v1_ready(vcpu));
> > > >
> > > > So you now do an unconditional save/restore on the exit path for VHE as
> > > > well? Even if the host isn't using the SPE HW? That's not acceptable
> > > > as, in most cases, only the host /or/ the guest will use SPE. Here, you
> > > > put a measurable overhead on each exit.
> > > >
> > > > If the host is not using SPE, then the restore/save should happen in
> > > > vcpu_load/vcpu_put. Only if the host is using SPE should you do
> > > > something in the run loop. Of course, this only applies to VHE and
> > > > non-VHE must switch eagerly.
> > > >
> > > 
> > > On VHE where SPE is used in the guest only - we save/restore in
> > > vcpu_load/put.
> > > 
> > > On VHE where SPE is used in the host only - we save/restore in the
> > > run loop.
> > > 
> > > On VHE where SPE is used in guest and host - we save/restore in the
> > > run loop.
> > > 
> > > As the guest can't trace EL2 it doesn't matter if we restore guest
> > > SPE early
> > > in the vcpu_load/put functions. (I assume it doesn't matter that we
> > > restore
> > > an EL0/EL1 profiling buffer address at this point and enable tracing
> > > given
> > > that there is nothing to trace until entering the guest).
> > > 
> > > However the reason for moving save/restore to vcpu_load/put when the
> > > host is
> > > using SPE is to minimise the host EL2 black-out window.
> > > 
> > > 
> > > On nVHE we always save/restore in the run loop. For the SPE
> > > guest-use-only
> > > use-case we can't save/restore in vcpu_load/put - because the guest
> > > runs at
> > > the same ELx level as the host - and thus doing so would result in
> > > the guest
> > > tracing part of the host.
> > > 
> > > Though if we determine that (for nVHE systems) the guest SPE is
> > > profiling only
> > > EL0 - then we could also save/restore in vcpu_load/put where SPE is
> > > only being
> > > used in the guest.
> > > 
> > > Does that make sense, are my reasons correct?
> > 
> > Also I'm making the following assumptions:
> > 
> >  - We determine if the host or guest are using SPE by seeing if
> > profiling
> >    (e.g. PMSCR_EL1) is enabled. That should determine *when* we restore
> > as per
> >    my previous email.
> 
> Yes.
> 
> >  - I'm less sure on this: We should determine *what* we restore based on
> > the
> >    availability of the SPE feature and not if it is being used - so for
> > guest
> >    this is if the guest has the feature on the vcpu. For host this is
> > based on
> >    the CPU feature registers.
> 
> As long as the guest's feature is conditionned on the HW being present *and*
> that you're running on a CPU that has the HW.

Yes that makes sense.


> 
> >    The downshot of this is that if you have SPE support present on guest
> > and
> >    host and they aren't being used, then you still save/restore upon
> > entering/
> >    leaving a guest. The reason I feel this is needed is to prevent the
> > issue
> >    where the host starts programming the SPE registers, but is preempted
> > by
> >    KVM entering a guest, before it could enable host SPE. Thus when we
> > enter the
> >    guest we don't save all the registers, we return to the host and the
> > host
> >    SPE carries on from where it left of and enables it - yet because we
> > didn't
> >    restore all the programmed registers it doesn't work.
> 
> Saving the host registers is never optional if they are shared with the
> guest.

That makes me feel better :)

Thanks,

Andrew Murray

> 
>         M.
> -- 
> Jazz is not dead. It just smells funny...
Marc Zyngier Jan. 10, 2020, 1:34 p.m. UTC | #18
On 2020-01-10 12:12, Andrew Murray wrote:
> On Fri, Jan 10, 2020 at 11:18:48AM +0000, Marc Zyngier wrote:
>> On 2020-01-10 10:54, Andrew Murray wrote:
>> > On Sat, Dec 21, 2019 at 02:13:25PM +0000, Marc Zyngier wrote:
>> > > On Fri, 20 Dec 2019 14:30:16 +0000
>> > > Andrew Murray <andrew.murray@arm.com> wrote:
>> > >
>> > > [somehow managed not to do a reply all, re-sending]
>> > >
>> > > > From: Sudeep Holla <sudeep.holla@arm.com>
>> > > >
>> > > > Now that we can save/restore the full SPE controls, we can enable it
>> > > > if SPE is setup and ready to use in KVM. It's supported in KVM only if
>> > > > all the CPUs in the system supports SPE.
>> > > >
>> > > > However to support heterogenous systems, we need to move the check if
>> > > > host supports SPE and do a partial save/restore.
>> > >
>> > > No. Let's just not go down that path. For now, KVM on heterogeneous
>> > > systems do not get SPE. If SPE has been enabled on a guest and a CPU
>> > > comes up without SPE, this CPU should fail to boot (same as exposing a
>> > > feature to userspace).
>> > >
>> > > >
>> > > > Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
>> > > > Signed-off-by: Andrew Murray <andrew.murray@arm.com>
>> > > > ---
>> > > >  arch/arm64/kvm/hyp/debug-sr.c | 33 ++++++++++++++++-----------------
>> > > >  include/kvm/arm_spe.h         |  6 ++++++
>> > > >  2 files changed, 22 insertions(+), 17 deletions(-)
>> > > >
>> > > > diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c
>> > > > index 12429b212a3a..d8d857067e6d 100644
>> > > > --- a/arch/arm64/kvm/hyp/debug-sr.c
>> > > > +++ b/arch/arm64/kvm/hyp/debug-sr.c
>> > > > @@ -86,18 +86,13 @@
>> > > >  	}
>> > > >
>> > > >  static void __hyp_text
>> > > > -__debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
>> > > > +__debug_save_spe_context(struct kvm_cpu_context *ctxt, bool full_ctxt)
>> > > >  {
>> > > >  	u64 reg;
>> > > >
>> > > >  	/* Clear pmscr in case of early return */
>> > > >  	ctxt->sys_regs[PMSCR_EL1] = 0;
>> > > >
>> > > > -	/* SPE present on this CPU? */
>> > > > -	if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
>> > > > -						  ID_AA64DFR0_PMSVER_SHIFT))
>> > > > -		return;
>> > > > -
>> > > >  	/* Yes; is it owned by higher EL? */
>> > > >  	reg = read_sysreg_s(SYS_PMBIDR_EL1);
>> > > >  	if (reg & BIT(SYS_PMBIDR_EL1_P_SHIFT))
>> > > > @@ -142,7 +137,7 @@ __debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
>> > > >  }
>> > > >
>> > > >  static void __hyp_text
>> > > > -__debug_restore_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
>> > > > +__debug_restore_spe_context(struct kvm_cpu_context *ctxt, bool full_ctxt)
>> > > >  {
>> > > >  	if (!ctxt->sys_regs[PMSCR_EL1])
>> > > >  		return;
>> > > > @@ -210,11 +205,14 @@ void __hyp_text __debug_restore_guest_context(struct kvm_vcpu *vcpu)
>> > > >  	struct kvm_guest_debug_arch *host_dbg;
>> > > >  	struct kvm_guest_debug_arch *guest_dbg;
>> > > >
>> > > > +	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
>> > > > +	guest_ctxt = &vcpu->arch.ctxt;
>> > > > +
>> > > > +	__debug_restore_spe_context(guest_ctxt, kvm_arm_spe_v1_ready(vcpu));
>> > > > +
>> > > >  	if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
>> > > >  		return;
>> > > >
>> > > > -	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
>> > > > -	guest_ctxt = &vcpu->arch.ctxt;
>> > > >  	host_dbg = &vcpu->arch.host_debug_state.regs;
>> > > >  	guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
>> > > >
>> > > > @@ -232,8 +230,7 @@ void __hyp_text __debug_restore_host_context(struct kvm_vcpu *vcpu)
>> > > >  	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
>> > > >  	guest_ctxt = &vcpu->arch.ctxt;
>> > > >
>> > > > -	if (!has_vhe())
>> > > > -		__debug_restore_spe_nvhe(host_ctxt, false);
>> > > > +	__debug_restore_spe_context(host_ctxt, kvm_arm_spe_v1_ready(vcpu));
>> > >
>> > > So you now do an unconditional save/restore on the exit path for VHE
>> > > as
>> > > well? Even if the host isn't using the SPE HW? That's not acceptable
>> > > as, in most cases, only the host /or/ the guest will use SPE. Here,
>> > > you
>> > > put a measurable overhead on each exit.
>> > >
>> > > If the host is not using SPE, then the restore/save should happen in
>> > > vcpu_load/vcpu_put. Only if the host is using SPE should you do
>> > > something in the run loop. Of course, this only applies to VHE and
>> > > non-VHE must switch eagerly.
>> > >
>> >
>> > On VHE where SPE is used in the guest only - we save/restore in
>> > vcpu_load/put.
>> 
>> Yes.
>> 
>> > On VHE where SPE is used in the host only - we save/restore in the run
>> > loop.
>> 
>> Why? If only the host is using SPE, why should we do *anything at 
>> all*?
> 
> Oh yeah of course, we trap them in this case.
> 
> (Do I understand correctly that we don't/can't trap them for nVHE? - 
> and so
> we should save/restore them for this use-case in nVHE)

We can always trap. Otherwise we wouldn't be able to hide the feature
from the guest.
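
For reference, a minimal sketch of what that could look like, assuming the
MDCR_EL2_TPMS definition from asm/kvm_arm.h and the kvm_arm_spe_v1_ready()
macro introduced by this patch:

/* Trap all SPE control register accesses when the vCPU has no SPE,
 * so the feature stays hidden from the guest. */
static u64 kvm_spe_mdcr_trap_bits(struct kvm_vcpu *vcpu)
{
	if (!kvm_arm_spe_v1_ready(vcpu))
		return MDCR_EL2_TPMS;	/* PMS* accesses trap to EL2 */

	return 0;			/* guest owns SPE, no trapping */
}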

>> > On VHE where SPE is used in guest and host - we save/restore in the run
>> > loop.
>> >
>> > As the guest can't trace EL2 it doesn't matter if we restore guest SPE
>> > early
>> > in the vcpu_load/put functions. (I assume it doesn't matter that we
>> > restore
>> > an EL0/EL1 profiling buffer address at this point and enable tracing
>> > given
>> > that there is nothing to trace until entering the guest).
>> 
>> As long as you do it after the EL1 sysregs have need restored so that 
>> the
>> SPE
>> HW has a valid context, we should be fine. Don't restore it before 
>> that
>> point
>> though (you have no idea whether the SPE HW can do speculative memory
>> accesses
>> that would use the wrong page tables).
> 
> Right, so don't enable tracing until SPE has a valid context. I 
> understand
> that to mean at least the SPE buffer address registers (PMBPTR, 
> PMBLIMITR)
> in the right context with respect to the E2PB bits (translation regime)
> and having those tables mapped in (which I think relate to the 
> __activateX,
> __sysreg_restore_guest_stateX type of calls in kvm_vcpu_run_X right?).

The full MM context has to be in place before you can do anything. This means
at least TTBR*_EL1, TCR_EL1 and co. But maybe this note in the SPE
architecture document would allow us to relax things:

"The Statistical Profiling Extension is always disabled if the owning
Exception level is a lower Exception level than the current Exception level."

So as long as you restore the guest state from EL2, SPE should be disabled.

> I think that means we can restore the registers no earler than 
> vcpu_load/put
> but we can't re-enable the tracing (PMSCR) until no earlier than just 
> before
> __set_guest_arch_workaround_state. I think that applies to both VHE and 
> nVHE?

I'm sorry, but I don't understand what you mean.

         M.
diff mbox series

Patch

diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c
index 12429b212a3a..d8d857067e6d 100644
--- a/arch/arm64/kvm/hyp/debug-sr.c
+++ b/arch/arm64/kvm/hyp/debug-sr.c
@@ -86,18 +86,13 @@ 
 	}
 
 static void __hyp_text
-__debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
+__debug_save_spe_context(struct kvm_cpu_context *ctxt, bool full_ctxt)
 {
 	u64 reg;
 
 	/* Clear pmscr in case of early return */
 	ctxt->sys_regs[PMSCR_EL1] = 0;
 
-	/* SPE present on this CPU? */
-	if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
-						  ID_AA64DFR0_PMSVER_SHIFT))
-		return;
-
 	/* Yes; is it owned by higher EL? */
 	reg = read_sysreg_s(SYS_PMBIDR_EL1);
 	if (reg & BIT(SYS_PMBIDR_EL1_P_SHIFT))
@@ -142,7 +137,7 @@  __debug_save_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
 }
 
 static void __hyp_text
-__debug_restore_spe_nvhe(struct kvm_cpu_context *ctxt, bool full_ctxt)
+__debug_restore_spe_context(struct kvm_cpu_context *ctxt, bool full_ctxt)
 {
 	if (!ctxt->sys_regs[PMSCR_EL1])
 		return;
@@ -210,11 +205,14 @@  void __hyp_text __debug_restore_guest_context(struct kvm_vcpu *vcpu)
 	struct kvm_guest_debug_arch *host_dbg;
 	struct kvm_guest_debug_arch *guest_dbg;
 
+	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
+	guest_ctxt = &vcpu->arch.ctxt;
+
+	__debug_restore_spe_context(guest_ctxt, kvm_arm_spe_v1_ready(vcpu));
+
 	if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
 		return;
 
-	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
-	guest_ctxt = &vcpu->arch.ctxt;
 	host_dbg = &vcpu->arch.host_debug_state.regs;
 	guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
 
@@ -232,8 +230,7 @@  void __hyp_text __debug_restore_host_context(struct kvm_vcpu *vcpu)
 	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
 	guest_ctxt = &vcpu->arch.ctxt;
 
-	if (!has_vhe())
-		__debug_restore_spe_nvhe(host_ctxt, false);
+	__debug_restore_spe_context(host_ctxt, kvm_arm_spe_v1_ready(vcpu));
 
 	if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
 		return;
@@ -249,19 +246,21 @@  void __hyp_text __debug_restore_host_context(struct kvm_vcpu *vcpu)
 
 void __hyp_text __debug_save_host_context(struct kvm_vcpu *vcpu)
 {
-	/*
-	 * Non-VHE: Disable and flush SPE data generation
-	 * VHE: The vcpu can run, but it can't hide.
-	 */
 	struct kvm_cpu_context *host_ctxt;
 
 	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
-	if (!has_vhe())
-		__debug_save_spe_nvhe(host_ctxt, false);
+	if (cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
+						 ID_AA64DFR0_PMSVER_SHIFT))
+		__debug_save_spe_context(host_ctxt, kvm_arm_spe_v1_ready(vcpu));
 }
 
 void __hyp_text __debug_save_guest_context(struct kvm_vcpu *vcpu)
 {
+	bool kvm_spe_ready = kvm_arm_spe_v1_ready(vcpu);
+
+	/* SPE present on this vCPU? */
+	if (kvm_spe_ready)
+		__debug_save_spe_context(&vcpu->arch.ctxt, kvm_spe_ready);
 }
 
 u32 __hyp_text __kvm_get_mdcr_el2(void)
diff --git a/include/kvm/arm_spe.h b/include/kvm/arm_spe.h
index 48d118fdb174..30c40b1bc385 100644
--- a/include/kvm/arm_spe.h
+++ b/include/kvm/arm_spe.h
@@ -16,4 +16,10 @@  struct kvm_spe {
 	bool irq_level;
 };
 
+#ifdef CONFIG_KVM_ARM_SPE
+#define kvm_arm_spe_v1_ready(v)		((v)->arch.spe.ready)
+#else
+#define kvm_arm_spe_v1_ready(v)		(false)
+#endif /* CONFIG_KVM_ARM_SPE */
+
 #endif /* __ASM_ARM_KVM_SPE_H */