
[08/37] KVM: arm64: Defer restoring host VFP state to vcpu_put

Message ID 20171012104141.26902-9-christoffer.dall@linaro.org (mailing list archive)
State New, archived

Commit Message

Christoffer Dall Oct. 12, 2017, 10:41 a.m. UTC
Avoid saving the guest VFP registers and restoring the host VFP
registers on every exit from the VM.  Only when we're about to run
userspace or other threads in the kernel do we really have to switch the
state back to the host state.

We still initially configure the VFP registers to trap when entering the
VM, but the difference is that we now leave the guest state in the
hardware registers while running the VM.

Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm64/include/asm/kvm_emulate.h |  5 ++++
 arch/arm64/include/asm/kvm_host.h    |  3 +++
 arch/arm64/kernel/asm-offsets.c      |  1 +
 arch/arm64/kvm/hyp/entry.S           |  3 +++
 arch/arm64/kvm/hyp/switch.c          | 47 +++++++++++-------------------------
 arch/arm64/kvm/hyp/sysreg-sr.c       | 21 +++++++++++++---
 6 files changed, 44 insertions(+), 36 deletions(-)
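
In short, the flow after this patch (summarized from the hunks below; this
is a description, not new code):

  vcpu enters the VM        -> FP/SIMD traps armed unless guest_vfp_loaded
  guest touches FP/SIMD     -> trap to hyp, __fpsimd_restore_state(guest),
                               guest_vfp_loaded set to 1
  subsequent exits/entries  -> guest FP/SIMD state stays live in hardware
  kvm_vcpu_put_sysregs()    -> save guest FP/SIMD, restore host FP/SIMD,
                               guest_vfp_loaded cleared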

Comments

Andrew Jones Nov. 7, 2017, 1:15 p.m. UTC | #1
On Thu, Oct 12, 2017 at 12:41:12PM +0200, Christoffer Dall wrote:
> Avoid saving the guest VFP registers and restoring the host VFP
> registers on every exit from the VM.  Only when we're about to run
> userspace or other threads in the kernel do we really have to switch the
> state back to the host state.
> 
> We still initially configure the VFP registers to trap when entering the
> VM, but the difference is that we now leave the guest state in the
> hardware registers while running the VM.

running the host.

> 
> Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
> ---
>  arch/arm64/include/asm/kvm_emulate.h |  5 ++++
>  arch/arm64/include/asm/kvm_host.h    |  3 +++
>  arch/arm64/kernel/asm-offsets.c      |  1 +
>  arch/arm64/kvm/hyp/entry.S           |  3 +++
>  arch/arm64/kvm/hyp/switch.c          | 47 +++++++++++-------------------------
>  arch/arm64/kvm/hyp/sysreg-sr.c       | 21 +++++++++++++---
>  6 files changed, 44 insertions(+), 36 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
> index 1fbfe96..630dd60 100644
> --- a/arch/arm64/include/asm/kvm_emulate.h
> +++ b/arch/arm64/include/asm/kvm_emulate.h
> @@ -56,6 +56,11 @@ static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
>  	return (unsigned long *)&vcpu->arch.hcr_el2;
>  }
>  
> +static inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
> +{
> +	return (!(vcpu->arch.hcr_el2 & HCR_RW));

nit: no need for the outer ().

> +}
> +
>  static inline unsigned long *vcpu_pc(const struct kvm_vcpu *vcpu)
>  {
>  	return (unsigned long *)&vcpu_gp_regs(vcpu)->regs.pc;
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 7d3bfa7..5e09eb9 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -210,6 +210,9 @@ struct kvm_vcpu_arch {
>  	/* Guest debug state */
>  	u64 debug_flags;
>  
> +	/* 1 if the guest VFP state is loaded into the hardware */
> +	u64 guest_vfp_loaded;
> +

Is there a chance we'll want other flags like this? Should we just make
this a lazy state flags field with the (currently only) flag VFP? If not,
then a bool would be nicer, although I see below the u64 was chosen in
order for the 'str' to be used.
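
A minimal sketch of that direction (the flag and field names here are
hypothetical, not from this series) could be:

	/* lazy save/restore state flags */
	#define KVM_ARM64_GUEST_FP_LOADED	(1 << 0)

	/* in struct kvm_vcpu_arch, replacing guest_vfp_loaded: */
	u64 lazy_state_flags;

with the checks below becoming
'vcpu->arch.lazy_state_flags & KVM_ARM64_GUEST_FP_LOADED'.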

>  	/*
>  	 * We maintain more than a single set of debug registers to support
>  	 * debugging the guest from the host and to maintain separate host and
> diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
> index 612021d..9946732 100644
> --- a/arch/arm64/kernel/asm-offsets.c
> +++ b/arch/arm64/kernel/asm-offsets.c
> @@ -133,6 +133,7 @@ int main(void)
>    DEFINE(CPU_GP_REGS,		offsetof(struct kvm_cpu_context, gp_regs));
>    DEFINE(CPU_USER_PT_REGS,	offsetof(struct kvm_regs, regs));
>    DEFINE(CPU_FP_REGS,		offsetof(struct kvm_regs, fp_regs));
> +  DEFINE(VCPU_GUEST_VFP_LOADED,	offsetof(struct kvm_vcpu, arch.guest_vfp_loaded));
>    DEFINE(VCPU_FPEXC32_EL2,	offsetof(struct kvm_vcpu, arch.ctxt.sys_regs[FPEXC32_EL2]));
>    DEFINE(VCPU_HOST_CONTEXT,	offsetof(struct kvm_vcpu, arch.host_cpu_context));
>    DEFINE(HOST_CONTEXT_VCPU,	offsetof(struct kvm_cpu_context, __hyp_running_vcpu));
> diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
> index 76cd48f..b3e7191 100644
> --- a/arch/arm64/kvm/hyp/entry.S
> +++ b/arch/arm64/kvm/hyp/entry.S
> @@ -185,6 +185,9 @@ alternative_endif
>  	add	x0, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
>  	bl	__fpsimd_restore_state
>  
> +	mov	x0, #1
> +	str	x0, [x3, #VCPU_GUEST_VFP_LOADED]
> +
>  	// Skip restoring fpexc32 for AArch64 guests
>  	mrs	x1, hcr_el2
>  	tbnz	x1, #HCR_RW_SHIFT, 1f
> diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
> index 7703d63..ef05c59 100644
> --- a/arch/arm64/kvm/hyp/switch.c
> +++ b/arch/arm64/kvm/hyp/switch.c
> @@ -23,43 +23,31 @@
>  #include <asm/kvm_hyp.h>
>  #include <asm/fpsimd.h>
>  
> -static bool __hyp_text __fpsimd_enabled_nvhe(void)
> -{
> -	return !(read_sysreg(cptr_el2) & CPTR_EL2_TFP);
> -}
> -
> -static bool __hyp_text __fpsimd_enabled_vhe(void)
> -{
> -	return !!(read_sysreg(cpacr_el1) & CPACR_EL1_FPEN);
> -}
> -
> -static hyp_alternate_select(__fpsimd_is_enabled,
> -			    __fpsimd_enabled_nvhe, __fpsimd_enabled_vhe,
> -			    ARM64_HAS_VIRT_HOST_EXTN);
> -
> -bool __hyp_text __fpsimd_enabled(void)
> -{
> -	return __fpsimd_is_enabled()();
> -}
> -
> -static void __hyp_text __activate_traps_vhe(void)
> +static void __hyp_text __activate_traps_vhe(struct kvm_vcpu *vcpu)
>  {
>  	u64 val;
>  
>  	val = read_sysreg(cpacr_el1);
>  	val |= CPACR_EL1_TTA;
> -	val &= ~CPACR_EL1_FPEN;
> +	if (vcpu->arch.guest_vfp_loaded)
> +		val |= CPACR_EL1_FPEN;
> +	else
> +		val &= ~CPACR_EL1_FPEN;
>  	write_sysreg(val, cpacr_el1);
>  
>  	write_sysreg(__kvm_hyp_vector, vbar_el1);
>  }
>  
> -static void __hyp_text __activate_traps_nvhe(void)
> +static void __hyp_text __activate_traps_nvhe(struct kvm_vcpu *vcpu)
>  {
>  	u64 val;
>  
>  	val = CPTR_EL2_DEFAULT;
> -	val |= CPTR_EL2_TTA | CPTR_EL2_TFP;
> +	val |= CPTR_EL2_TTA;
> +	if (vcpu->arch.guest_vfp_loaded)
> +		val &= ~CPTR_EL2_TFP;
> +	else
> +		val |= CPTR_EL2_TFP;
>  	write_sysreg(val, cptr_el2);
>  }
>  
> @@ -81,7 +69,8 @@ static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu)
>  	 * it will cause an exception.
>  	 */
>  	val = vcpu->arch.hcr_el2;
> -	if (!(val & HCR_RW) && system_supports_fpsimd()) {
> +	if (vcpu_el1_is_32bit(vcpu) && system_supports_fpsimd() &&
> +	    !vcpu->arch.guest_vfp_loaded) {
>  		write_sysreg(1 << 30, fpexc32_el2);
>  		isb();
>  	}
> @@ -97,7 +86,7 @@ static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu)
>  	write_sysreg(0, pmselr_el0);
>  	write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0);
>  	write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
> -	__activate_traps_arch()();
> +	__activate_traps_arch()(vcpu);
>  }
>  
>  static void __hyp_text __deactivate_traps_vhe(void)
> @@ -273,7 +262,6 @@ int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu)
>  {
>  	struct kvm_cpu_context *host_ctxt;
>  	struct kvm_cpu_context *guest_ctxt;
> -	bool fp_enabled;
>  	u64 exit_code;
>  
>  	vcpu = kern_hyp_va(vcpu);
> @@ -355,8 +343,6 @@ int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu)
>  		/* 0 falls through to be handled out of EL2 */
>  	}
>  
> -	fp_enabled = __fpsimd_enabled();
> -
>  	__sysreg_save_guest_state(guest_ctxt);
>  	__sysreg32_save_state(vcpu);
>  	__timer_disable_traps(vcpu);
> @@ -367,11 +353,6 @@ int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu)
>  
>  	__sysreg_restore_host_state(host_ctxt);
>  
> -	if (fp_enabled) {
> -		__fpsimd_save_state(&guest_ctxt->gp_regs.fp_regs);
> -		__fpsimd_restore_state(&host_ctxt->gp_regs.fp_regs);
> -	}
> -
>  	__debug_save_state(vcpu, kern_hyp_va(vcpu->arch.debug_ptr), guest_ctxt);
>  	/*
>  	 * This must come after restoring the host sysregs, since a non-VHE
> diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c
> index b7438c8..c4a3714 100644
> --- a/arch/arm64/kvm/hyp/sysreg-sr.c
> +++ b/arch/arm64/kvm/hyp/sysreg-sr.c
> @@ -19,6 +19,7 @@
>  #include <linux/kvm_host.h>
>  
>  #include <asm/kvm_asm.h>
> +#include <asm/kvm_emulate.h>
>  #include <asm/kvm_hyp.h>
>  
>  /* Yes, this does nothing, on purpose */
> @@ -137,6 +138,11 @@ void __hyp_text __sysreg_restore_guest_state(struct kvm_cpu_context *ctxt)
>  	__sysreg_restore_common_state(ctxt);
>  }
>  
> +static void __hyp_text __fpsimd32_save_state(struct kvm_cpu_context *ctxt)
> +{
> +	ctxt->sys_regs[FPEXC32_EL2] = read_sysreg(fpexc32_el2);
> +}
> +
>  void __hyp_text __sysreg32_save_state(struct kvm_vcpu *vcpu)
>  {
>  	u64 *spsr, *sysreg;
> @@ -155,9 +161,6 @@ void __hyp_text __sysreg32_save_state(struct kvm_vcpu *vcpu)
>  	sysreg[DACR32_EL2] = read_sysreg(dacr32_el2);
>  	sysreg[IFSR32_EL2] = read_sysreg(ifsr32_el2);
>  
> -	if (__fpsimd_enabled())
> -		sysreg[FPEXC32_EL2] = read_sysreg(fpexc32_el2);
> -
>  	if (vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY)
>  		sysreg[DBGVCR32_EL2] = read_sysreg(dbgvcr32_el2);
>  }
> @@ -209,4 +212,16 @@ void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu)
>   */
>  void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu)
>  {
> +	struct kvm_cpu_context *host_ctxt = vcpu->arch.host_cpu_context;
> +	struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
> +
> +	/* Restore host FP/SIMD state */
> +	if (vcpu->arch.guest_vfp_loaded) {
> +		if (vcpu_el1_is_32bit(vcpu))
> +			kvm_call_hyp(__fpsimd32_save_state,
> +				     kern_hyp_va(guest_ctxt));

nit: might be nice to use {} since we need two lines.
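
That is:

		if (vcpu_el1_is_32bit(vcpu)) {
			kvm_call_hyp(__fpsimd32_save_state,
				     kern_hyp_va(guest_ctxt));
		}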

> +		__fpsimd_save_state(&guest_ctxt->gp_regs.fp_regs);
> +		__fpsimd_restore_state(&host_ctxt->gp_regs.fp_regs);
> +		vcpu->arch.guest_vfp_loaded = 0;
> +	}
>  }
> -- 
> 2.9.0
>

Otherwise,

Reviewed-by: Andrew Jones <drjones@redhat.com>
Andrew Jones Nov. 15, 2017, 4:04 p.m. UTC | #2
On Thu, Oct 12, 2017 at 12:41:12PM +0200, Christoffer Dall wrote:
> Avoid saving the guest VFP registers and restoring the host VFP
> registers on every exit from the VM.  Only when we're about to run
> userspace or other threads in the kernel do we really have to switch the
> state back to the host state.

Rik van Riel's recently posted patch "[PATCH v2 0/2] x86,kvm: move qemu/guest
FPU switching out to kvm_arch_vcpu_ioctl_run" indicates that for x86 they
only need to swap guest and userspace VFP registers before exiting VCPU_RUN
to userspace, not for running other threads. I imagine that's the same for
ARM as well.

If so, then I think this hunk

> @@ -209,4 +212,16 @@ void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu)
>   */
>  void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu)
>  {
> +	struct kvm_cpu_context *host_ctxt = vcpu->arch.host_cpu_context;
> +	struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
> +
> +	/* Restore host FP/SIMD state */
> +	if (vcpu->arch.guest_vfp_loaded) {
> +		if (vcpu_el1_is_32bit(vcpu))
> +			kvm_call_hyp(__fpsimd32_save_state,
> +				     kern_hyp_va(guest_ctxt));
> +		__fpsimd_save_state(&guest_ctxt->gp_regs.fp_regs);
> +		__fpsimd_restore_state(&host_ctxt->gp_regs.fp_regs);
> +		vcpu->arch.guest_vfp_loaded = 0;
> +	}
>  }

could be moved to the return of kvm_arch_vcpu_ioctl_run().
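
A sketch of what that could look like, reusing the exact logic from the
hunk above (the helper name kvm_vcpu_put_fp() is hypothetical):

	/* Called once from the tail of kvm_arch_vcpu_ioctl_run(),
	 * instead of doing this work in kvm_vcpu_put_sysregs(). */
	static void kvm_vcpu_put_fp(struct kvm_vcpu *vcpu)
	{
		struct kvm_cpu_context *host_ctxt = vcpu->arch.host_cpu_context;
		struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;

		if (!vcpu->arch.guest_vfp_loaded)
			return;

		if (vcpu_el1_is_32bit(vcpu))
			kvm_call_hyp(__fpsimd32_save_state,
				     kern_hyp_va(guest_ctxt));
		__fpsimd_save_state(&guest_ctxt->gp_regs.fp_regs);
		__fpsimd_restore_state(&host_ctxt->gp_regs.fp_regs);
		vcpu->arch.guest_vfp_loaded = 0;
	}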

Thanks,
drew
Yury Norov Nov. 25, 2017, 7:52 a.m. UTC | #3
Hi Christoffer,

On Thu, Oct 12, 2017 at 12:41:12PM +0200, Christoffer Dall wrote:
> Avoid saving the guest VFP registers and restoring the host VFP
> registers on every exit from the VM.  Only when we're about to run
> userspace or other threads in the kernel do we really have to switch the
> state back to the host state.
> 
> We still initially configure the VFP registers to trap when entering the
> VM, but the difference is that we now leave the guest state in the
> hardware registers while running the VM.
> 
> Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
> ---
>  arch/arm64/include/asm/kvm_emulate.h |  5 ++++
>  arch/arm64/include/asm/kvm_host.h    |  3 +++
>  arch/arm64/kernel/asm-offsets.c      |  1 +
>  arch/arm64/kvm/hyp/entry.S           |  3 +++
>  arch/arm64/kvm/hyp/switch.c          | 47 +++++++++++-------------------------
>  arch/arm64/kvm/hyp/sysreg-sr.c       | 21 +++++++++++++---
>  6 files changed, 44 insertions(+), 36 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
> index 1fbfe96..630dd60 100644
> --- a/arch/arm64/include/asm/kvm_emulate.h
> +++ b/arch/arm64/include/asm/kvm_emulate.h
> @@ -56,6 +56,11 @@ static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
>  	return (unsigned long *)&vcpu->arch.hcr_el2;
>  }
>  
> +static inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
> +{
> +	return (!(vcpu->arch.hcr_el2 & HCR_RW));
> +}
> +
>  static inline unsigned long *vcpu_pc(const struct kvm_vcpu *vcpu)
>  {
>  	return (unsigned long *)&vcpu_gp_regs(vcpu)->regs.pc;
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 7d3bfa7..5e09eb9 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -210,6 +210,9 @@ struct kvm_vcpu_arch {
>  	/* Guest debug state */
>  	u64 debug_flags;
>  
> +	/* 1 if the guest VFP state is loaded into the hardware */
> +	u64 guest_vfp_loaded;

May it be just u8/bool?

Yury

> +
>  	/*
>  	 * We maintain more than a single set of debug registers to support
>  	 * debugging the guest from the host and to maintain separate host and
> diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
> index 612021d..9946732 100644
> --- a/arch/arm64/kernel/asm-offsets.c
> +++ b/arch/arm64/kernel/asm-offsets.c
> @@ -133,6 +133,7 @@ int main(void)
>    DEFINE(CPU_GP_REGS,		offsetof(struct kvm_cpu_context, gp_regs));
>    DEFINE(CPU_USER_PT_REGS,	offsetof(struct kvm_regs, regs));
>    DEFINE(CPU_FP_REGS,		offsetof(struct kvm_regs, fp_regs));
> +  DEFINE(VCPU_GUEST_VFP_LOADED,	offsetof(struct kvm_vcpu, arch.guest_vfp_loaded));
>    DEFINE(VCPU_FPEXC32_EL2,	offsetof(struct kvm_vcpu, arch.ctxt.sys_regs[FPEXC32_EL2]));
>    DEFINE(VCPU_HOST_CONTEXT,	offsetof(struct kvm_vcpu, arch.host_cpu_context));
>    DEFINE(HOST_CONTEXT_VCPU,	offsetof(struct kvm_cpu_context, __hyp_running_vcpu));
> diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
> index 76cd48f..b3e7191 100644
> --- a/arch/arm64/kvm/hyp/entry.S
> +++ b/arch/arm64/kvm/hyp/entry.S
> @@ -185,6 +185,9 @@ alternative_endif
>  	add	x0, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
>  	bl	__fpsimd_restore_state
>  
> +	mov	x0, #1
> +	str	x0, [x3, #VCPU_GUEST_VFP_LOADED]
> +
>  	// Skip restoring fpexc32 for AArch64 guests
>  	mrs	x1, hcr_el2
>  	tbnz	x1, #HCR_RW_SHIFT, 1f
> diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
> index 7703d63..ef05c59 100644
> --- a/arch/arm64/kvm/hyp/switch.c
> +++ b/arch/arm64/kvm/hyp/switch.c
> @@ -23,43 +23,31 @@
>  #include <asm/kvm_hyp.h>
>  #include <asm/fpsimd.h>
>  
> -static bool __hyp_text __fpsimd_enabled_nvhe(void)
> -{
> -	return !(read_sysreg(cptr_el2) & CPTR_EL2_TFP);
> -}
> -
> -static bool __hyp_text __fpsimd_enabled_vhe(void)
> -{
> -	return !!(read_sysreg(cpacr_el1) & CPACR_EL1_FPEN);
> -}
> -
> -static hyp_alternate_select(__fpsimd_is_enabled,
> -			    __fpsimd_enabled_nvhe, __fpsimd_enabled_vhe,
> -			    ARM64_HAS_VIRT_HOST_EXTN);
> -
> -bool __hyp_text __fpsimd_enabled(void)
> -{
> -	return __fpsimd_is_enabled()();
> -}
> -
> -static void __hyp_text __activate_traps_vhe(void)
> +static void __hyp_text __activate_traps_vhe(struct kvm_vcpu *vcpu)
>  {
>  	u64 val;
>  
>  	val = read_sysreg(cpacr_el1);
>  	val |= CPACR_EL1_TTA;
> -	val &= ~CPACR_EL1_FPEN;
> +	if (vcpu->arch.guest_vfp_loaded)
> +		val |= CPACR_EL1_FPEN;
> +	else
> +		val &= ~CPACR_EL1_FPEN;
>  	write_sysreg(val, cpacr_el1);
>  
>  	write_sysreg(__kvm_hyp_vector, vbar_el1);
>  }
>  
> -static void __hyp_text __activate_traps_nvhe(void)
> +static void __hyp_text __activate_traps_nvhe(struct kvm_vcpu *vcpu)
>  {
>  	u64 val;
>  
>  	val = CPTR_EL2_DEFAULT;
> -	val |= CPTR_EL2_TTA | CPTR_EL2_TFP;
> +	val |= CPTR_EL2_TTA;
> +	if (vcpu->arch.guest_vfp_loaded)
> +		val &= ~CPTR_EL2_TFP;
> +	else
> +		val |= CPTR_EL2_TFP;
>  	write_sysreg(val, cptr_el2);
>  }
>  
> @@ -81,7 +69,8 @@ static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu)
>  	 * it will cause an exception.
>  	 */
>  	val = vcpu->arch.hcr_el2;
> -	if (!(val & HCR_RW) && system_supports_fpsimd()) {
> +	if (vcpu_el1_is_32bit(vcpu) && system_supports_fpsimd() &&
> +	    !vcpu->arch.guest_vfp_loaded) {
>  		write_sysreg(1 << 30, fpexc32_el2);
>  		isb();
>  	}
> @@ -97,7 +86,7 @@ static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu)
>  	write_sysreg(0, pmselr_el0);
>  	write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0);
>  	write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
> -	__activate_traps_arch()();
> +	__activate_traps_arch()(vcpu);
>  }
>  
>  static void __hyp_text __deactivate_traps_vhe(void)
> @@ -273,7 +262,6 @@ int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu)
>  {
>  	struct kvm_cpu_context *host_ctxt;
>  	struct kvm_cpu_context *guest_ctxt;
> -	bool fp_enabled;
>  	u64 exit_code;
>  
>  	vcpu = kern_hyp_va(vcpu);
> @@ -355,8 +343,6 @@ int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu)
>  		/* 0 falls through to be handled out of EL2 */
>  	}
>  
> -	fp_enabled = __fpsimd_enabled();
> -
>  	__sysreg_save_guest_state(guest_ctxt);
>  	__sysreg32_save_state(vcpu);
>  	__timer_disable_traps(vcpu);
> @@ -367,11 +353,6 @@ int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu)
>  
>  	__sysreg_restore_host_state(host_ctxt);
>  
> -	if (fp_enabled) {
> -		__fpsimd_save_state(&guest_ctxt->gp_regs.fp_regs);
> -		__fpsimd_restore_state(&host_ctxt->gp_regs.fp_regs);
> -	}
> -
>  	__debug_save_state(vcpu, kern_hyp_va(vcpu->arch.debug_ptr), guest_ctxt);
>  	/*
>  	 * This must come after restoring the host sysregs, since a non-VHE
> diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c
> index b7438c8..c4a3714 100644
> --- a/arch/arm64/kvm/hyp/sysreg-sr.c
> +++ b/arch/arm64/kvm/hyp/sysreg-sr.c
> @@ -19,6 +19,7 @@
>  #include <linux/kvm_host.h>
>  
>  #include <asm/kvm_asm.h>
> +#include <asm/kvm_emulate.h>
>  #include <asm/kvm_hyp.h>
>  
>  /* Yes, this does nothing, on purpose */
> @@ -137,6 +138,11 @@ void __hyp_text __sysreg_restore_guest_state(struct kvm_cpu_context *ctxt)
>  	__sysreg_restore_common_state(ctxt);
>  }
>  
> +static void __hyp_text __fpsimd32_save_state(struct kvm_cpu_context *ctxt)
> +{
> +	ctxt->sys_regs[FPEXC32_EL2] = read_sysreg(fpexc32_el2);
> +}
> +
>  void __hyp_text __sysreg32_save_state(struct kvm_vcpu *vcpu)
>  {
>  	u64 *spsr, *sysreg;
> @@ -155,9 +161,6 @@ void __hyp_text __sysreg32_save_state(struct kvm_vcpu *vcpu)
>  	sysreg[DACR32_EL2] = read_sysreg(dacr32_el2);
>  	sysreg[IFSR32_EL2] = read_sysreg(ifsr32_el2);
>  
> -	if (__fpsimd_enabled())
> -		sysreg[FPEXC32_EL2] = read_sysreg(fpexc32_el2);
> -
>  	if (vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY)
>  		sysreg[DBGVCR32_EL2] = read_sysreg(dbgvcr32_el2);
>  }
> @@ -209,4 +212,16 @@ void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu)
>   */
>  void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu)
>  {
> +	struct kvm_cpu_context *host_ctxt = vcpu->arch.host_cpu_context;
> +	struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
> +
> +	/* Restore host FP/SIMD state */
> +	if (vcpu->arch.guest_vfp_loaded) {
> +		if (vcpu_el1_is_32bit(vcpu))
> +			kvm_call_hyp(__fpsimd32_save_state,
> +				     kern_hyp_va(guest_ctxt));
> +		__fpsimd_save_state(&guest_ctxt->gp_regs.fp_regs);
> +		__fpsimd_restore_state(&host_ctxt->gp_regs.fp_regs);
> +		vcpu->arch.guest_vfp_loaded = 0;
> +	}
>  }
> -- 
> 2.9.0
Christoffer Dall Nov. 26, 2017, 4:17 p.m. UTC | #4
Hi Yury,

On Sat, Nov 25, 2017 at 10:52:21AM +0300, Yury Norov wrote:
> 
> On Thu, Oct 12, 2017 at 12:41:12PM +0200, Christoffer Dall wrote:
> > Avoid saving the guest VFP registers and restoring the host VFP
> > registers on every exit from the VM.  Only when we're about to run
> > userspace or other threads in the kernel do we really have to switch the
> > state back to the host state.
> > 
> > We still initially configure the VFP registers to trap when entering the
> > VM, but the difference is that we now leave the guest state in the
> > hardware registers while running the VM.
> > 
> > Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
> > ---
> >  arch/arm64/include/asm/kvm_emulate.h |  5 ++++
> >  arch/arm64/include/asm/kvm_host.h    |  3 +++
> >  arch/arm64/kernel/asm-offsets.c      |  1 +
> >  arch/arm64/kvm/hyp/entry.S           |  3 +++
> >  arch/arm64/kvm/hyp/switch.c          | 47 +++++++++++-------------------------
> >  arch/arm64/kvm/hyp/sysreg-sr.c       | 21 +++++++++++++---
> >  6 files changed, 44 insertions(+), 36 deletions(-)
> > 
> > diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
> > index 1fbfe96..630dd60 100644
> > --- a/arch/arm64/include/asm/kvm_emulate.h
> > +++ b/arch/arm64/include/asm/kvm_emulate.h
> > @@ -56,6 +56,11 @@ static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
> >  	return (unsigned long *)&vcpu->arch.hcr_el2;
> >  }
> >  
> > +static inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
> > +{
> > +	return (!(vcpu->arch.hcr_el2 & HCR_RW));
> > +}
> > +
> >  static inline unsigned long *vcpu_pc(const struct kvm_vcpu *vcpu)
> >  {
> >  	return (unsigned long *)&vcpu_gp_regs(vcpu)->regs.pc;
> > diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> > index 7d3bfa7..5e09eb9 100644
> > --- a/arch/arm64/include/asm/kvm_host.h
> > +++ b/arch/arm64/include/asm/kvm_host.h
> > @@ -210,6 +210,9 @@ struct kvm_vcpu_arch {
> >  	/* Guest debug state */
> >  	u64 debug_flags;
> >  
> > +	/* 1 if the guest VFP state is loaded into the hardware */
> > +	u64 guest_vfp_loaded;
> 
> May it be just u8/bool?
> 
This particular field is accessed from assembly code, and I'm not sure
what guarantees the compiler makes in terms of how a u8/bool is
allocated with respect to padding and alignment, and I think that's why
we've been using u64 fields in the past.

I don't actually remember the details, but I'd rather err on the side of
caution than trying to save a few bytes.  However, if someone can
convince me there's a completely safe way to do this, then I'm happy to
change it.

Thanks,
-Christoffer
Christoffer Dall Nov. 26, 2017, 4:17 p.m. UTC | #5
Hi Drew,

On Wed, Nov 15, 2017 at 05:04:40PM +0100, Andrew Jones wrote:
> On Thu, Oct 12, 2017 at 12:41:12PM +0200, Christoffer Dall wrote:
> > Avoid saving the guest VFP registers and restoring the host VFP
> > registers on every exit from the VM.  Only when we're about to run
> > userspace or other threads in the kernel do we really have to switch the
> > state back to the host state.
> 
> Rik van Riel's recently posted patch "[PATCH v2 0/2] x86,kvm: move qemu/guest
> FPU switching out to kvm_arch_vcpu_ioctl_run" indicates that for x86 they
> only need to swap guest and userspace VFP registers before exiting VCPU_RUN
> to userspace, not for running other threads. I imagine that's the same for
> ARM as well.
> 
> If so, then I think this hunk
> 
> > @@ -209,4 +212,16 @@ void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu)
> >   */
> >  void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu)
> >  {
> > +	struct kvm_cpu_context *host_ctxt = vcpu->arch.host_cpu_context;
> > +	struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
> > +
> > +	/* Restore host FP/SIMD state */
> > +	if (vcpu->arch.guest_vfp_loaded) {
> > +		if (vcpu_el1_is_32bit(vcpu))
> > +			kvm_call_hyp(__fpsimd32_save_state,
> > +				     kern_hyp_va(guest_ctxt));
> > +		__fpsimd_save_state(&guest_ctxt->gp_regs.fp_regs);
> > +		__fpsimd_restore_state(&host_ctxt->gp_regs.fp_regs);
> > +		vcpu->arch.guest_vfp_loaded = 0;
> > +	}
> >  }
> 
> could be moved to the return of kvm_arch_vcpu_ioctl_run().
> 
That sounds cool.

I'll keep this patch as it is now, and look at Rik's patches and post a
follow-up later; does that sound ok?

Thanks,
-Christoffer
Christoffer Dall Nov. 26, 2017, 4:24 p.m. UTC | #6
On Tue, Nov 07, 2017 at 02:15:50PM +0100, Andrew Jones wrote:
> On Thu, Oct 12, 2017 at 12:41:12PM +0200, Christoffer Dall wrote:
> > Avoid saving the guest VFP registers and restoring the host VFP
> > registers on every exit from the VM.  Only when we're about to run
> > userspace or other threads in the kernel do we really have to switch the
> > state back to the host state.
> > 
> > We still initially configure the VFP registers to trap when entering the
> > VM, but the difference is that we now leave the guest state in the
> > hardware registers while running the VM.
> 
> running the host.
> 

I actually did mean the VM, but I should clarify that I mean as long as
we're running the VCPU on this physical CPU, even if we trap to the
host.

> > 
> > Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
> > ---
> >  arch/arm64/include/asm/kvm_emulate.h |  5 ++++
> >  arch/arm64/include/asm/kvm_host.h    |  3 +++
> >  arch/arm64/kernel/asm-offsets.c      |  1 +
> >  arch/arm64/kvm/hyp/entry.S           |  3 +++
> >  arch/arm64/kvm/hyp/switch.c          | 47 +++++++++++-------------------------
> >  arch/arm64/kvm/hyp/sysreg-sr.c       | 21 +++++++++++++---
> >  6 files changed, 44 insertions(+), 36 deletions(-)
> > 
> > diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
> > index 1fbfe96..630dd60 100644
> > --- a/arch/arm64/include/asm/kvm_emulate.h
> > +++ b/arch/arm64/include/asm/kvm_emulate.h
> > @@ -56,6 +56,11 @@ static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
> >  	return (unsigned long *)&vcpu->arch.hcr_el2;
> >  }
> >  
> > +static inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
> > +{
> > +	return (!(vcpu->arch.hcr_el2 & HCR_RW));
> 
> nit: no need for the outer ().
> 
> > +}
> > +
> >  static inline unsigned long *vcpu_pc(const struct kvm_vcpu *vcpu)
> >  {
> >  	return (unsigned long *)&vcpu_gp_regs(vcpu)->regs.pc;
> > diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> > index 7d3bfa7..5e09eb9 100644
> > --- a/arch/arm64/include/asm/kvm_host.h
> > +++ b/arch/arm64/include/asm/kvm_host.h
> > @@ -210,6 +210,9 @@ struct kvm_vcpu_arch {
> >  	/* Guest debug state */
> >  	u64 debug_flags;
> >  
> > +	/* 1 if the guest VFP state is loaded into the hardware */
> > +	u64 guest_vfp_loaded;
> > +
> 
> Is there a chance we'll want other flags like this? Should we just make
> this a lazy state flags field with the (currently only) flag VFP? If not,
> then a bool would be nicer, although I see below the u64 was chosen in
> order for the 'str' to be used.
> 

See my reply to Yury.  In terms of merging flags I thought about merging
it with the debug flags, but I didn't think it would look very nice, and
I couldn't come up with a name for the variable that would describe the
logic.

Honestly, I didn't care about the few extra bytes per CPU, and much prefer
clarity, but it may make sense to combine this with, for example, the
sysreg and timer state later; I'll have a look.

> >  	/*
> >  	 * We maintain more than a single set of debug registers to support
> >  	 * debugging the guest from the host and to maintain separate host and
> > diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
> > index 612021d..9946732 100644
> > --- a/arch/arm64/kernel/asm-offsets.c
> > +++ b/arch/arm64/kernel/asm-offsets.c
> > @@ -133,6 +133,7 @@ int main(void)
> >    DEFINE(CPU_GP_REGS,		offsetof(struct kvm_cpu_context, gp_regs));
> >    DEFINE(CPU_USER_PT_REGS,	offsetof(struct kvm_regs, regs));
> >    DEFINE(CPU_FP_REGS,		offsetof(struct kvm_regs, fp_regs));
> > +  DEFINE(VCPU_GUEST_VFP_LOADED,	offsetof(struct kvm_vcpu, arch.guest_vfp_loaded));
> >    DEFINE(VCPU_FPEXC32_EL2,	offsetof(struct kvm_vcpu, arch.ctxt.sys_regs[FPEXC32_EL2]));
> >    DEFINE(VCPU_HOST_CONTEXT,	offsetof(struct kvm_vcpu, arch.host_cpu_context));
> >    DEFINE(HOST_CONTEXT_VCPU,	offsetof(struct kvm_cpu_context, __hyp_running_vcpu));
> > diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
> > index 76cd48f..b3e7191 100644
> > --- a/arch/arm64/kvm/hyp/entry.S
> > +++ b/arch/arm64/kvm/hyp/entry.S
> > @@ -185,6 +185,9 @@ alternative_endif
> >  	add	x0, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
> >  	bl	__fpsimd_restore_state
> >  
> > +	mov	x0, #1
> > +	str	x0, [x3, #VCPU_GUEST_VFP_LOADED]
> > +
> >  	// Skip restoring fpexc32 for AArch64 guests
> >  	mrs	x1, hcr_el2
> >  	tbnz	x1, #HCR_RW_SHIFT, 1f
> > diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
> > index 7703d63..ef05c59 100644
> > --- a/arch/arm64/kvm/hyp/switch.c
> > +++ b/arch/arm64/kvm/hyp/switch.c
> > @@ -23,43 +23,31 @@
> >  #include <asm/kvm_hyp.h>
> >  #include <asm/fpsimd.h>
> >  
> > -static bool __hyp_text __fpsimd_enabled_nvhe(void)
> > -{
> > -	return !(read_sysreg(cptr_el2) & CPTR_EL2_TFP);
> > -}
> > -
> > -static bool __hyp_text __fpsimd_enabled_vhe(void)
> > -{
> > -	return !!(read_sysreg(cpacr_el1) & CPACR_EL1_FPEN);
> > -}
> > -
> > -static hyp_alternate_select(__fpsimd_is_enabled,
> > -			    __fpsimd_enabled_nvhe, __fpsimd_enabled_vhe,
> > -			    ARM64_HAS_VIRT_HOST_EXTN);
> > -
> > -bool __hyp_text __fpsimd_enabled(void)
> > -{
> > -	return __fpsimd_is_enabled()();
> > -}
> > -
> > -static void __hyp_text __activate_traps_vhe(void)
> > +static void __hyp_text __activate_traps_vhe(struct kvm_vcpu *vcpu)
> >  {
> >  	u64 val;
> >  
> >  	val = read_sysreg(cpacr_el1);
> >  	val |= CPACR_EL1_TTA;
> > -	val &= ~CPACR_EL1_FPEN;
> > +	if (vcpu->arch.guest_vfp_loaded)
> > +		val |= CPACR_EL1_FPEN;
> > +	else
> > +		val &= ~CPACR_EL1_FPEN;
> >  	write_sysreg(val, cpacr_el1);
> >  
> >  	write_sysreg(__kvm_hyp_vector, vbar_el1);
> >  }
> >  
> > -static void __hyp_text __activate_traps_nvhe(void)
> > +static void __hyp_text __activate_traps_nvhe(struct kvm_vcpu *vcpu)
> >  {
> >  	u64 val;
> >  
> >  	val = CPTR_EL2_DEFAULT;
> > -	val |= CPTR_EL2_TTA | CPTR_EL2_TFP;
> > +	val |= CPTR_EL2_TTA;
> > +	if (vcpu->arch.guest_vfp_loaded)
> > +		val &= ~CPTR_EL2_TFP;
> > +	else
> > +		val |= CPTR_EL2_TFP;
> >  	write_sysreg(val, cptr_el2);
> >  }
> >  
> > @@ -81,7 +69,8 @@ static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu)
> >  	 * it will cause an exception.
> >  	 */
> >  	val = vcpu->arch.hcr_el2;
> > -	if (!(val & HCR_RW) && system_supports_fpsimd()) {
> > +	if (vcpu_el1_is_32bit(vcpu) && system_supports_fpsimd() &&
> > +	    !vcpu->arch.guest_vfp_loaded) {
> >  		write_sysreg(1 << 30, fpexc32_el2);
> >  		isb();
> >  	}
> > @@ -97,7 +86,7 @@ static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu)
> >  	write_sysreg(0, pmselr_el0);
> >  	write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0);
> >  	write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
> > -	__activate_traps_arch()();
> > +	__activate_traps_arch()(vcpu);
> >  }
> >  
> >  static void __hyp_text __deactivate_traps_vhe(void)
> > @@ -273,7 +262,6 @@ int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu)
> >  {
> >  	struct kvm_cpu_context *host_ctxt;
> >  	struct kvm_cpu_context *guest_ctxt;
> > -	bool fp_enabled;
> >  	u64 exit_code;
> >  
> >  	vcpu = kern_hyp_va(vcpu);
> > @@ -355,8 +343,6 @@ int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu)
> >  		/* 0 falls through to be handled out of EL2 */
> >  	}
> >  
> > -	fp_enabled = __fpsimd_enabled();
> > -
> >  	__sysreg_save_guest_state(guest_ctxt);
> >  	__sysreg32_save_state(vcpu);
> >  	__timer_disable_traps(vcpu);
> > @@ -367,11 +353,6 @@ int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu)
> >  
> >  	__sysreg_restore_host_state(host_ctxt);
> >  
> > -	if (fp_enabled) {
> > -		__fpsimd_save_state(&guest_ctxt->gp_regs.fp_regs);
> > -		__fpsimd_restore_state(&host_ctxt->gp_regs.fp_regs);
> > -	}
> > -
> >  	__debug_save_state(vcpu, kern_hyp_va(vcpu->arch.debug_ptr), guest_ctxt);
> >  	/*
> >  	 * This must come after restoring the host sysregs, since a non-VHE
> > diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c
> > index b7438c8..c4a3714 100644
> > --- a/arch/arm64/kvm/hyp/sysreg-sr.c
> > +++ b/arch/arm64/kvm/hyp/sysreg-sr.c
> > @@ -19,6 +19,7 @@
> >  #include <linux/kvm_host.h>
> >  
> >  #include <asm/kvm_asm.h>
> > +#include <asm/kvm_emulate.h>
> >  #include <asm/kvm_hyp.h>
> >  
> >  /* Yes, this does nothing, on purpose */
> > @@ -137,6 +138,11 @@ void __hyp_text __sysreg_restore_guest_state(struct kvm_cpu_context *ctxt)
> >  	__sysreg_restore_common_state(ctxt);
> >  }
> >  
> > +static void __hyp_text __fpsimd32_save_state(struct kvm_cpu_context *ctxt)
> > +{
> > +	ctxt->sys_regs[FPEXC32_EL2] = read_sysreg(fpexc32_el2);
> > +}
> > +
> >  void __hyp_text __sysreg32_save_state(struct kvm_vcpu *vcpu)
> >  {
> >  	u64 *spsr, *sysreg;
> > @@ -155,9 +161,6 @@ void __hyp_text __sysreg32_save_state(struct kvm_vcpu *vcpu)
> >  	sysreg[DACR32_EL2] = read_sysreg(dacr32_el2);
> >  	sysreg[IFSR32_EL2] = read_sysreg(ifsr32_el2);
> >  
> > -	if (__fpsimd_enabled())
> > -		sysreg[FPEXC32_EL2] = read_sysreg(fpexc32_el2);
> > -
> >  	if (vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY)
> >  		sysreg[DBGVCR32_EL2] = read_sysreg(dbgvcr32_el2);
> >  }
> > @@ -209,4 +212,16 @@ void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu)
> >   */
> >  void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu)
> >  {
> > +	struct kvm_cpu_context *host_ctxt = vcpu->arch.host_cpu_context;
> > +	struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
> > +
> > +	/* Restore host FP/SIMD state */
> > +	if (vcpu->arch.guest_vfp_loaded) {
> > +		if (vcpu_el1_is_32bit(vcpu))
> > +			kvm_call_hyp(__fpsimd32_save_state,
> > +				     kern_hyp_va(guest_ctxt));
> 
> nit: might be nice to use {} since we need two lines.
> 

sure.

> > +		__fpsimd_save_state(&guest_ctxt->gp_regs.fp_regs);
> > +		__fpsimd_restore_state(&host_ctxt->gp_regs.fp_regs);
> > +		vcpu->arch.guest_vfp_loaded = 0;
> > +	}
> >  }
> > -- 
> > 2.9.0
> >
> 
> Otherwise,
> 
> Reviewed-by: Andrew Jones <drjones@redhat.com>

Thanks,
-Christoffer
Yury Norov Nov. 26, 2017, 6:58 p.m. UTC | #7
On Sun, Nov 26, 2017 at 05:17:16PM +0100, Christoffer Dall wrote:
> Hi Yury,
> 
> On Sat, Nov 25, 2017 at 10:52:21AM +0300, Yury Norov wrote:
> > 
> > On Thu, Oct 12, 2017 at 12:41:12PM +0200, Christoffer Dall wrote:
> > > Avoid saving the guest VFP registers and restoring the host VFP
> > > registers on every exit from the VM.  Only when we're about to run
> > > userspace or other threads in the kernel do we really have to switch the
> > > state back to the host state.
> > > 
> > > We still initially configure the VFP registers to trap when entering the
> > > VM, but the difference is that we now leave the guest state in the
> > > hardware registers while running the VM.
> > > 
> > > Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
> > > ---
> > >  arch/arm64/include/asm/kvm_emulate.h |  5 ++++
> > >  arch/arm64/include/asm/kvm_host.h    |  3 +++
> > >  arch/arm64/kernel/asm-offsets.c      |  1 +
> > >  arch/arm64/kvm/hyp/entry.S           |  3 +++
> > >  arch/arm64/kvm/hyp/switch.c          | 47 +++++++++++-------------------------
> > >  arch/arm64/kvm/hyp/sysreg-sr.c       | 21 +++++++++++++---
> > >  6 files changed, 44 insertions(+), 36 deletions(-)
> > > 
> > > diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
> > > index 1fbfe96..630dd60 100644
> > > --- a/arch/arm64/include/asm/kvm_emulate.h
> > > +++ b/arch/arm64/include/asm/kvm_emulate.h
> > > @@ -56,6 +56,11 @@ static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
> > >  	return (unsigned long *)&vcpu->arch.hcr_el2;
> > >  }
> > >  
> > > +static inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
> > > +{
> > > +	return (!(vcpu->arch.hcr_el2 & HCR_RW));
> > > +}
> > > +
> > >  static inline unsigned long *vcpu_pc(const struct kvm_vcpu *vcpu)
> > >  {
> > >  	return (unsigned long *)&vcpu_gp_regs(vcpu)->regs.pc;
> > > diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> > > index 7d3bfa7..5e09eb9 100644
> > > --- a/arch/arm64/include/asm/kvm_host.h
> > > +++ b/arch/arm64/include/asm/kvm_host.h
> > > @@ -210,6 +210,9 @@ struct kvm_vcpu_arch {
> > >  	/* Guest debug state */
> > >  	u64 debug_flags;
> > >  
> > > +	/* 1 if the guest VFP state is loaded into the hardware */
> > > +	u64 guest_vfp_loaded;
> > 
> > May it be just u8/bool?
> > 
> This particular field is accessed from assembly code, and I'm not sure
> what guarantees the compiler makes in terms of how a u8/bool is
> allocated with respect to padding and alignment, and I think that's why
> we've been using u64 fields in the past.
> 
> I don't actually remember the details, but I'd rather err on the side of
> caution than trying to save a few bytes.  However, if someone can
> convince me there's a completely safe way to do this, then I'm happy to
> change it.

'strb     w0, [x3, #VCPU_GUEST_VFP_LOADED]' would work. See
C6.6.181 STRB (register) in ARM64 ARM.

The only thing I would recommend is to reorder fields in kvm_vcpu_arch
to avoid unneeded holes in the structure. It already spends 10 bytes for
nothing in 3 holes.
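
For illustration, this is how such a hole arises under the AArch64
alignment rules (the offsets are an example, not the real layout):

	u64 debug_flags;	/* offset 0, size 8 */
	u8  guest_vfp_loaded;	/* offset 8, size 1 */
				/* 7-byte hole: the next u64 must be
				 * 8-byte aligned */
	u64 next_field;		/* offset 16, size 8 */

Moving narrow fields next to each other reclaims that padding.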

Yury
Christoffer Dall Nov. 26, 2017, 7:18 p.m. UTC | #8
On Sun, Nov 26, 2017 at 09:58:52PM +0300, Yury Norov wrote:
> On Sun, Nov 26, 2017 at 05:17:16PM +0100, Christoffer Dall wrote:
> > Hi Yury,
> > 
> > On Sat, Nov 25, 2017 at 10:52:21AM +0300, Yury Norov wrote:
> > > 
> > > On Thu, Oct 12, 2017 at 12:41:12PM +0200, Christoffer Dall wrote:
> > > > Avoid saving the guest VFP registers and restoring the host VFP
> > > > registers on every exit from the VM.  Only when we're about to run
> > > > userspace or other threads in the kernel do we really have to switch the
> > > > state back to the host state.
> > > > 
> > > > We still initially configure the VFP registers to trap when entering the
> > > > VM, but the difference is that we now leave the guest state in the
> > > > hardware registers while running the VM.
> > > > 
> > > > Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
> > > > ---
> > > >  arch/arm64/include/asm/kvm_emulate.h |  5 ++++
> > > >  arch/arm64/include/asm/kvm_host.h    |  3 +++
> > > >  arch/arm64/kernel/asm-offsets.c      |  1 +
> > > >  arch/arm64/kvm/hyp/entry.S           |  3 +++
> > > >  arch/arm64/kvm/hyp/switch.c          | 47 +++++++++++-------------------------
> > > >  arch/arm64/kvm/hyp/sysreg-sr.c       | 21 +++++++++++++---
> > > >  6 files changed, 44 insertions(+), 36 deletions(-)
> > > > 
> > > > diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
> > > > index 1fbfe96..630dd60 100644
> > > > --- a/arch/arm64/include/asm/kvm_emulate.h
> > > > +++ b/arch/arm64/include/asm/kvm_emulate.h
> > > > @@ -56,6 +56,11 @@ static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
> > > >  	return (unsigned long *)&vcpu->arch.hcr_el2;
> > > >  }
> > > >  
> > > > +static inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
> > > > +{
> > > > +	return (!(vcpu->arch.hcr_el2 & HCR_RW));
> > > > +}
> > > > +
> > > >  static inline unsigned long *vcpu_pc(const struct kvm_vcpu *vcpu)
> > > >  {
> > > >  	return (unsigned long *)&vcpu_gp_regs(vcpu)->regs.pc;
> > > > diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> > > > index 7d3bfa7..5e09eb9 100644
> > > > --- a/arch/arm64/include/asm/kvm_host.h
> > > > +++ b/arch/arm64/include/asm/kvm_host.h
> > > > @@ -210,6 +210,9 @@ struct kvm_vcpu_arch {
> > > >  	/* Guest debug state */
> > > >  	u64 debug_flags;
> > > >  
> > > > +	/* 1 if the guest VFP state is loaded into the hardware */
> > > > +	u64 guest_vfp_loaded;
> > > 
> > > May it be just u8/bool?
> > > 
> > This particular field is accessed from assembly code, and I'm not sure
> > what guarantees the compiler makes in terms of how a u8/bool is
> > allocated with respect to padding and alignment, and I think that's why
> > we've been using u64 fields in the past.
> > 
> > I don't actually remember the details, but I'd rather err on the side of
> > caution than trying to save a few bytes.  However, if someone can
> > convince me there's a completely safe way to do this, then I'm happy to
> > change it.
> 
> 'strb     w0, [x3, #VCPU_GUEST_VFP_LOADED]' would work. See
> C6.6.181 STRB (register) in ARM64 ARM.

I'm well aware of this instruction.  Thank you though.

The concern was that we haven't done this in the past.  I think that was
because the size of a _Bool is not well-defined and we really didn't
care about a handful of bytes when talking about vcpu
structures.  Really.

A u8 should work though, but probably this will all be moot if I combine
the flags into a single field.

> 
> The only thing I would recommend is to reorder fields in kvm_vcpu_arch
> to avoid unneeded holes in the structure. It already spends 10 bytes for
> nothing in 3 holes.
> 
Patches are welcome.

-Christoffer
Andrew Jones Nov. 27, 2017, 8:32 a.m. UTC | #9
On Sun, Nov 26, 2017 at 05:17:56PM +0100, Christoffer Dall wrote:
> Hi Drew,
> 
> On Wed, Nov 15, 2017 at 05:04:40PM +0100, Andrew Jones wrote:
> > On Thu, Oct 12, 2017 at 12:41:12PM +0200, Christoffer Dall wrote:
> > > Avoid saving the guest VFP registers and restoring the host VFP
> > > registers on every exit from the VM.  Only when we're about to run
> > > userspace or other threads in the kernel do we really have to switch the
> > > state back to the host state.
> > 
> > Rik van Riel's recently posted patch "[PATCH v2 0/2] x86,kvm: move qemu/guest
> > FPU switching out to kvm_arch_vcpu_ioctl_run" indicates that for x86 they
> > only need to swap guest and userspace VFP registers before exiting VCPU_RUN
> > to userspace, not for running other threads. I imagine that's the same for
> > ARM as well.
> > 
> > If so, then I think this hunk
> > 
> > > @@ -209,4 +212,16 @@ void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu)
> > >   */
> > >  void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu)
> > >  {
> > > +	struct kvm_cpu_context *host_ctxt = vcpu->arch.host_cpu_context;
> > > +	struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
> > > +
> > > +	/* Restore host FP/SIMD state */
> > > +	if (vcpu->arch.guest_vfp_loaded) {
> > > +		if (vcpu_el1_is_32bit(vcpu))
> > > +			kvm_call_hyp(__fpsimd32_save_state,
> > > +				     kern_hyp_va(guest_ctxt));
> > > +		__fpsimd_save_state(&guest_ctxt->gp_regs.fp_regs);
> > > +		__fpsimd_restore_state(&host_ctxt->gp_regs.fp_regs);
> > > +		vcpu->arch.guest_vfp_loaded = 0;
> > > +	}
> > >  }
> > 
> > could be moved to the return of kvm_arch_vcpu_ioctl_run().
> > 
> That sounds cool.
> 
> I'll keep this patch as it is now, and look at Rik's patches and post a
> follow up later, does that sound ok?

Works for me. I'm guessing there's also going to be some overlap with
Dave Martin's SVE work. So whichever series goes second can probably
be the one to put the most consideration into it.

Thanks,
drew
Marc Zyngier Nov. 30, 2017, 7:07 p.m. UTC | #10
On 26/11/17 18:58, Yury Norov wrote:
> On Sun, Nov 26, 2017 at 05:17:16PM +0100, Christoffer Dall wrote:
>> Hi Yury,
>>
>> On Sat, Nov 25, 2017 at 10:52:21AM +0300, Yury Norov wrote:
>>>
>>> On Thu, Oct 12, 2017 at 12:41:12PM +0200, Christoffer Dall wrote:
>>>> Avoid saving the guest VFP registers and restoring the host VFP
>>>> registers on every exit from the VM.  Only when we're about to run
>>>> userspace or other threads in the kernel do we really have to switch the
>>>> state back to the host state.
>>>>
>>>> We still initially configure the VFP registers to trap when entering the
>>>> VM, but the difference is that we now leave the guest state in the
>>>> hardware registers while running the VM.
>>>>
>>>> Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
>>>> ---
>>>>  arch/arm64/include/asm/kvm_emulate.h |  5 ++++
>>>>  arch/arm64/include/asm/kvm_host.h    |  3 +++
>>>>  arch/arm64/kernel/asm-offsets.c      |  1 +
>>>>  arch/arm64/kvm/hyp/entry.S           |  3 +++
>>>>  arch/arm64/kvm/hyp/switch.c          | 47 +++++++++++-------------------------
>>>>  arch/arm64/kvm/hyp/sysreg-sr.c       | 21 +++++++++++++---
>>>>  6 files changed, 44 insertions(+), 36 deletions(-)
>>>>
>>>> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
>>>> index 1fbfe96..630dd60 100644
>>>> --- a/arch/arm64/include/asm/kvm_emulate.h
>>>> +++ b/arch/arm64/include/asm/kvm_emulate.h
>>>> @@ -56,6 +56,11 @@ static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
>>>>  	return (unsigned long *)&vcpu->arch.hcr_el2;
>>>>  }
>>>>  
>>>> +static inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +	return (!(vcpu->arch.hcr_el2 & HCR_RW));
>>>> +}
>>>> +
>>>>  static inline unsigned long *vcpu_pc(const struct kvm_vcpu *vcpu)
>>>>  {
>>>>  	return (unsigned long *)&vcpu_gp_regs(vcpu)->regs.pc;
>>>> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
>>>> index 7d3bfa7..5e09eb9 100644
>>>> --- a/arch/arm64/include/asm/kvm_host.h
>>>> +++ b/arch/arm64/include/asm/kvm_host.h
>>>> @@ -210,6 +210,9 @@ struct kvm_vcpu_arch {
>>>>  	/* Guest debug state */
>>>>  	u64 debug_flags;
>>>>  
>>>> +	/* 1 if the guest VFP state is loaded into the hardware */
>>>> +	u64 guest_vfp_loaded;
>>>
>>> May it be just u8/bool?
>>>
>> This particular field is accessed from assembly code, and I'm not sure
>> what guarantees the compiler makes in terms of how a u8/bool is
>> allocated with respect to padding and alignment, and I think that's why
>> we've been using u64 fields in the past.
>>
>> I don't actually remember the details, but I'd rather err on the side of
>> caution than trying to save a few bytes.  However, if someone can
>> convince me there's a completely safe way to do this, then I'm happy to
>> change it.
> 
> 'strb     w0, [x3, #VCPU_GUEST_VFP_LOADED]' would work. See
> C6.6.181 STRB (register) in ARM64 ARM.
> 
> The only thing I would recommend is to reorder fields in kvm_vcpu_arch
> to avoid unneeded holes in the structure. It already spends 10 bytes for
> nothing in 3 holes.

Terrifying. How many vcpus are you going to run before this becomes a
real bottleneck? KVM on a 6502? ;-)

Now, when it comes to reordering fields, please keep in mind that the
order of the fields in the structure does matter. We want the hottest
fields grouped together so that they are fetched in the same cache line.

Thanks,

	M.
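
For illustration only (not a concrete layout proposal; the field names are
taken from this series):

	struct kvm_vcpu_arch {
		struct kvm_cpu_context ctxt;

		/* touched on every entry/exit: keep these adjacent so
		 * they are fetched in the same cache line */
		u64 hcr_el2;
		u64 debug_flags;
		u64 guest_vfp_loaded;

		/* colder, rarely-touched state follows */
	};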

Patch

diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 1fbfe96..630dd60 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -56,6 +56,11 @@  static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
 	return (unsigned long *)&vcpu->arch.hcr_el2;
 }
 
+static inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
+{
+	return (!(vcpu->arch.hcr_el2 & HCR_RW));
+}
+
 static inline unsigned long *vcpu_pc(const struct kvm_vcpu *vcpu)
 {
 	return (unsigned long *)&vcpu_gp_regs(vcpu)->regs.pc;
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 7d3bfa7..5e09eb9 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -210,6 +210,9 @@  struct kvm_vcpu_arch {
 	/* Guest debug state */
 	u64 debug_flags;
 
+	/* 1 if the guest VFP state is loaded into the hardware */
+	u64 guest_vfp_loaded;
+
 	/*
 	 * We maintain more than a single set of debug registers to support
 	 * debugging the guest from the host and to maintain separate host and
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 612021d..9946732 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -133,6 +133,7 @@  int main(void)
   DEFINE(CPU_GP_REGS,		offsetof(struct kvm_cpu_context, gp_regs));
   DEFINE(CPU_USER_PT_REGS,	offsetof(struct kvm_regs, regs));
   DEFINE(CPU_FP_REGS,		offsetof(struct kvm_regs, fp_regs));
+  DEFINE(VCPU_GUEST_VFP_LOADED,	offsetof(struct kvm_vcpu, arch.guest_vfp_loaded));
   DEFINE(VCPU_FPEXC32_EL2,	offsetof(struct kvm_vcpu, arch.ctxt.sys_regs[FPEXC32_EL2]));
   DEFINE(VCPU_HOST_CONTEXT,	offsetof(struct kvm_vcpu, arch.host_cpu_context));
   DEFINE(HOST_CONTEXT_VCPU,	offsetof(struct kvm_cpu_context, __hyp_running_vcpu));
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index 76cd48f..b3e7191 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -185,6 +185,9 @@  alternative_endif
 	add	x0, x2, #CPU_GP_REG_OFFSET(CPU_FP_REGS)
 	bl	__fpsimd_restore_state
 
+	mov	x0, #1
+	str	x0, [x3, #VCPU_GUEST_VFP_LOADED]
+
 	// Skip restoring fpexc32 for AArch64 guests
 	mrs	x1, hcr_el2
 	tbnz	x1, #HCR_RW_SHIFT, 1f
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index 7703d63..ef05c59 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -23,43 +23,31 @@ 
 #include <asm/kvm_hyp.h>
 #include <asm/fpsimd.h>
 
-static bool __hyp_text __fpsimd_enabled_nvhe(void)
-{
-	return !(read_sysreg(cptr_el2) & CPTR_EL2_TFP);
-}
-
-static bool __hyp_text __fpsimd_enabled_vhe(void)
-{
-	return !!(read_sysreg(cpacr_el1) & CPACR_EL1_FPEN);
-}
-
-static hyp_alternate_select(__fpsimd_is_enabled,
-			    __fpsimd_enabled_nvhe, __fpsimd_enabled_vhe,
-			    ARM64_HAS_VIRT_HOST_EXTN);
-
-bool __hyp_text __fpsimd_enabled(void)
-{
-	return __fpsimd_is_enabled()();
-}
-
-static void __hyp_text __activate_traps_vhe(void)
+static void __hyp_text __activate_traps_vhe(struct kvm_vcpu *vcpu)
 {
 	u64 val;
 
 	val = read_sysreg(cpacr_el1);
 	val |= CPACR_EL1_TTA;
-	val &= ~CPACR_EL1_FPEN;
+	if (vcpu->arch.guest_vfp_loaded)
+		val |= CPACR_EL1_FPEN;
+	else
+		val &= ~CPACR_EL1_FPEN;
 	write_sysreg(val, cpacr_el1);
 
 	write_sysreg(__kvm_hyp_vector, vbar_el1);
 }
 
-static void __hyp_text __activate_traps_nvhe(void)
+static void __hyp_text __activate_traps_nvhe(struct kvm_vcpu *vcpu)
 {
 	u64 val;
 
 	val = CPTR_EL2_DEFAULT;
-	val |= CPTR_EL2_TTA | CPTR_EL2_TFP;
+	val |= CPTR_EL2_TTA;
+	if (vcpu->arch.guest_vfp_loaded)
+		val &= ~CPTR_EL2_TFP;
+	else
+		val |= CPTR_EL2_TFP;
 	write_sysreg(val, cptr_el2);
 }
 
@@ -81,7 +69,8 @@  static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu)
 	 * it will cause an exception.
 	 */
 	val = vcpu->arch.hcr_el2;
-	if (!(val & HCR_RW) && system_supports_fpsimd()) {
+	if (vcpu_el1_is_32bit(vcpu) && system_supports_fpsimd() &&
+	    !vcpu->arch.guest_vfp_loaded) {
 		write_sysreg(1 << 30, fpexc32_el2);
 		isb();
 	}
@@ -97,7 +86,7 @@  static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu)
 	write_sysreg(0, pmselr_el0);
 	write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0);
 	write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
-	__activate_traps_arch()();
+	__activate_traps_arch()(vcpu);
 }
 
 static void __hyp_text __deactivate_traps_vhe(void)
@@ -273,7 +262,6 @@  int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpu_context *host_ctxt;
 	struct kvm_cpu_context *guest_ctxt;
-	bool fp_enabled;
 	u64 exit_code;
 
 	vcpu = kern_hyp_va(vcpu);
@@ -355,8 +343,6 @@  int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu)
 		/* 0 falls through to be handled out of EL2 */
 	}
 
-	fp_enabled = __fpsimd_enabled();
-
 	__sysreg_save_guest_state(guest_ctxt);
 	__sysreg32_save_state(vcpu);
 	__timer_disable_traps(vcpu);
@@ -367,11 +353,6 @@  int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu)
 
 	__sysreg_restore_host_state(host_ctxt);
 
-	if (fp_enabled) {
-		__fpsimd_save_state(&guest_ctxt->gp_regs.fp_regs);
-		__fpsimd_restore_state(&host_ctxt->gp_regs.fp_regs);
-	}
-
 	__debug_save_state(vcpu, kern_hyp_va(vcpu->arch.debug_ptr), guest_ctxt);
 	/*
 	 * This must come after restoring the host sysregs, since a non-VHE
diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c
index b7438c8..c4a3714 100644
--- a/arch/arm64/kvm/hyp/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/sysreg-sr.c
@@ -19,6 +19,7 @@ 
 #include <linux/kvm_host.h>
 
 #include <asm/kvm_asm.h>
+#include <asm/kvm_emulate.h>
 #include <asm/kvm_hyp.h>
 
 /* Yes, this does nothing, on purpose */
@@ -137,6 +138,11 @@  void __hyp_text __sysreg_restore_guest_state(struct kvm_cpu_context *ctxt)
 	__sysreg_restore_common_state(ctxt);
 }
 
+static void __hyp_text __fpsimd32_save_state(struct kvm_cpu_context *ctxt)
+{
+	ctxt->sys_regs[FPEXC32_EL2] = read_sysreg(fpexc32_el2);
+}
+
 void __hyp_text __sysreg32_save_state(struct kvm_vcpu *vcpu)
 {
 	u64 *spsr, *sysreg;
@@ -155,9 +161,6 @@  void __hyp_text __sysreg32_save_state(struct kvm_vcpu *vcpu)
 	sysreg[DACR32_EL2] = read_sysreg(dacr32_el2);
 	sysreg[IFSR32_EL2] = read_sysreg(ifsr32_el2);
 
-	if (__fpsimd_enabled())
-		sysreg[FPEXC32_EL2] = read_sysreg(fpexc32_el2);
-
 	if (vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY)
 		sysreg[DBGVCR32_EL2] = read_sysreg(dbgvcr32_el2);
 }
@@ -209,4 +212,16 @@  void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu)
  */
 void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu)
 {
+	struct kvm_cpu_context *host_ctxt = vcpu->arch.host_cpu_context;
+	struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
+
+	/* Restore host FP/SIMD state */
+	if (vcpu->arch.guest_vfp_loaded) {
+		if (vcpu_el1_is_32bit(vcpu))
+			kvm_call_hyp(__fpsimd32_save_state,
+				     kern_hyp_va(guest_ctxt));
+		__fpsimd_save_state(&guest_ctxt->gp_regs.fp_regs);
+		__fpsimd_restore_state(&host_ctxt->gp_regs.fp_regs);
+		vcpu->arch.guest_vfp_loaded = 0;
+	}
 }