diff mbox series

[16/59] KVM: arm64: nv: Save/Restore vEL2 sysregs

Message ID 20190621093843.220980-17-marc.zyngier@arm.com (mailing list archive)
State New, archived
Headers show
Series KVM: arm64: ARMv8.3 Nested Virtualization support | expand

Commit Message

Marc Zyngier June 21, 2019, 9:38 a.m. UTC
From: Andre Przywara <andre.przywara@arm.com>

Whenever we need to restore the guest's system registers to the CPU, we
now need to take care of the EL2 system registers as well. Most of them
are accessed via traps only, but some have an immediate effect and also
a guest running in VHE mode would expect them to be accessible via their
EL1 encoding, which we do not trap.

Split the current __sysreg_{save,restore}_el1_state() functions into
handling common sysregs, then differentiate between the guest running in
vEL2 and vEL1.

For vEL2 we write the virtual EL2 registers with an identical format directly
into their EL1 counterpart, and translate the few registers that have a
different format for the same effect on the execution when running a
non-VHE guest guest hypervisor.

  [ Commit message reworked and many bug fixes applied by Marc Zyngier
    and Christoffer Dall. ]

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@arm.com>
---
 arch/arm64/kvm/hyp/sysreg-sr.c | 160 +++++++++++++++++++++++++++++++--
 1 file changed, 153 insertions(+), 7 deletions(-)

Comments

Julien Thierry June 25, 2019, 8:48 a.m. UTC | #1
On 06/21/2019 10:38 AM, Marc Zyngier wrote:
> From: Andre Przywara <andre.przywara@arm.com>
> 
> Whenever we need to restore the guest's system registers to the CPU, we
> now need to take care of the EL2 system registers as well. Most of them
> are accessed via traps only, but some have an immediate effect and also
> a guest running in VHE mode would expect them to be accessible via their
> EL1 encoding, which we do not trap.
> 
> Split the current __sysreg_{save,restore}_el1_state() functions into
> handling common sysregs, then differentiate between the guest running in
> vEL2 and vEL1.
> 
> For vEL2 we write the virtual EL2 registers with an identical format directly
> into their EL1 counterpart, and translate the few registers that have a
> different format for the same effect on the execution when running a
> non-VHE guest guest hypervisor.
> 
>   [ Commit message reworked and many bug fixes applied by Marc Zyngier
>     and Christoffer Dall. ]
> 
> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
> Signed-off-by: Christoffer Dall <christoffer.dall@arm.com>
> ---
>  arch/arm64/kvm/hyp/sysreg-sr.c | 160 +++++++++++++++++++++++++++++++--
>  1 file changed, 153 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c
> index 62866a68e852..2abb9c3ff24f 100644
> --- a/arch/arm64/kvm/hyp/sysreg-sr.c
> +++ b/arch/arm64/kvm/hyp/sysreg-sr.c

[...]

> @@ -124,10 +167,91 @@ static void __hyp_text __sysreg_restore_user_state(struct kvm_cpu_context *ctxt)
>  	write_sysreg(ctxt->sys_regs[TPIDRRO_EL0],	tpidrro_el0);
>  }
>  
> -static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
> +static void __sysreg_restore_vel2_state(struct kvm_cpu_context *ctxt)
>  {
> +	u64 val;
> +
> +	write_sysreg(read_cpuid_id(),			vpidr_el2);
>  	write_sysreg(ctxt->sys_regs[MPIDR_EL1],		vmpidr_el2);
> -	write_sysreg(ctxt->sys_regs[CSSELR_EL1],	csselr_el1);
> +	write_sysreg_el1(ctxt->sys_regs[MAIR_EL2],	SYS_MAIR);
> +	write_sysreg_el1(ctxt->sys_regs[VBAR_EL2],	SYS_VBAR);
> +	write_sysreg_el1(ctxt->sys_regs[CONTEXTIDR_EL2],SYS_CONTEXTIDR);
> +	write_sysreg_el1(ctxt->sys_regs[AMAIR_EL2],	SYS_AMAIR);
> +
> +	if (__vcpu_el2_e2h_is_set(ctxt)) {
> +		/*
> +		 * In VHE mode those registers are compatible between
> +		 * EL1 and EL2.
> +		 */
> +		write_sysreg_el1(ctxt->sys_regs[SCTLR_EL2],	SYS_SCTLR);
> +		write_sysreg_el1(ctxt->sys_regs[CPTR_EL2],	SYS_CPACR);
> +		write_sysreg_el1(ctxt->sys_regs[TTBR0_EL2],	SYS_TTBR0);
> +		write_sysreg_el1(ctxt->sys_regs[TTBR1_EL2],	SYS_TTBR1);
> +		write_sysreg_el1(ctxt->sys_regs[TCR_EL2],	SYS_TCR);
> +		write_sysreg_el1(ctxt->sys_regs[CNTHCTL_EL2],	SYS_CNTKCTL);
> +	} else {
> +		write_sysreg_el1(translate_sctlr(ctxt->sys_regs[SCTLR_EL2]),
> +				 SYS_SCTLR);
> +		write_sysreg_el1(translate_cptr(ctxt->sys_regs[CPTR_EL2]),
> +				 SYS_CPACR);
> +		write_sysreg_el1(translate_ttbr0(ctxt->sys_regs[TTBR0_EL2]),
> +				 SYS_TTBR0);
> +		write_sysreg_el1(translate_tcr(ctxt->sys_regs[TCR_EL2]),
> +				 SYS_TCR);
> +		write_sysreg_el1(translate_cnthctl(ctxt->sys_regs[CNTHCTL_EL2]),
> +				 SYS_CNTKCTL);
> +	}
> +
> +	/*
> +	 * These registers can be modified behind our back by a fault
> +	 * taken inside vEL2. Save them, always.
> +	 */
> +	write_sysreg_el1(ctxt->sys_regs[ESR_EL2],	SYS_ESR);
> +	write_sysreg_el1(ctxt->sys_regs[AFSR0_EL2],	SYS_AFSR0);
> +	write_sysreg_el1(ctxt->sys_regs[AFSR1_EL2],	SYS_AFSR1);
> +	write_sysreg_el1(ctxt->sys_regs[FAR_EL2],	SYS_FAR);
> +	write_sysreg(ctxt->sys_regs[SP_EL2],		sp_el1);
> +	write_sysreg_el1(ctxt->sys_regs[ELR_EL2],	SYS_ELR);
> +
> +	val = __fixup_spsr_el2_write(ctxt, ctxt->sys_regs[SPSR_EL2]);
> +	write_sysreg_el1(val,	SYS_SPSR);
> +}
> +
> +static void __hyp_text __sysreg_restore_vel1_state(struct kvm_cpu_context *ctxt)
> +{
> +	u64 mpidr;
> +
> +	if (has_vhe()) {
> +		struct kvm_vcpu *vcpu;
> +
> +		/*
> +		 * Warning: this hack only works on VHE, because we only
> +		 * call this with the *guest* context, which is part of
> +		 * struct kvm_vcpu. On a host context, you'd get pure junk.
> +		 */
> +		vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt);

This seems very fragile, just to find out whether the guest has hyp
capabilities. It would be at least nice to make sure this is indeed a
guest context.

The *clean* way to do it could be to have a pointer to kvm_vcpu in the
kvm_cpu_context which would be NULL for host contexts.

Otherwise, I'm under the impression that for a host context,
ctxt->sys_reg[HCR_EL2] == 0 and that this would also be true for a guest
without nested virt capability. Could we use something like that here?

Cheers,

Julien
Alexandru Elisei July 1, 2019, 12:09 p.m. UTC | #2
On 6/21/19 10:38 AM, Marc Zyngier wrote:
> From: Andre Przywara <andre.przywara@arm.com>
>
> Whenever we need to restore the guest's system registers to the CPU, we
> now need to take care of the EL2 system registers as well. Most of them
> are accessed via traps only, but some have an immediate effect and also
> a guest running in VHE mode would expect them to be accessible via their
> EL1 encoding, which we do not trap.
>
> Split the current __sysreg_{save,restore}_el1_state() functions into
> handling common sysregs, then differentiate between the guest running in
> vEL2 and vEL1.
>
> For vEL2 we write the virtual EL2 registers with an identical format directly
> into their EL1 counterpart, and translate the few registers that have a
> different format for the same effect on the execution when running a
> non-VHE guest guest hypervisor.
>
>   [ Commit message reworked and many bug fixes applied by Marc Zyngier
>     and Christoffer Dall. ]
>
> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
> Signed-off-by: Christoffer Dall <christoffer.dall@arm.com>
> ---
>  arch/arm64/kvm/hyp/sysreg-sr.c | 160 +++++++++++++++++++++++++++++++--
>  1 file changed, 153 insertions(+), 7 deletions(-)
>
> diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c
> index 62866a68e852..2abb9c3ff24f 100644
> --- a/arch/arm64/kvm/hyp/sysreg-sr.c
> +++ b/arch/arm64/kvm/hyp/sysreg-sr.c
> @@ -22,6 +22,7 @@
>  #include <asm/kvm_asm.h>
>  #include <asm/kvm_emulate.h>
>  #include <asm/kvm_hyp.h>
> +#include <asm/kvm_nested.h>
>  
>  /*
>   * Non-VHE: Both host and guest must save everything.
> @@ -51,11 +52,9 @@ static void __hyp_text __sysreg_save_user_state(struct kvm_cpu_context *ctxt)
>  	ctxt->sys_regs[TPIDRRO_EL0]	= read_sysreg(tpidrro_el0);
>  }
>  
> -static void __hyp_text __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
> +static void __hyp_text __sysreg_save_vel1_state(struct kvm_cpu_context *ctxt)
>  {
> -	ctxt->sys_regs[CSSELR_EL1]	= read_sysreg(csselr_el1);
>  	ctxt->sys_regs[SCTLR_EL1]	= read_sysreg_el1(SYS_SCTLR);
> -	ctxt->sys_regs[ACTLR_EL1]	= read_sysreg(actlr_el1);
>  	ctxt->sys_regs[CPACR_EL1]	= read_sysreg_el1(SYS_CPACR);
>  	ctxt->sys_regs[TTBR0_EL1]	= read_sysreg_el1(SYS_TTBR0);
>  	ctxt->sys_regs[TTBR1_EL1]	= read_sysreg_el1(SYS_TTBR1);
> @@ -69,14 +68,58 @@ static void __hyp_text __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
>  	ctxt->sys_regs[CONTEXTIDR_EL1]	= read_sysreg_el1(SYS_CONTEXTIDR);
>  	ctxt->sys_regs[AMAIR_EL1]	= read_sysreg_el1(SYS_AMAIR);
>  	ctxt->sys_regs[CNTKCTL_EL1]	= read_sysreg_el1(SYS_CNTKCTL);
> -	ctxt->sys_regs[PAR_EL1]		= read_sysreg(par_el1);
> -	ctxt->sys_regs[TPIDR_EL1]	= read_sysreg(tpidr_el1);
>  
>  	ctxt->gp_regs.sp_el1		= read_sysreg(sp_el1);
>  	ctxt->gp_regs.elr_el1		= read_sysreg_el1(SYS_ELR);
>  	ctxt->gp_regs.spsr[KVM_SPSR_EL1]= read_sysreg_el1(SYS_SPSR);
>  }
>  
> +static void __sysreg_save_vel2_state(struct kvm_cpu_context *ctxt)
> +{
> +	ctxt->sys_regs[ESR_EL2]		= read_sysreg_el1(SYS_ESR);
> +	ctxt->sys_regs[AFSR0_EL2]	= read_sysreg_el1(SYS_AFSR0);
> +	ctxt->sys_regs[AFSR1_EL2]	= read_sysreg_el1(SYS_AFSR1);
> +	ctxt->sys_regs[FAR_EL2]		= read_sysreg_el1(SYS_FAR);
> +	ctxt->sys_regs[MAIR_EL2]	= read_sysreg_el1(SYS_MAIR);
> +	ctxt->sys_regs[VBAR_EL2]	= read_sysreg_el1(SYS_VBAR);
> +	ctxt->sys_regs[CONTEXTIDR_EL2]	= read_sysreg_el1(SYS_CONTEXTIDR);
> +	ctxt->sys_regs[AMAIR_EL2]	= read_sysreg_el1(SYS_AMAIR);
> +
> +	/*
> +	 * In VHE mode those registers are compatible between EL1 and EL2,
> +	 * and the guest uses the _EL1 versions on the CPU naturally.
> +	 * So we save them into their _EL2 versions here.
> +	 * For nVHE mode we trap accesses to those registers, so our
> +	 * _EL2 copy in sys_regs[] is always up-to-date and we don't need
> +	 * to save anything here.
> +	 */
> +	if (__vcpu_el2_e2h_is_set(ctxt)) {
> +		ctxt->sys_regs[SCTLR_EL2]	= read_sysreg_el1(SYS_SCTLR);
> +		ctxt->sys_regs[CPTR_EL2]	= read_sysreg_el1(SYS_CPACR);
> +		ctxt->sys_regs[TTBR0_EL2]	= read_sysreg_el1(SYS_TTBR0);
> +		ctxt->sys_regs[TTBR1_EL2]	= read_sysreg_el1(SYS_TTBR1);
> +		ctxt->sys_regs[TCR_EL2]		= read_sysreg_el1(SYS_TCR);
> +		ctxt->sys_regs[CNTHCTL_EL2]	= read_sysreg_el1(SYS_CNTKCTL);

This goes against how the register is declared in arch/arm64/kvm/sys_regs.c
(added by patch 13), where it's declared as a "pure" EL2 register with no EL1
counterpart. I think this is correct, and having it as a pure register is not
the right approach, I'll explain why in patch 13.

> +	}
> +
> +	ctxt->sys_regs[SP_EL2]		= read_sysreg(sp_el1);
> +	ctxt->sys_regs[ELR_EL2]		= read_sysreg_el1(SYS_ELR);
> +	ctxt->sys_regs[SPSR_EL2]	= __fixup_spsr_el2_read(ctxt, read_sysreg_el1(SYS_SPSR));
> +}
> +
> +static void __hyp_text __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
> +{
> +	ctxt->sys_regs[CSSELR_EL1]	= read_sysreg(csselr_el1);
> +	ctxt->sys_regs[ACTLR_EL1]	= read_sysreg(actlr_el1);
> +	ctxt->sys_regs[PAR_EL1]		= read_sysreg(par_el1);
> +	ctxt->sys_regs[TPIDR_EL1]	= read_sysreg(tpidr_el1);
> +
> +	if (unlikely(__is_hyp_ctxt(ctxt)))
> +		__sysreg_save_vel2_state(ctxt);
> +	else
> +		__sysreg_save_vel1_state(ctxt);
> +}
> +
>  static void __hyp_text __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt)
>  {
>  	ctxt->gp_regs.regs.pc		= read_sysreg_el2(SYS_ELR);
> @@ -124,10 +167,91 @@ static void __hyp_text __sysreg_restore_user_state(struct kvm_cpu_context *ctxt)
>  	write_sysreg(ctxt->sys_regs[TPIDRRO_EL0],	tpidrro_el0);
>  }
>  
> -static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
> +static void __sysreg_restore_vel2_state(struct kvm_cpu_context *ctxt)
>  {
> +	u64 val;
> +
> +	write_sysreg(read_cpuid_id(),			vpidr_el2);
>  	write_sysreg(ctxt->sys_regs[MPIDR_EL1],		vmpidr_el2);
> -	write_sysreg(ctxt->sys_regs[CSSELR_EL1],	csselr_el1);
> +	write_sysreg_el1(ctxt->sys_regs[MAIR_EL2],	SYS_MAIR);
> +	write_sysreg_el1(ctxt->sys_regs[VBAR_EL2],	SYS_VBAR);
> +	write_sysreg_el1(ctxt->sys_regs[CONTEXTIDR_EL2],SYS_CONTEXTIDR);
> +	write_sysreg_el1(ctxt->sys_regs[AMAIR_EL2],	SYS_AMAIR);
> +
> +	if (__vcpu_el2_e2h_is_set(ctxt)) {
> +		/*
> +		 * In VHE mode those registers are compatible between
> +		 * EL1 and EL2.
> +		 */
> +		write_sysreg_el1(ctxt->sys_regs[SCTLR_EL2],	SYS_SCTLR);
> +		write_sysreg_el1(ctxt->sys_regs[CPTR_EL2],	SYS_CPACR);
> +		write_sysreg_el1(ctxt->sys_regs[TTBR0_EL2],	SYS_TTBR0);
> +		write_sysreg_el1(ctxt->sys_regs[TTBR1_EL2],	SYS_TTBR1);
> +		write_sysreg_el1(ctxt->sys_regs[TCR_EL2],	SYS_TCR);
> +		write_sysreg_el1(ctxt->sys_regs[CNTHCTL_EL2],	SYS_CNTKCTL);
> +	} else {
> +		write_sysreg_el1(translate_sctlr(ctxt->sys_regs[SCTLR_EL2]),
> +				 SYS_SCTLR);
> +		write_sysreg_el1(translate_cptr(ctxt->sys_regs[CPTR_EL2]),
> +				 SYS_CPACR);
> +		write_sysreg_el1(translate_ttbr0(ctxt->sys_regs[TTBR0_EL2]),
> +				 SYS_TTBR0);
> +		write_sysreg_el1(translate_tcr(ctxt->sys_regs[TCR_EL2]),
> +				 SYS_TCR);
> +		write_sysreg_el1(translate_cnthctl(ctxt->sys_regs[CNTHCTL_EL2]),
> +				 SYS_CNTKCTL);
> +	}
> +
> +	/*
> +	 * These registers can be modified behind our back by a fault
> +	 * taken inside vEL2. Save them, always.
> +	 */
> +	write_sysreg_el1(ctxt->sys_regs[ESR_EL2],	SYS_ESR);
> +	write_sysreg_el1(ctxt->sys_regs[AFSR0_EL2],	SYS_AFSR0);
> +	write_sysreg_el1(ctxt->sys_regs[AFSR1_EL2],	SYS_AFSR1);
> +	write_sysreg_el1(ctxt->sys_regs[FAR_EL2],	SYS_FAR);
> +	write_sysreg(ctxt->sys_regs[SP_EL2],		sp_el1);
> +	write_sysreg_el1(ctxt->sys_regs[ELR_EL2],	SYS_ELR);
> +
> +	val = __fixup_spsr_el2_write(ctxt, ctxt->sys_regs[SPSR_EL2]);
> +	write_sysreg_el1(val,	SYS_SPSR);
> +}
> +
> +static void __hyp_text __sysreg_restore_vel1_state(struct kvm_cpu_context *ctxt)
> +{
> +	u64 mpidr;
> +
> +	if (has_vhe()) {
> +		struct kvm_vcpu *vcpu;
> +
> +		/*
> +		 * Warning: this hack only works on VHE, because we only
> +		 * call this with the *guest* context, which is part of
> +		 * struct kvm_vcpu. On a host context, you'd get pure junk.
> +		 */
> +		vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt);
> +
> +		if (nested_virt_in_use(vcpu)) {
> +			/*
> +			 * Only set VPIDR_EL2 for nested VMs, as this is the
> +			 * only time it changes. We'll restore the MIDR_EL1
> +			 * view on put.
> +			 */
> +			write_sysreg(ctxt->sys_regs[VPIDR_EL2],	vpidr_el2);
> +
> +			/*
> +			 * As we're restoring a nested guest, set the value
> +			 * provided by the guest hypervisor.
> +			 */
> +			mpidr = ctxt->sys_regs[VMPIDR_EL2];
> +		} else {
> +			mpidr = ctxt->sys_regs[MPIDR_EL1];
> +		}
> +	} else {
> +		mpidr = ctxt->sys_regs[MPIDR_EL1];
> +	}
> +
> +	write_sysreg(mpidr,				vmpidr_el2);
>  	write_sysreg_el1(ctxt->sys_regs[SCTLR_EL1],	SYS_SCTLR);
>  	write_sysreg(ctxt->sys_regs[ACTLR_EL1],		actlr_el1);
>  	write_sysreg_el1(ctxt->sys_regs[CPACR_EL1],	SYS_CPACR);
> @@ -151,6 +275,19 @@ static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
>  	write_sysreg_el1(ctxt->gp_regs.spsr[KVM_SPSR_EL1],SYS_SPSR);
>  }
>  
> +static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
> +{
> +	write_sysreg(ctxt->sys_regs[CSSELR_EL1],	csselr_el1);
> +	write_sysreg(ctxt->sys_regs[ACTLR_EL1],	  	actlr_el1);
> +	write_sysreg(ctxt->sys_regs[PAR_EL1],		par_el1);
> +	write_sysreg(ctxt->sys_regs[TPIDR_EL1],		tpidr_el1);
> +
> +	if (__is_hyp_ctxt(ctxt))
> +		__sysreg_restore_vel2_state(ctxt);
> +	else
> +		__sysreg_restore_vel1_state(ctxt);
> +}
> +
>  static void __hyp_text
>  __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctxt)
>  {
> @@ -307,6 +444,15 @@ void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu)
>  	/* Restore host user state */
>  	__sysreg_restore_user_state(host_ctxt);
>  
> +	/*
> +	 * If leaving a nesting guest, restore MPIDR_EL1 default view. It is
> +	 * slightly ugly to do it here, but the alternative is to penalize
> +	 * all non-nesting guests by forcing this on every load. Instead, we
> +	 * choose to only penalize nesting VMs.
> +	 */
> +	if (nested_virt_in_use(vcpu))
> +		write_sysreg(read_cpuid_id(),	vpidr_el2);
> +
>  	vcpu->arch.sysregs_loaded_on_cpu = false;
>  }
>
Marc Zyngier July 3, 2019, 1:42 p.m. UTC | #3
On 25/06/2019 09:48, Julien Thierry wrote:
> 
> 
> On 06/21/2019 10:38 AM, Marc Zyngier wrote:
>> From: Andre Przywara <andre.przywara@arm.com>
>>
>> Whenever we need to restore the guest's system registers to the CPU, we
>> now need to take care of the EL2 system registers as well. Most of them
>> are accessed via traps only, but some have an immediate effect and also
>> a guest running in VHE mode would expect them to be accessible via their
>> EL1 encoding, which we do not trap.
>>
>> Split the current __sysreg_{save,restore}_el1_state() functions into
>> handling common sysregs, then differentiate between the guest running in
>> vEL2 and vEL1.
>>
>> For vEL2 we write the virtual EL2 registers with an identical format directly
>> into their EL1 counterpart, and translate the few registers that have a
>> different format for the same effect on the execution when running a
>> non-VHE guest guest hypervisor.
>>
>>   [ Commit message reworked and many bug fixes applied by Marc Zyngier
>>     and Christoffer Dall. ]
>>
>> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
>> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
>> Signed-off-by: Christoffer Dall <christoffer.dall@arm.com>
>> ---
>>  arch/arm64/kvm/hyp/sysreg-sr.c | 160 +++++++++++++++++++++++++++++++--
>>  1 file changed, 153 insertions(+), 7 deletions(-)
>>
>> diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c
>> index 62866a68e852..2abb9c3ff24f 100644
>> --- a/arch/arm64/kvm/hyp/sysreg-sr.c
>> +++ b/arch/arm64/kvm/hyp/sysreg-sr.c
> 
> [...]
> 
>> @@ -124,10 +167,91 @@ static void __hyp_text __sysreg_restore_user_state(struct kvm_cpu_context *ctxt)
>>  	write_sysreg(ctxt->sys_regs[TPIDRRO_EL0],	tpidrro_el0);
>>  }
>>  
>> -static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
>> +static void __sysreg_restore_vel2_state(struct kvm_cpu_context *ctxt)
>>  {
>> +	u64 val;
>> +
>> +	write_sysreg(read_cpuid_id(),			vpidr_el2);
>>  	write_sysreg(ctxt->sys_regs[MPIDR_EL1],		vmpidr_el2);
>> -	write_sysreg(ctxt->sys_regs[CSSELR_EL1],	csselr_el1);
>> +	write_sysreg_el1(ctxt->sys_regs[MAIR_EL2],	SYS_MAIR);
>> +	write_sysreg_el1(ctxt->sys_regs[VBAR_EL2],	SYS_VBAR);
>> +	write_sysreg_el1(ctxt->sys_regs[CONTEXTIDR_EL2],SYS_CONTEXTIDR);
>> +	write_sysreg_el1(ctxt->sys_regs[AMAIR_EL2],	SYS_AMAIR);
>> +
>> +	if (__vcpu_el2_e2h_is_set(ctxt)) {
>> +		/*
>> +		 * In VHE mode those registers are compatible between
>> +		 * EL1 and EL2.
>> +		 */
>> +		write_sysreg_el1(ctxt->sys_regs[SCTLR_EL2],	SYS_SCTLR);
>> +		write_sysreg_el1(ctxt->sys_regs[CPTR_EL2],	SYS_CPACR);
>> +		write_sysreg_el1(ctxt->sys_regs[TTBR0_EL2],	SYS_TTBR0);
>> +		write_sysreg_el1(ctxt->sys_regs[TTBR1_EL2],	SYS_TTBR1);
>> +		write_sysreg_el1(ctxt->sys_regs[TCR_EL2],	SYS_TCR);
>> +		write_sysreg_el1(ctxt->sys_regs[CNTHCTL_EL2],	SYS_CNTKCTL);
>> +	} else {
>> +		write_sysreg_el1(translate_sctlr(ctxt->sys_regs[SCTLR_EL2]),
>> +				 SYS_SCTLR);
>> +		write_sysreg_el1(translate_cptr(ctxt->sys_regs[CPTR_EL2]),
>> +				 SYS_CPACR);
>> +		write_sysreg_el1(translate_ttbr0(ctxt->sys_regs[TTBR0_EL2]),
>> +				 SYS_TTBR0);
>> +		write_sysreg_el1(translate_tcr(ctxt->sys_regs[TCR_EL2]),
>> +				 SYS_TCR);
>> +		write_sysreg_el1(translate_cnthctl(ctxt->sys_regs[CNTHCTL_EL2]),
>> +				 SYS_CNTKCTL);
>> +	}
>> +
>> +	/*
>> +	 * These registers can be modified behind our back by a fault
>> +	 * taken inside vEL2. Save them, always.
>> +	 */
>> +	write_sysreg_el1(ctxt->sys_regs[ESR_EL2],	SYS_ESR);
>> +	write_sysreg_el1(ctxt->sys_regs[AFSR0_EL2],	SYS_AFSR0);
>> +	write_sysreg_el1(ctxt->sys_regs[AFSR1_EL2],	SYS_AFSR1);
>> +	write_sysreg_el1(ctxt->sys_regs[FAR_EL2],	SYS_FAR);
>> +	write_sysreg(ctxt->sys_regs[SP_EL2],		sp_el1);
>> +	write_sysreg_el1(ctxt->sys_regs[ELR_EL2],	SYS_ELR);
>> +
>> +	val = __fixup_spsr_el2_write(ctxt, ctxt->sys_regs[SPSR_EL2]);
>> +	write_sysreg_el1(val,	SYS_SPSR);
>> +}
>> +
>> +static void __hyp_text __sysreg_restore_vel1_state(struct kvm_cpu_context *ctxt)
>> +{
>> +	u64 mpidr;
>> +
>> +	if (has_vhe()) {
>> +		struct kvm_vcpu *vcpu;
>> +
>> +		/*
>> +		 * Warning: this hack only works on VHE, because we only
>> +		 * call this with the *guest* context, which is part of
>> +		 * struct kvm_vcpu. On a host context, you'd get pure junk.
>> +		 */
>> +		vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt);
> 
> This seems very fragile, just to find out whether the guest has hyp
> capabilities. It would be at least nice to make sure this is indeed a
> guest context.

Oh come on! It is such a nice hack! ;-) I distinctly remember
Christoffer being >that< close to vomiting when he saw that first.

More seriously, we know what the context is by construction.

> The *clean* way to do it could be to have a pointer to kvm_vcpu in the
> kvm_cpu_context which would be NULL for host contexts.

Funny you mention that. We have the exact opposite (host context
pointing to the running vcpu, and NULL in the guest context). Maybe we
can come up with something that always point to the vcpu, assuming
nothing yet checks for NULL to identify a guest context! ;-)

> Otherwise, I'm under the impression that for a host context,
> ctxt->sys_reg[HCR_EL2] == 0 and that this would also be true for a guest
> without nested virt capability. Could we use something like that here?

Urgh. I think I'd prefer the above suggestion. Or even my hack.

Thanks,

	M.
Alexandru Elisei Aug. 21, 2019, 11:57 a.m. UTC | #4
On 6/21/19 10:38 AM, Marc Zyngier wrote:
> From: Andre Przywara <andre.przywara@arm.com>
>
> Whenever we need to restore the guest's system registers to the CPU, we
> now need to take care of the EL2 system registers as well. Most of them
> are accessed via traps only, but some have an immediate effect and also
> a guest running in VHE mode would expect them to be accessible via their
> EL1 encoding, which we do not trap.
>
> Split the current __sysreg_{save,restore}_el1_state() functions into
> handling common sysregs, then differentiate between the guest running in
> vEL2 and vEL1.
>
> For vEL2 we write the virtual EL2 registers with an identical format directly
> into their EL1 counterpart, and translate the few registers that have a
> different format for the same effect on the execution when running a
> non-VHE guest guest hypervisor.
>
>   [ Commit message reworked and many bug fixes applied by Marc Zyngier
>     and Christoffer Dall. ]
>
> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
> Signed-off-by: Christoffer Dall <christoffer.dall@arm.com>
> ---
>  arch/arm64/kvm/hyp/sysreg-sr.c | 160 +++++++++++++++++++++++++++++++--
>  1 file changed, 153 insertions(+), 7 deletions(-)
>
> diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c
> index 62866a68e852..2abb9c3ff24f 100644
> --- a/arch/arm64/kvm/hyp/sysreg-sr.c
> +++ b/arch/arm64/kvm/hyp/sysreg-sr.c
> @@ -22,6 +22,7 @@
>  #include <asm/kvm_asm.h>
>  #include <asm/kvm_emulate.h>
>  #include <asm/kvm_hyp.h>
> +#include <asm/kvm_nested.h>
>  
>  /*
>   * Non-VHE: Both host and guest must save everything.
> @@ -51,11 +52,9 @@ static void __hyp_text __sysreg_save_user_state(struct kvm_cpu_context *ctxt)
>  	ctxt->sys_regs[TPIDRRO_EL0]	= read_sysreg(tpidrro_el0);
>  }
>  
> -static void __hyp_text __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
> +static void __hyp_text __sysreg_save_vel1_state(struct kvm_cpu_context *ctxt)
>  {
> -	ctxt->sys_regs[CSSELR_EL1]	= read_sysreg(csselr_el1);
>  	ctxt->sys_regs[SCTLR_EL1]	= read_sysreg_el1(SYS_SCTLR);
> -	ctxt->sys_regs[ACTLR_EL1]	= read_sysreg(actlr_el1);
>  	ctxt->sys_regs[CPACR_EL1]	= read_sysreg_el1(SYS_CPACR);
>  	ctxt->sys_regs[TTBR0_EL1]	= read_sysreg_el1(SYS_TTBR0);
>  	ctxt->sys_regs[TTBR1_EL1]	= read_sysreg_el1(SYS_TTBR1);
> @@ -69,14 +68,58 @@ static void __hyp_text __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
>  	ctxt->sys_regs[CONTEXTIDR_EL1]	= read_sysreg_el1(SYS_CONTEXTIDR);
>  	ctxt->sys_regs[AMAIR_EL1]	= read_sysreg_el1(SYS_AMAIR);
>  	ctxt->sys_regs[CNTKCTL_EL1]	= read_sysreg_el1(SYS_CNTKCTL);
> -	ctxt->sys_regs[PAR_EL1]		= read_sysreg(par_el1);
> -	ctxt->sys_regs[TPIDR_EL1]	= read_sysreg(tpidr_el1);
>  
>  	ctxt->gp_regs.sp_el1		= read_sysreg(sp_el1);
>  	ctxt->gp_regs.elr_el1		= read_sysreg_el1(SYS_ELR);
>  	ctxt->gp_regs.spsr[KVM_SPSR_EL1]= read_sysreg_el1(SYS_SPSR);
>  }
>  
> +static void __sysreg_save_vel2_state(struct kvm_cpu_context *ctxt)
> +{
> +	ctxt->sys_regs[ESR_EL2]		= read_sysreg_el1(SYS_ESR);
> +	ctxt->sys_regs[AFSR0_EL2]	= read_sysreg_el1(SYS_AFSR0);
> +	ctxt->sys_regs[AFSR1_EL2]	= read_sysreg_el1(SYS_AFSR1);
> +	ctxt->sys_regs[FAR_EL2]		= read_sysreg_el1(SYS_FAR);
> +	ctxt->sys_regs[MAIR_EL2]	= read_sysreg_el1(SYS_MAIR);
> +	ctxt->sys_regs[VBAR_EL2]	= read_sysreg_el1(SYS_VBAR);
> +	ctxt->sys_regs[CONTEXTIDR_EL2]	= read_sysreg_el1(SYS_CONTEXTIDR);
> +	ctxt->sys_regs[AMAIR_EL2]	= read_sysreg_el1(SYS_AMAIR);
> +
> +	/*
> +	 * In VHE mode those registers are compatible between EL1 and EL2,
> +	 * and the guest uses the _EL1 versions on the CPU naturally.
> +	 * So we save them into their _EL2 versions here.
> +	 * For nVHE mode we trap accesses to those registers, so our
> +	 * _EL2 copy in sys_regs[] is always up-to-date and we don't need
> +	 * to save anything here.
> +	 */
> +	if (__vcpu_el2_e2h_is_set(ctxt)) {
> +		ctxt->sys_regs[SCTLR_EL2]	= read_sysreg_el1(SYS_SCTLR);
> +		ctxt->sys_regs[CPTR_EL2]	= read_sysreg_el1(SYS_CPACR);
> +		ctxt->sys_regs[TTBR0_EL2]	= read_sysreg_el1(SYS_TTBR0);
> +		ctxt->sys_regs[TTBR1_EL2]	= read_sysreg_el1(SYS_TTBR1);
> +		ctxt->sys_regs[TCR_EL2]		= read_sysreg_el1(SYS_TCR);
> +		ctxt->sys_regs[CNTHCTL_EL2]	= read_sysreg_el1(SYS_CNTKCTL);
> +	}

This can break guests that run with VHE on, then disable it. I stumbled into
this while working on kvm-unit-tests, which uses TTBR0 for the translation
tables. Let's consider the following scenario:

1. Guest sets HCR_EL2.E2H
2. Guest programs translation tables in TTBR0_EL1, which should reflect in
TTBR0_EL2.
3. Guest enabled MMU and does stuff.
4. Guest disables MMU and clears HCR_EL2.E2H
5. Guest turns MMU on. It doesn't change TTBR0_EL2, because it will use the same
translation tables as when running with E2H set.
6. The vcpu gets scheduled out. E2H is not set, so the value that the guest
programmed in hardware TTBR0_EL1 won't be copied to virtual TTBR0_EL2.
7. The vcpu gets scheduled back in. KVM will write the reset value for virtual
TTBR0_EL2 (which is 0x0).
8. The guest hangs.

I think this is actually a symptom of a deeper issue. When E2H is cleared, the
values that the guest wrote to the EL1 registers aren't immediately reflected in
the virtual EL2 registers, as it happens on real hardware. Instead, some of the
hardware values from the EL1 registers are copied to the corresponding EL2
registers on the next vcpu_put, which happens at a later time.

I am thinking that something like this will fix the issues (it did fix disabling
VHE in kvm-unit-tests):

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 1d896113f1f8..f2b5a39762d0 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -333,7 +333,8 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
                 * to reverse-translate virtual EL2 system registers for a
                 * non-VHE guest hypervisor.
                 */
-               __vcpu_sys_reg(vcpu, reg) = val;
+               if (reg != HCR_EL2)
+                       __vcpu_sys_reg(vcpu, reg) = val;
 
                switch (reg) {
                case ELR_EL2:
@@ -370,7 +371,17 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int
reg)
                return;
 
 memory_write:
-        __vcpu_sys_reg(vcpu, reg) = val;
+       if (reg == HCR_EL2 && vcpu_el2_e2h_is_set(vcpu) && !(val & HCR_E2H)) {
+               preempt_disable();
+               kvm_arch_vcpu_put(vcpu);
+
+               __vcpu_sys_reg(vcpu, reg) = val;
+
+               kvm_arch_vcpu_load(vcpu, smp_processor_id());
+               preempt_enable();
+       } else {
+                __vcpu_sys_reg(vcpu, reg) = val;
+       }
 }
 
 /* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */

I don't think there's any need to convert EL1 registers to their non-vhe EL2
format because of how RES0/RES1 is defined in the architecture glossary (ARM DDI
0487E.a, page 7893 for RES0 and 7894 for RES1):

"If a bit is RES0 only in some contexts:
A read of the bit must return the value last successfully written to the bit, by
either a direct or an indirect write, regardless of the use of the register when
the bit was written
[..]
While the use of the register is such that the bit is described as RES0, the
value of the bit must have no effect on the operation of the PE, other than
determining the value read back from that bit, unless this Manual explicitly
defines additional properties for the bit"

We have the translate functions that should take care of converting the non-vhe
EL2 format to the hardware EL1 format.

As an aside, the diff looks weird because the vcpu_write_sys_reg is very
complex, there are a LOT of exit points from the function, and the register
value is written twice for some registers. I think it's worth considering making
the function simpler, maybe splitting it into two separate functions, one for
EL2 registers, one for regular registers.

Here's the kvm-unit-tests diff that I used to spot the bug. It's very far from
being correct, but the test is able to finish with the fix (it hangs otherwise).
You can apply it on top of 2130fd4154ad ("tscdeadline_latency: Check condition
first before loop"):

diff --git a/arm/cstart64.S b/arm/cstart64.S
index b0e8baa1a23a..01357b3b116b 100644
--- a/arm/cstart64.S
+++ b/arm/cstart64.S
@@ -51,6 +51,18 @@ start:
     b    1b
 
 1:
+    mrs    x4, CurrentEL
+    cmp    x4, CurrentEL_EL2
+    b.ne    1f
+    mrs    x4, mpidr_el1
+    msr    vmpidr_el2, x4
+    mrs    x4, midr_el1
+    msr    vpidr_el2, x4
+    ldr    x4, =(HCR_EL2_TGE | HCR_EL2_E2H)
+    msr    hcr_el2, x4
+    isb
+
+1:
     /* set up stack */
     mov    x4, #1
     msr    spsel, x4
@@ -101,6 +113,18 @@ get_mmu_off:
 
 .globl secondary_entry
 secondary_entry:
+    mrs    x0, CurrentEL
+    cmp    x0, CurrentEL_EL2
+    b.ne    1f
+    mrs    x0, mpidr_el1
+    msr    vmpidr_el2, x0
+    mrs    x0, midr_el1
+    msr    vpidr_el2, x0
+    ldr    x0, =(HCR_EL2_TGE | HCR_EL2_E2H)
+    msr    hcr_el2, x0
+    isb
+
+1:
     /* Enable FP/ASIMD */
     mov    x0, #(3 << 20)
     msr    cpacr_el1, x0
@@ -194,6 +218,33 @@ asm_mmu_enable:
 
     ret
 
+asm_mmu_enable_hyp:
+       ic      iallu
+       tlbi    alle2is
+       dsb     ish
+
+        /* TCR */
+       ldr     x1, =TCR_EL2_RES1 |                     \
+                    TCR_T0SZ(VA_BITS) |                \
+                    TCR_TG0_64K |                      \
+                    TCR_IRGN0_WBWA | TCR_ORGN0_WBWA |  \
+                    TCR_SH0_IS
+       mrs     x2, id_aa64mmfr0_el1
+       bfi     x1, x2, #TCR_EL2_PS_SHIFT, #3
+       msr     tcr_el2, x1
+
+       /* Same MAIR and TTBR0 as in VHE mode */
+
+       /* SCTLR */
+       ldr     x1, =SCTLR_EL2_RES1 |                   \
+                    SCTLR_EL2_C |                      \
+                    SCTLR_EL2_I |                      \
+                    SCTLR_EL2_M
+       msr     sctlr_el2, x1
+       isb
+
+       ret
+
 .globl asm_mmu_disable
 asm_mmu_disable:
     mrs    x0, sctlr_el1
@@ -202,6 +253,18 @@ asm_mmu_disable:
     isb
     ret
 
+.globl asm_disable_vhe
+asm_disable_vhe:
+    str     x30, [sp, #-16]!
+
+    bl      asm_mmu_disable
+    msr     hcr_el2, xzr
+    isb
+    bl      asm_mmu_enable_hyp
+
+    ldr     x30, [sp], #16
+    ret
+
 /*
  * Vectors
  * Adapted from arch/arm64/kernel/entry.S
diff --git a/arm/selftest.c b/arm/selftest.c
index 28a17f7a7531..68a18036221b 100644
--- a/arm/selftest.c
+++ b/arm/selftest.c
@@ -287,6 +287,12 @@ static void user_psci_system_off(struct pt_regs *regs,
unsigned int esr)
 {
     __user_psci_system_off();
 }
+
+extern void asm_disable_vhe(void);
+static void check_el2(void)
+{
+    asm_disable_vhe();
+}
 #endif
 
 static void check_vectors(void *arg __unused)
@@ -369,6 +375,10 @@ int main(int argc, char **argv)
         report("PSCI version", psci_check());
         on_cpus(cpu_report, NULL);
 
+    } else if (strcmp(argv[1], "el2") == 0) {
+
+        check_el2();
+
     } else {
         printf("Unknown subtest\n");
         abort();
diff --git a/lib/arm/psci.c b/lib/arm/psci.c
index c3d399064ae3..5ef1c0386ce1 100644
--- a/lib/arm/psci.c
+++ b/lib/arm/psci.c
@@ -16,7 +16,7 @@ int psci_invoke(unsigned long function_id, unsigned long arg0,
         unsigned long arg1, unsigned long arg2)
 {
     asm volatile(
-        "hvc #0"
+        "smc #0"
     : "+r" (function_id)
     : "r" (arg0), "r" (arg1), "r" (arg2));
     return function_id;
diff --git a/lib/arm64/asm/pgtable-hwdef.h b/lib/arm64/asm/pgtable-hwdef.h
index 045a3ce12645..6b80e34dda0c 100644
--- a/lib/arm64/asm/pgtable-hwdef.h
+++ b/lib/arm64/asm/pgtable-hwdef.h
@@ -95,18 +95,42 @@
 /*
  * TCR flags.
  */
-#define TCR_TxSZ(x)        (((UL(64) - (x)) << 16) | ((UL(64) - (x)) << 0))
-#define TCR_IRGN_NC        ((UL(0) << 8) | (UL(0) << 24))
-#define TCR_IRGN_WBWA        ((UL(1) << 8) | (UL(1) << 24))
-#define TCR_IRGN_WT        ((UL(2) << 8) | (UL(2) << 24))
-#define TCR_IRGN_WBnWA        ((UL(3) << 8) | (UL(3) << 24))
-#define TCR_IRGN_MASK        ((UL(3) << 8) | (UL(3) << 24))
-#define TCR_ORGN_NC        ((UL(0) << 10) | (UL(0) << 26))
-#define TCR_ORGN_WBWA        ((UL(1) << 10) | (UL(1) << 26))
-#define TCR_ORGN_WT        ((UL(2) << 10) | (UL(2) << 26))
-#define TCR_ORGN_WBnWA        ((UL(3) << 10) | (UL(3) << 26))
-#define TCR_ORGN_MASK        ((UL(3) << 10) | (UL(3) << 26))
-#define TCR_SHARED        ((UL(3) << 12) | (UL(3) << 28))
+#define TCR_T0SZ(x)        ((UL(64) - (x)) << 0)
+#define TCR_T1SZ(x)        ((UL(64) - (x)) << 16)
+#define TCR_TxSZ(x)        (TCR_T0SZ(x) | TCR_T1SZ(x))
+#define TCR_IRGN0_NC        (UL(0) << 8)
+#define TCR_IRGN1_NC        (UL(0) << 24)
+#define TCR_IRGN_NC        (TCR_IRGN0_NC | TCR_IRGN1_NC)
+#define TCR_IRGN0_WBWA        (UL(1) << 8)
+#define TCR_IRGN1_WBWA        (UL(1) << 24)
+#define TCR_IRGN_WBWA        (TCR_IRGN0_WBWA | TCR_IRGN1_WBWA)
+#define TCR_IRGN0_WT        (UL(2) << 8)
+#define TCR_IRGN1_WT        (UL(2) << 24)
+#define TCR_IRGN_WT        (TCR_IRGN0_WT | TCR_IRGN1_WT)
+#define TCR_IRGN0_WBnWA        (UL(3) << 8)
+#define TCR_IRGN1_WBnWA        (UL(3) << 24)
+#define TCR_IRGN_WBnWA        (TCR_IRGN0_WBnWA | TCR_IRGN1_WBnWA)
+#define TCR_IRGN0_MASK        (UL(3) << 8)
+#define TCR_IRGN1_MASK        (UL(3) << 24)
+#define TCR_IRGN_MASK        (TCR_IRGN0_MASK | TCR_IRGN1_MASK)
+#define TCR_ORGN0_NC        (UL(0) << 10)
+#define TCR_ORGN1_NC        (UL(0) << 26)
+#define TCR_ORGN_NC        (TCR_ORGN0_NC | TCR_ORGN1_NC)
+#define TCR_ORGN0_WBWA        (UL(1) << 10)
+#define TCR_ORGN1_WBWA        (UL(1) << 26)
+#define TCR_ORGN_WBWA        (TCR_ORGN0_WBWA | TCR_ORGN1_WBWA)
+#define TCR_ORGN0_WT        (UL(2) << 10)
+#define TCR_ORGN1_WT        (UL(2) << 26)
+#define TCR_ORGN_WT        (TCR_ORGN0_WT | TCR_ORGN1_WT)
+#define TCR_ORGN0_WBnWA        (UL(3) << 8)
+#define TCR_ORGN1_WBnWA        (UL(3) << 24)
+#define TCR_ORGN_WBnWA        (TCR_ORGN0_WBnWA | TCR_ORGN1_WBnWA)
+#define TCR_ORGN0_MASK        (UL(3) << 10)
+#define TCR_ORGN1_MASK        (UL(3) << 26)
+#define TCR_ORGN_MASK        (TCR_ORGN0_MASK | TCR_ORGN1_MASK)
+#define TCR_SH0_IS        (UL(3) << 12)
+#define TCR_SH1_IS        (UL(3) << 28)
+#define TCR_SHARED        (TCR_SH0_IS | TCR_SH1_IS)
 #define TCR_TG0_4K        (UL(0) << 14)
 #define TCR_TG0_64K        (UL(1) << 14)
 #define TCR_TG0_16K        (UL(2) << 14)
@@ -116,6 +140,9 @@
 #define TCR_ASID16        (UL(1) << 36)
 #define TCR_TBI0        (UL(1) << 37)
 
+#define TCR_EL2_RES1        ((UL(1) << 31) | (UL(1) << 23))
+#define TCR_EL2_PS_SHIFT    16
+
 /*
  * Memory types available.
  */
diff --git a/lib/arm64/asm/processor.h b/lib/arm64/asm/processor.h
index 1d9223f728a5..b2136acda743 100644
--- a/lib/arm64/asm/processor.h
+++ b/lib/arm64/asm/processor.h
@@ -16,6 +16,16 @@
 #define SCTLR_EL1_A    (1 << 1)
 #define SCTLR_EL1_M    (1 << 0)
 
+#define HCR_EL2_TGE    (1 << 27)
+#define HCR_EL2_E2H    (1 << 34)
+
+#define SCTLR_EL2_RES1    ((UL(3) << 28) | (UL(3) << 22) |    \
+            (UL(1) << 18) | (UL(1) << 16) |        \
+            (UL(1) << 11) | (UL(3) << 4))
+#define SCTLR_EL2_I    SCTLR_EL1_I
+#define SCTLR_EL2_C    SCTLR_EL1_C
+#define SCTLR_EL2_M    SCTLR_EL1_M
+
 #ifndef __ASSEMBLY__
 #include <asm/ptrace.h>
 #include <asm/esr.h>


To run it:

lkvm run -f selftest.flat -c 1 -m 128 -p el2 --nested --irqchip gicv3 --console
serial

Thanks,
Alex
diff mbox series

Patch

diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c
index 62866a68e852..2abb9c3ff24f 100644
--- a/arch/arm64/kvm/hyp/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/sysreg-sr.c
@@ -22,6 +22,7 @@ 
 #include <asm/kvm_asm.h>
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_hyp.h>
+#include <asm/kvm_nested.h>
 
 /*
  * Non-VHE: Both host and guest must save everything.
@@ -51,11 +52,9 @@  static void __hyp_text __sysreg_save_user_state(struct kvm_cpu_context *ctxt)
 	ctxt->sys_regs[TPIDRRO_EL0]	= read_sysreg(tpidrro_el0);
 }
 
-static void __hyp_text __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
+static void __hyp_text __sysreg_save_vel1_state(struct kvm_cpu_context *ctxt)
 {
-	ctxt->sys_regs[CSSELR_EL1]	= read_sysreg(csselr_el1);
 	ctxt->sys_regs[SCTLR_EL1]	= read_sysreg_el1(SYS_SCTLR);
-	ctxt->sys_regs[ACTLR_EL1]	= read_sysreg(actlr_el1);
 	ctxt->sys_regs[CPACR_EL1]	= read_sysreg_el1(SYS_CPACR);
 	ctxt->sys_regs[TTBR0_EL1]	= read_sysreg_el1(SYS_TTBR0);
 	ctxt->sys_regs[TTBR1_EL1]	= read_sysreg_el1(SYS_TTBR1);
@@ -69,14 +68,58 @@  static void __hyp_text __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
 	ctxt->sys_regs[CONTEXTIDR_EL1]	= read_sysreg_el1(SYS_CONTEXTIDR);
 	ctxt->sys_regs[AMAIR_EL1]	= read_sysreg_el1(SYS_AMAIR);
 	ctxt->sys_regs[CNTKCTL_EL1]	= read_sysreg_el1(SYS_CNTKCTL);
-	ctxt->sys_regs[PAR_EL1]		= read_sysreg(par_el1);
-	ctxt->sys_regs[TPIDR_EL1]	= read_sysreg(tpidr_el1);
 
 	ctxt->gp_regs.sp_el1		= read_sysreg(sp_el1);
 	ctxt->gp_regs.elr_el1		= read_sysreg_el1(SYS_ELR);
 	ctxt->gp_regs.spsr[KVM_SPSR_EL1]= read_sysreg_el1(SYS_SPSR);
 }
 
+static void __sysreg_save_vel2_state(struct kvm_cpu_context *ctxt)
+{
+	ctxt->sys_regs[ESR_EL2]		= read_sysreg_el1(SYS_ESR);
+	ctxt->sys_regs[AFSR0_EL2]	= read_sysreg_el1(SYS_AFSR0);
+	ctxt->sys_regs[AFSR1_EL2]	= read_sysreg_el1(SYS_AFSR1);
+	ctxt->sys_regs[FAR_EL2]		= read_sysreg_el1(SYS_FAR);
+	ctxt->sys_regs[MAIR_EL2]	= read_sysreg_el1(SYS_MAIR);
+	ctxt->sys_regs[VBAR_EL2]	= read_sysreg_el1(SYS_VBAR);
+	ctxt->sys_regs[CONTEXTIDR_EL2]	= read_sysreg_el1(SYS_CONTEXTIDR);
+	ctxt->sys_regs[AMAIR_EL2]	= read_sysreg_el1(SYS_AMAIR);
+
+	/*
+	 * In VHE mode those registers are compatible between EL1 and EL2,
+	 * and the guest uses the _EL1 versions on the CPU naturally.
+	 * So we save them into their _EL2 versions here.
+	 * For nVHE mode we trap accesses to those registers, so our
+	 * _EL2 copy in sys_regs[] is always up-to-date and we don't need
+	 * to save anything here.
+	 */
+	if (__vcpu_el2_e2h_is_set(ctxt)) {
+		ctxt->sys_regs[SCTLR_EL2]	= read_sysreg_el1(SYS_SCTLR);
+		ctxt->sys_regs[CPTR_EL2]	= read_sysreg_el1(SYS_CPACR);
+		ctxt->sys_regs[TTBR0_EL2]	= read_sysreg_el1(SYS_TTBR0);
+		ctxt->sys_regs[TTBR1_EL2]	= read_sysreg_el1(SYS_TTBR1);
+		ctxt->sys_regs[TCR_EL2]		= read_sysreg_el1(SYS_TCR);
+		ctxt->sys_regs[CNTHCTL_EL2]	= read_sysreg_el1(SYS_CNTKCTL);
+	}
+
+	ctxt->sys_regs[SP_EL2]		= read_sysreg(sp_el1);
+	ctxt->sys_regs[ELR_EL2]		= read_sysreg_el1(SYS_ELR);
+	ctxt->sys_regs[SPSR_EL2]	= __fixup_spsr_el2_read(ctxt, read_sysreg_el1(SYS_SPSR));
+}
+
+static void __hyp_text __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
+{
+	ctxt->sys_regs[CSSELR_EL1]	= read_sysreg(csselr_el1);
+	ctxt->sys_regs[ACTLR_EL1]	= read_sysreg(actlr_el1);
+	ctxt->sys_regs[PAR_EL1]		= read_sysreg(par_el1);
+	ctxt->sys_regs[TPIDR_EL1]	= read_sysreg(tpidr_el1);
+
+	if (unlikely(__is_hyp_ctxt(ctxt)))
+		__sysreg_save_vel2_state(ctxt);
+	else
+		__sysreg_save_vel1_state(ctxt);
+}
+
 static void __hyp_text __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt)
 {
 	ctxt->gp_regs.regs.pc		= read_sysreg_el2(SYS_ELR);
@@ -124,10 +167,91 @@  static void __hyp_text __sysreg_restore_user_state(struct kvm_cpu_context *ctxt)
 	write_sysreg(ctxt->sys_regs[TPIDRRO_EL0],	tpidrro_el0);
 }
 
-static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
+static void __sysreg_restore_vel2_state(struct kvm_cpu_context *ctxt)
 {
+	u64 val;
+
+	write_sysreg(read_cpuid_id(),			vpidr_el2);
 	write_sysreg(ctxt->sys_regs[MPIDR_EL1],		vmpidr_el2);
-	write_sysreg(ctxt->sys_regs[CSSELR_EL1],	csselr_el1);
+	write_sysreg_el1(ctxt->sys_regs[MAIR_EL2],	SYS_MAIR);
+	write_sysreg_el1(ctxt->sys_regs[VBAR_EL2],	SYS_VBAR);
+	write_sysreg_el1(ctxt->sys_regs[CONTEXTIDR_EL2],SYS_CONTEXTIDR);
+	write_sysreg_el1(ctxt->sys_regs[AMAIR_EL2],	SYS_AMAIR);
+
+	if (__vcpu_el2_e2h_is_set(ctxt)) {
+		/*
+		 * In VHE mode those registers are compatible between
+		 * EL1 and EL2.
+		 */
+		write_sysreg_el1(ctxt->sys_regs[SCTLR_EL2],	SYS_SCTLR);
+		write_sysreg_el1(ctxt->sys_regs[CPTR_EL2],	SYS_CPACR);
+		write_sysreg_el1(ctxt->sys_regs[TTBR0_EL2],	SYS_TTBR0);
+		write_sysreg_el1(ctxt->sys_regs[TTBR1_EL2],	SYS_TTBR1);
+		write_sysreg_el1(ctxt->sys_regs[TCR_EL2],	SYS_TCR);
+		write_sysreg_el1(ctxt->sys_regs[CNTHCTL_EL2],	SYS_CNTKCTL);
+	} else {
+		write_sysreg_el1(translate_sctlr(ctxt->sys_regs[SCTLR_EL2]),
+				 SYS_SCTLR);
+		write_sysreg_el1(translate_cptr(ctxt->sys_regs[CPTR_EL2]),
+				 SYS_CPACR);
+		write_sysreg_el1(translate_ttbr0(ctxt->sys_regs[TTBR0_EL2]),
+				 SYS_TTBR0);
+		write_sysreg_el1(translate_tcr(ctxt->sys_regs[TCR_EL2]),
+				 SYS_TCR);
+		write_sysreg_el1(translate_cnthctl(ctxt->sys_regs[CNTHCTL_EL2]),
+				 SYS_CNTKCTL);
+	}
+
+	/*
+	 * These registers can be modified behind our back by a fault
+	 * taken inside vEL2. Save them, always.
+	 */
+	write_sysreg_el1(ctxt->sys_regs[ESR_EL2],	SYS_ESR);
+	write_sysreg_el1(ctxt->sys_regs[AFSR0_EL2],	SYS_AFSR0);
+	write_sysreg_el1(ctxt->sys_regs[AFSR1_EL2],	SYS_AFSR1);
+	write_sysreg_el1(ctxt->sys_regs[FAR_EL2],	SYS_FAR);
+	write_sysreg(ctxt->sys_regs[SP_EL2],		sp_el1);
+	write_sysreg_el1(ctxt->sys_regs[ELR_EL2],	SYS_ELR);
+
+	val = __fixup_spsr_el2_write(ctxt, ctxt->sys_regs[SPSR_EL2]);
+	write_sysreg_el1(val,	SYS_SPSR);
+}
+
+static void __hyp_text __sysreg_restore_vel1_state(struct kvm_cpu_context *ctxt)
+{
+	u64 mpidr;
+
+	if (has_vhe()) {
+		struct kvm_vcpu *vcpu;
+
+		/*
+		 * Warning: this hack only works on VHE, because we only
+		 * call this with the *guest* context, which is part of
+		 * struct kvm_vcpu. On a host context, you'd get pure junk.
+		 */
+		vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt);
+
+		if (nested_virt_in_use(vcpu)) {
+			/*
+			 * Only set VPIDR_EL2 for nested VMs, as this is the
+			 * only time it changes. We'll restore the MIDR_EL1
+			 * view on put.
+			 */
+			write_sysreg(ctxt->sys_regs[VPIDR_EL2],	vpidr_el2);
+
+			/*
+			 * As we're restoring a nested guest, set the value
+			 * provided by the guest hypervisor.
+			 */
+			mpidr = ctxt->sys_regs[VMPIDR_EL2];
+		} else {
+			mpidr = ctxt->sys_regs[MPIDR_EL1];
+		}
+	} else {
+		mpidr = ctxt->sys_regs[MPIDR_EL1];
+	}
+
+	write_sysreg(mpidr,				vmpidr_el2);
 	write_sysreg_el1(ctxt->sys_regs[SCTLR_EL1],	SYS_SCTLR);
 	write_sysreg(ctxt->sys_regs[ACTLR_EL1],		actlr_el1);
 	write_sysreg_el1(ctxt->sys_regs[CPACR_EL1],	SYS_CPACR);
@@ -151,6 +275,19 @@  static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
 	write_sysreg_el1(ctxt->gp_regs.spsr[KVM_SPSR_EL1],SYS_SPSR);
 }
 
+static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
+{
+	write_sysreg(ctxt->sys_regs[CSSELR_EL1],	csselr_el1);
+	write_sysreg(ctxt->sys_regs[ACTLR_EL1],	  	actlr_el1);
+	write_sysreg(ctxt->sys_regs[PAR_EL1],		par_el1);
+	write_sysreg(ctxt->sys_regs[TPIDR_EL1],		tpidr_el1);
+
+	if (__is_hyp_ctxt(ctxt))
+		__sysreg_restore_vel2_state(ctxt);
+	else
+		__sysreg_restore_vel1_state(ctxt);
+}
+
 static void __hyp_text
 __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctxt)
 {
@@ -307,6 +444,15 @@  void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu)
 	/* Restore host user state */
 	__sysreg_restore_user_state(host_ctxt);
 
+	/*
+	 * If leaving a nesting guest, restore MPIDR_EL1 default view. It is
+	 * slightly ugly to do it here, but the alternative is to penalize
+	 * all non-nesting guests by forcing this on every load. Instead, we
+	 * choose to only penalize nesting VMs.
+	 */
+	if (nested_virt_in_use(vcpu))
+		write_sysreg(read_cpuid_id(),	vpidr_el2);
+
 	vcpu->arch.sysregs_loaded_on_cpu = false;
 }