diff mbox series

[4/5] arm64: KVM: Prevent speculative S1 PTW when restoring vcpu context

Message ID 20190925111941.88103-5-maz@kernel.org (mailing list archive)
State New, archived
Headers show
Series arm64: KVM: Add workaround for errata 1319367 and 1319537 | expand

Commit Message

Marc Zyngier Sept. 25, 2019, 11:19 a.m. UTC
When handling erratum 1319367, we must ensure that the page table
walker cannot parse the S1 page tables while the guest is in an
inconsistent state. This is done as follows:

On guest entry:
- TCR_EL1.EPD{0,1} are set, ensuring that no PTW can occur
- all system registers are restored, except for TCR_EL1 and SCTLR_EL1
- stage-2 is restored
- SCTLR_EL1 and TCR_EL1 are restored

On guest exit:
- SCTLR_EL1.M and TCR_EL1.EPD{0,1} are set, ensuring that no PTW can occur
- stage-2 is disabled
- All host system registers are restored

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/hyp/switch.c    | 31 +++++++++++++++++++++++++++++++
 arch/arm64/kvm/hyp/sysreg-sr.c | 14 ++++++++++++--
 2 files changed, 43 insertions(+), 2 deletions(-)

Comments

James Morse Oct. 3, 2019, 11:09 a.m. UTC | #1
Hi Marc,

On 25/09/2019 12:19, Marc Zyngier wrote:
> When handling erratum 1319367, we must ensure that the page table
> walker cannot parse the S1 page tables while the guest is in an
> inconsistent state. This is done as follows:
> 
> On guest entry:
> - TCR_EL1.EPD{0,1} are set, ensuring that no PTW can occur
> - all system registers are restored, except for TCR_EL1 and SCTLR_EL1
> - stage-2 is restored
> - SCTLR_EL1 and TCR_EL1 are restored
> 
> On guest exit:
> - SCTLR_EL1.M and TCR_EL1.EPD{0,1} are set, ensuring that no PTW can occur
> - stage-2 is disabled
> - All host system registers are restored

> diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
> index e6adb90c12ae..4df47d013bec 100644
> --- a/arch/arm64/kvm/hyp/switch.c
> +++ b/arch/arm64/kvm/hyp/switch.c
> @@ -156,6 +170,23 @@ static void __hyp_text __deactivate_traps_nvhe(void)
>  {
>  	u64 mdcr_el2 = read_sysreg(mdcr_el2);
>  
> +	if (cpus_have_const_cap(ARM64_WORKAROUND_1319367)) {
> +		u64 val;
> +
> +		/*
> +		 * Set the TCR and SCTLR registers in the exact opposite
> +		 * sequence as __activate_traps_nvhe (first prevent walks,
> +		 * then force the MMU on). A generous sprinkling of isb()
> +		 * ensure that things happen in this exact order.
> +		 */
> +		val = read_sysreg_el1(SYS_TCR);
> +		write_sysreg_el1(val | TCR_EPD1_MASK | TCR_EPD0_MASK, SYS_TCR);
> +		isb();
> +		val = read_sysreg_el1(SYS_SCTLR);
> +		write_sysreg_el1(val | SCTLR_ELx_M, SYS_SCTLR);
> +		isb();
> +	}

We are exiting the guest, and heading back to the host.
This change forces stage-1 off. Stage-2 is still enabled, but its about to be disabled and
have the host VMID restore in __deactivate_vm(). All good so far.

Then we hit __sysreg_restore_state_nvhe() for the host, which calls
__sysreg_restore_el1_state()...


> diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c
> index 7ddbc849b580..adabdceacc10 100644
> --- a/arch/arm64/kvm/hyp/sysreg-sr.c
> +++ b/arch/arm64/kvm/hyp/sysreg-sr.c
> @@ -117,12 +117,22 @@ static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
>  {
>  	write_sysreg(ctxt->sys_regs[MPIDR_EL1],		vmpidr_el2);
>  	write_sysreg(ctxt->sys_regs[CSSELR_EL1],	csselr_el1);
> -	write_sysreg_el1(ctxt->sys_regs[SCTLR_EL1],	SYS_SCTLR);
> +
> +	/* Must only be done for guest registers, hence the context test */
> +	if (cpus_have_const_cap(ARM64_WORKAROUND_1319367) &&
> +	    !ctxt->__hyp_running_vcpu) {
> +		write_sysreg_el1(ctxt->sys_regs[TCR_EL1] |
> +				 TCR_EPD1_MASK | TCR_EPD0_MASK,	SYS_TCR);
> +		isb();
> +	} else {

... which will come in here.

> +		write_sysreg_el1(ctxt->sys_regs[SCTLR_EL1],	SYS_SCTLR);
> +		write_sysreg_el1(ctxt->sys_regs[TCR_EL1],	SYS_TCR);

This reverses what we did in __deactivate_traps_nvhe(), but we haven't restored the host
TTBRs yet. I don't think the vttbr_el2 write has been sync'd either.

A speculative AT at this point could see the TCR EPDx bits clear, but the guest's TTBR
values. It may also see the guest-VMID.


I think the change to this function needs splitting up. Restore of guest state needs to be
as you have it here, before the guest TTBRs are written.

Restore of the host state needs to only clear the EPDx bits after the TTBRs are written,
and sync'd.


Assuming I'm making sense ... with that:
Reviewed-by: James Morse <james.morse@arm.com>

for the series.


> +	}
> +
>  	write_sysreg(ctxt->sys_regs[ACTLR_EL1],		actlr_el1);
>  	write_sysreg_el1(ctxt->sys_regs[CPACR_EL1],	SYS_CPACR);
>  	write_sysreg_el1(ctxt->sys_regs[TTBR0_EL1],	SYS_TTBR0);
>  	write_sysreg_el1(ctxt->sys_regs[TTBR1_EL1],	SYS_TTBR1);
> -	write_sysreg_el1(ctxt->sys_regs[TCR_EL1],	SYS_TCR);
>  	write_sysreg_el1(ctxt->sys_regs[ESR_EL1],	SYS_ESR);
>  	write_sysreg_el1(ctxt->sys_regs[AFSR0_EL1],	SYS_AFSR0);
>  	write_sysreg_el1(ctxt->sys_regs[AFSR1_EL1],	SYS_AFSR1);


Thanks,

James
diff mbox series

Patch

diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index e6adb90c12ae..4df47d013bec 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -118,6 +118,20 @@  static void __hyp_text __activate_traps_nvhe(struct kvm_vcpu *vcpu)
 	}
 
 	write_sysreg(val, cptr_el2);
+
+	if (cpus_have_const_cap(ARM64_WORKAROUND_1319367)) {
+		struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt;
+
+		isb();
+		/*
+		 * At this stage, and thanks to the above isb(), S2 is
+		 * configured and enabled. We can now restore the guest's S1
+		 * configuration: SCTLR, and only then TCR.
+		 */
+		write_sysreg_el1(ctxt->sys_regs[SCTLR_EL1],	SYS_SCTLR);
+		isb();
+		write_sysreg_el1(ctxt->sys_regs[TCR_EL1],	SYS_TCR);
+	}
 }
 
 static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu)
@@ -156,6 +170,23 @@  static void __hyp_text __deactivate_traps_nvhe(void)
 {
 	u64 mdcr_el2 = read_sysreg(mdcr_el2);
 
+	if (cpus_have_const_cap(ARM64_WORKAROUND_1319367)) {
+		u64 val;
+
+		/*
+		 * Set the TCR and SCTLR registers in the exact opposite
+		 * sequence as __activate_traps_nvhe (first prevent walks,
+		 * then force the MMU on). A generous sprinkling of isb()
+		 * ensure that things happen in this exact order.
+		 */
+		val = read_sysreg_el1(SYS_TCR);
+		write_sysreg_el1(val | TCR_EPD1_MASK | TCR_EPD0_MASK, SYS_TCR);
+		isb();
+		val = read_sysreg_el1(SYS_SCTLR);
+		write_sysreg_el1(val | SCTLR_ELx_M, SYS_SCTLR);
+		isb();
+	}
+
 	__deactivate_traps_common();
 
 	mdcr_el2 &= MDCR_EL2_HPMN_MASK;
diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c
index 7ddbc849b580..adabdceacc10 100644
--- a/arch/arm64/kvm/hyp/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/sysreg-sr.c
@@ -117,12 +117,22 @@  static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
 {
 	write_sysreg(ctxt->sys_regs[MPIDR_EL1],		vmpidr_el2);
 	write_sysreg(ctxt->sys_regs[CSSELR_EL1],	csselr_el1);
-	write_sysreg_el1(ctxt->sys_regs[SCTLR_EL1],	SYS_SCTLR);
+
+	/* Must only be done for guest registers, hence the context test */
+	if (cpus_have_const_cap(ARM64_WORKAROUND_1319367) &&
+	    !ctxt->__hyp_running_vcpu) {
+		write_sysreg_el1(ctxt->sys_regs[TCR_EL1] |
+				 TCR_EPD1_MASK | TCR_EPD0_MASK,	SYS_TCR);
+		isb();
+	} else {
+		write_sysreg_el1(ctxt->sys_regs[SCTLR_EL1],	SYS_SCTLR);
+		write_sysreg_el1(ctxt->sys_regs[TCR_EL1],	SYS_TCR);
+	}
+
 	write_sysreg(ctxt->sys_regs[ACTLR_EL1],		actlr_el1);
 	write_sysreg_el1(ctxt->sys_regs[CPACR_EL1],	SYS_CPACR);
 	write_sysreg_el1(ctxt->sys_regs[TTBR0_EL1],	SYS_TTBR0);
 	write_sysreg_el1(ctxt->sys_regs[TTBR1_EL1],	SYS_TTBR1);
-	write_sysreg_el1(ctxt->sys_regs[TCR_EL1],	SYS_TCR);
 	write_sysreg_el1(ctxt->sys_regs[ESR_EL1],	SYS_ESR);
 	write_sysreg_el1(ctxt->sys_regs[AFSR0_EL1],	SYS_AFSR0);
 	write_sysreg_el1(ctxt->sys_regs[AFSR1_EL1],	SYS_AFSR1);