
[v3,14/18] KVM: arm64: nv: Add SW walker for AT S1 emulation

Message ID 20240813100540.1955263-15-maz@kernel.org (mailing list archive)
State New, archived
Series KVM: arm64: nv: Add support for address translation instructions

Commit Message

Marc Zyngier Aug. 13, 2024, 10:05 a.m. UTC
In order to plug the brokenness of our current AT implementation,
we need a SW walker that is going to... err.. walk the S1 tables
and tell us what it finds.

Of course, it builds on top of our S2 walker, and shares similar
concepts. The beauty of it is that since it uses kvm_read_guest(),
it is able to bring back pages that have been otherwise evicted.

This is then plugged into the two AT S1 emulation functions as
a "slow path" fallback. I'm not sure it is that slow, but hey.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/at.c | 607 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 605 insertions(+), 2 deletions(-)

Comments

Marc Zyngier Aug. 14, 2024, 10:08 a.m. UTC | #1
On 2024-08-13 11:05, Marc Zyngier wrote:
> In order to plug the brokenness of our current AT implementation,
> we need a SW walker that is going to... err.. walk the S1 tables
> and tell us what it finds.
> 
> Of course, it builds on top of our S2 walker, and share similar
> concepts. The beauty of it is that since it uses kvm_read_guest(),
> it is able to bring back pages that have been otherwise evicted.
> 
> This is then plugged in the two AT S1 emulation functions as
> a "slow path" fallback. I'm not sure it is that slow, but hey.
> 
> Signed-off-by: Marc Zyngier <maz@kernel.org>
> ---
>  arch/arm64/kvm/at.c | 607 +++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 605 insertions(+), 2 deletions(-)

[...]

> +static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
> +{
> +	bool perm_fail, ur, uw, ux, pr, pw, px;
> +	struct s1_walk_result wr = {};
> +	struct s1_walk_info wi = {};
> +	int ret, idx;
> +
> +	ret = setup_s1_walk(vcpu, op, &wi, &wr, vaddr);
> +	if (ret)
> +		goto compute_par;
> +
> +	if (wr.level == S1_MMU_DISABLED)
> +		goto compute_par;
> +
> +	idx = srcu_read_lock(&vcpu->kvm->srcu);
> +
> +	ret = walk_s1(vcpu, &wi, &wr, vaddr);
> +
> +	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> +
> +	if (ret)
> +		goto compute_par;
> +
> +	/* FIXME: revisit when adding indirect permission support */
> +	/* AArch64.S1DirectBasePermissions() */
> +	if (wi.regime != TR_EL2) {
> +		switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr.desc)) {
> +		case 0b00:
> +			pr = pw = true;
> +			ur = uw = false;
> +			break;
> +		case 0b01:
> +			pr = pw = ur = uw = true;
> +			break;
> +		case 0b10:
> +			pr = true;
> +			pw = ur = uw = false;
> +			break;
> +		case 0b11:
> +			pr = ur = true;
> +			pw = uw = false;
> +			break;
> +		}
> +
> +		switch (wr.APTable) {
> +		case 0b00:
> +			break;
> +		case 0b01:
> +			ur = uw = false;
> +			break;
> +		case 0b10:
> +			pw = uw = false;
> +			break;
> +		case 0b11:
> +			pw = ur = uw = false;
> +			break;
> +		}
> +
> +		/* We don't use px for anything yet, but hey... */
> +		px = !((wr.desc & PTE_PXN) || wr.PXNTable || pw);

Annoying (but so far harmless) bug here: the last term should be 'uw',
and not 'pw'. It is a *userspace* writable permission that disables
privileged execution. 'pw' would only make sense with WXN, and
that has nothing to do with AT at all.

I've fixed that locally.
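
i.e. something like this, for illustration:

-		px = !((wr.desc & PTE_PXN) || wr.PXNTable || pw);
+		px = !((wr.desc & PTE_PXN) || wr.PXNTable || uw);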

         M.
Alexandru Elisei Aug. 15, 2024, 4:44 p.m. UTC | #2
Hi Marc,

On Tue, Aug 13, 2024 at 11:05:36AM +0100, Marc Zyngier wrote:
> In order to plug the brokenness of our current AT implementation,
> we need a SW walker that is going to... err.. walk the S1 tables
> and tell us what it finds.
> 
> Of course, it builds on top of our S2 walker, and share similar
> concepts. The beauty of it is that since it uses kvm_read_guest(),
> it is able to bring back pages that have been otherwise evicted.
> 
> This is then plugged in the two AT S1 emulation functions as
> a "slow path" fallback. I'm not sure it is that slow, but hey.
> 
> Signed-off-by: Marc Zyngier <maz@kernel.org>
> ---
>  arch/arm64/kvm/at.c | 607 +++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 605 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> index 9865d29b3149..6d5555e98557 100644
> --- a/arch/arm64/kvm/at.c
> +++ b/arch/arm64/kvm/at.c
> @@ -4,9 +4,405 @@
>   * Author: Jintack Lim <jintack.lim@linaro.org>
>   */
>  
> +#include <linux/kvm_host.h>
> +
> +#include <asm/esr.h>
>  #include <asm/kvm_hyp.h>
>  #include <asm/kvm_mmu.h>
>  
> +enum trans_regime {
> +	TR_EL10,
> +	TR_EL20,
> +	TR_EL2,
> +};
> +
> +struct s1_walk_info {
> +	u64	     		baddr;
> +	enum trans_regime	regime;
> +	unsigned int		max_oa_bits;
> +	unsigned int		pgshift;
> +	unsigned int		txsz;
> +	int 	     		sl;
> +	bool	     		hpd;
> +	bool	     		be;
> +	bool	     		s2;
> +};
> +
> +struct s1_walk_result {
> +	union {
> +		struct {
> +			u64	desc;
> +			u64	pa;
> +			s8	level;
> +			u8	APTable;
> +			bool	UXNTable;
> +			bool	PXNTable;
> +		};
> +		struct {
> +			u8	fst;
> +			bool	ptw;
> +			bool	s2;
> +		};
> +	};
> +	bool	failed;
> +};
> +
> +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2)
> +{
> +	wr->fst		= fst;
> +	wr->ptw		= ptw;
> +	wr->s2		= s2;
> +	wr->failed	= true;
> +}
> +
> +#define S1_MMU_DISABLED		(-127)
> +
> +static int get_ia_size(struct s1_walk_info *wi)
> +{
> +	return 64 - wi->txsz;
> +}
> +
> +/* Return true if the IPA is out of the OA range */
> +static bool check_output_size(u64 ipa, struct s1_walk_info *wi)
> +{
> +	return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits));
> +}
> +
> +/* Return the translation regime that applies to an AT instruction */
> +static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op)
> +{
> +	/*
> +	 * We only get here from guest EL2, so the translation
> +	 * regime AT applies to is solely defined by {E2H,TGE}.
> +	 */
> +	switch (op) {
> +	case OP_AT_S1E2R:
> +	case OP_AT_S1E2W:
> +		return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
> +		break;
> +	default:
> +		return (vcpu_el2_e2h_is_set(vcpu) &&
> +			vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10;
> +	}
> +}
> +
> +static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
> +			 struct s1_walk_result *wr, u64 va)
> +{
> +	u64 sctlr, tcr, tg, ps, ia_bits, ttbr;
> +	unsigned int stride, x;
> +	bool va55, tbi, lva, as_el0;
> +
> +	wi->regime = compute_translation_regime(vcpu, op);
> +	as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);
> +
> +	va55 = va & BIT(55);
> +
> +	if (wi->regime == TR_EL2 && va55)
> +		goto addrsz;
> +
> +	wi->s2 = (wi->regime == TR_EL10 &&
> +		  (__vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC)));

This could be written on one line if there were a local variable for the HCR_EL2
register (which is already read multiple times in the function).
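
For example (untested sketch, with 'hcr' as the new local):

	u64 hcr = __vcpu_sys_reg(vcpu, HCR_EL2);

	wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC));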

> +
> +	switch (wi->regime) {
> +	case TR_EL10:
> +		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL1);
> +		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL1);
> +		ttbr	= (va55 ?
> +			   vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
> +			   vcpu_read_sys_reg(vcpu, TTBR0_EL1));
> +		break;
> +	case TR_EL2:
> +	case TR_EL20:
> +		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL2);
> +		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL2);
> +		ttbr	= (va55 ?
> +			   vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
> +			   vcpu_read_sys_reg(vcpu, TTBR0_EL2));
> +		break;
> +	default:
> +		BUG();
> +	}
> +
> +	tbi = (wi->regime == TR_EL2 ?
> +	       FIELD_GET(TCR_EL2_TBI, tcr) :
> +	       (va55 ?
> +		FIELD_GET(TCR_TBI1, tcr) :
> +		FIELD_GET(TCR_TBI0, tcr)));
> +
> +	if (!tbi && (u64)sign_extend64(va, 55) != va)
> +		goto addrsz;
> +
> +	va = (u64)sign_extend64(va, 55);
> +
> +	/* Let's put the MMU disabled case aside immediately */
> +	switch (wi->regime) {
> +	case TR_EL10:
> +		/*
> +		 * If dealing with the EL1&0 translation regime, 3 things
> +		 * can disable the S1 translation:
> +		 *
> +		 * - HCR_EL2.DC = 1
> +		 * - HCR_EL2.{E2H,TGE} = {0,1}
> +		 * - SCTLR_EL1.M = 0
> +		 *
> +		 * The TGE part is interesting. If we have decided that this
> +		 * is EL1&0, then it means that either {E2H,TGE} == {1,0} or
> +		 * {0,x}, and we only need to test for TGE == 1.
> +		 */
> +		if (__vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_DC | HCR_TGE))
> +			wr->level = S1_MMU_DISABLED;

There's no need to fallthrough and check SCTLR_ELx.M if the MMU disabled check
here is true.

> +		fallthrough;
> +	case TR_EL2:
> +	case TR_EL20:
> +		if (!(sctlr & SCTLR_ELx_M))
> +			wr->level = S1_MMU_DISABLED;
> +		break;
> +	}
> +
> +	if (wr->level == S1_MMU_DISABLED) {
> +		if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
> +			goto addrsz;
> +
> +		wr->pa = va;
> +		return 0;
> +	}
> +
> +	wi->be = sctlr & SCTLR_ELx_EE;
> +
> +	wi->hpd  = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP);
> +	wi->hpd &= (wi->regime == TR_EL2 ?
> +		    FIELD_GET(TCR_EL2_HPD, tcr) :
> +		    (va55 ?
> +		     FIELD_GET(TCR_HPD1, tcr) :
> +		     FIELD_GET(TCR_HPD0, tcr)));
> +
> +	/* Someone was silly enough to encode TG0/TG1 differently */
> +	if (va55) {
> +		wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
> +		tg = FIELD_GET(TCR_TG1_MASK, tcr);
> +
> +		switch (tg << TCR_TG1_SHIFT) {
> +		case TCR_TG1_4K:
> +			wi->pgshift = 12;	 break;
> +		case TCR_TG1_16K:
> +			wi->pgshift = 14;	 break;
> +		case TCR_TG1_64K:
> +		default:	    /* IMPDEF: treat any other value as 64k */
> +			wi->pgshift = 16;	 break;
> +		}

Just a thought, wi->pgshift is used in several places to identify the guest page
size, might be useful to have something like PAGE_SHIFT_{4K,16K,64K}. That would
also make its usage consistent, because in some places wi->pgshift is compared
directly to 12, 14 or 16, in other places the page size is computed from
wi->pgshift and compared to SZ_4K, SZ_16K or SZ_64K.

> +	} else {
> +		wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
> +		tg = FIELD_GET(TCR_TG0_MASK, tcr);
> +
> +		switch (tg << TCR_TG0_SHIFT) {
> +		case TCR_TG0_4K:
> +			wi->pgshift = 12;	 break;
> +		case TCR_TG0_16K:
> +			wi->pgshift = 14;	 break;
> +		case TCR_TG0_64K:
> +		default:	    /* IMPDEF: treat any other value as 64k */
> +			wi->pgshift = 16;	 break;
> +		}
> +	}
> +
> +	/* R_PLCGL, R_YXNYW */
> +	if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) {
> +		if (wi->txsz > 39)
> +			goto transfault_l0;
> +	} else {
> +		if (wi->txsz > 48 || (wi->pgshift == 16 && wi->txsz > 47))
> +			goto transfault_l0;
> +	}
> +
> +	/* R_GTJBY, R_SXWGM */
> +	switch (BIT(wi->pgshift)) {
> +	case SZ_4K:
> +		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT);
> +		lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS);
> +		break;
> +	case SZ_16K:
> +		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT);
> +		lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS);
> +		break;
> +	case SZ_64K:
> +		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52);
> +		break;
> +	}
> +
> +	if ((lva && wi->txsz < 12) || wi->txsz < 16)
> +		goto transfault_l0;

Let's assume lva = true, wi->txsz greater than 12, but smaller than 16, which is
architecturally allowed according to R_GTJBY and AArch64.S1MinTxSZ().

(lva && wi->txsz < 12) = false
wi->txsz < 16 = true

KVM treats it as a fault.

> +
> +	ia_bits = get_ia_size(wi);
> +
> +	/* R_YYVYV, I_THCZK */
> +	if ((!va55 && va > GENMASK(ia_bits - 1, 0)) ||
> +	    (va55 && va < GENMASK(63, ia_bits)))
> +		goto transfault_l0;
> +
> +	/* I_ZFSYQ */
> +	if (wi->regime != TR_EL2 &&
> +	    (tcr & ((va55) ? TCR_EPD1_MASK : TCR_EPD0_MASK)))
> +		goto transfault_l0;

Extra parentheses around va55?

> +
> +	/* R_BNDVG and following statements */
> +	if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) &&
> +	    as_el0 && (tcr & ((va55) ? TCR_E0PD1 : TCR_E0PD0)))
> +		goto transfault_l0;

Same here with the extra parentheses around va55.

> +
> +	/* AArch64.S1StartLevel() */
> +	stride = wi->pgshift - 3;
> +	wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);
> +
> +	ps = (wi->regime == TR_EL2 ?
> +	      FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));
> +
> +	wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps));
> +
> +	/* Compute minimal alignment */
> +	x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);
> +
> +	wi->baddr = ttbr & TTBRx_EL1_BADDR;
> +
> +	/* R_VPBBF */
> +	if (check_output_size(wi->baddr, wi))
> +		goto transfault_l0;

I think R_VPBBF says that an Address size fault is generated here, not a
translation fault.

> +
> +	wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);
> +
> +	return 0;
> +
> +addrsz:				/* Address Size Fault level 0 */
> +	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false, false);
> +	return -EFAULT;
> +
> +transfault_l0:			/* Translation Fault level 0 */
> +	fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false, false);
> +	return -EFAULT;
> +}

[..]

> +static bool par_check_s1_perm_fault(u64 par)
> +{
> +	u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
> +
> +	return  ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM &&
> +		 !(par & SYS_PAR_EL1_S));

ESR_ELx_FSC_PERM = 0x0c is a permission fault, level 0, which the Arm ARM says
can only happen with FEAT_LPA2. I think the code should check that the value for
PAR_EL1.FST is in the interval (ESR_ELx_FSC_PERM_L(0), ESR_ELx_FSC_PERM_L(3)].
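
I.e. something along the lines of (sketch only):

	return (fst > ESR_ELx_FSC_PERM_L(0) &&
		fst <= ESR_ELx_FSC_PERM_L(3) &&
		!(par & SYS_PAR_EL1_S));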

With the remaining minor issues fixed:

Reviewed-by: Alexandru Elisei <alexandru.elisei@Arm.com>

Thanks,
Alex

> +}
> +
>  void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
>  {
>  	u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);
>  
> +	/*
> +	 * If PAR_EL1 reports that AT failed on a S1 permission fault, we
> +	 * know for sure that the PTW was able to walk the S1 tables and
> +	 * there's nothing else to do.
> +	 *
> +	 * If AT failed for any other reason, then we must walk the guest S1
> +	 * to emulate the instruction.
> +	 */
> +	if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
> +		par = handle_at_slow(vcpu, op, vaddr);
> +
>  	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
>  }
>  
> @@ -407,6 +1006,10 @@ void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
>  		isb();
>  	}
>  
> +	/* We failed the translation, let's replay it in slow motion */
> +	if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
> +		par = handle_at_slow(vcpu, op, vaddr);
> +
>  	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
>  }
>  
> @@ -463,7 +1066,7 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
>  	/* Check the access permission */
>  	if (!out.esr &&
>  	    ((!write && !out.readable) || (write && !out.writable)))
> -		out.esr = ESR_ELx_FSC_PERM | (out.level & 0x3);
> +		out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3);
>  
>  	par = compute_par_s12(vcpu, par, &out);
>  	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
> -- 
> 2.39.2
>
Marc Zyngier Aug. 15, 2024, 6:28 p.m. UTC | #3
Hi Alex,

On Thu, 15 Aug 2024 17:44:02 +0100,
Alexandru Elisei <alexandru.elisei@arm.com> wrote:

[..]

> > +static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
> > +			 struct s1_walk_result *wr, u64 va)
> > +{
> > +	u64 sctlr, tcr, tg, ps, ia_bits, ttbr;
> > +	unsigned int stride, x;
> > +	bool va55, tbi, lva, as_el0;
> > +
> > +	wi->regime = compute_translation_regime(vcpu, op);
> > +	as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);
> > +
> > +	va55 = va & BIT(55);
> > +
> > +	if (wi->regime == TR_EL2 && va55)
> > +		goto addrsz;
> > +
> > +	wi->s2 = (wi->regime == TR_EL10 &&
> > +		  (__vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC)));
> 
> This could be written on one line if there were a local variable for the HCR_EL2
> register (which is already read multiple times in the function).

Sure thing.

[...]

> > +	/* Let's put the MMU disabled case aside immediately */
> > +	switch (wi->regime) {
> > +	case TR_EL10:
> > +		/*
> > +		 * If dealing with the EL1&0 translation regime, 3 things
> > +		 * can disable the S1 translation:
> > +		 *
> > +		 * - HCR_EL2.DC = 1
> > +		 * - HCR_EL2.{E2H,TGE} = {0,1}
> > +		 * - SCTLR_EL1.M = 0
> > +		 *
> > +		 * The TGE part is interesting. If we have decided that this
> > +		 * is EL1&0, then it means that either {E2H,TGE} == {1,0} or
> > +		 * {0,x}, and we only need to test for TGE == 1.
> > +		 */
> > +		if (__vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_DC | HCR_TGE))
> > +			wr->level = S1_MMU_DISABLED;
> 
> There's no need to fallthrough and check SCTLR_ELx.M if the MMU disabled check
> here is true.

I'm not sure it makes the code more readable. But if you do, why not.
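
I.e. something along these lines, if we go that way (sketch):

	switch (wi->regime) {
	case TR_EL10:
		/* ... */
		if (__vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_DC | HCR_TGE)) {
			wr->level = S1_MMU_DISABLED;
			break;
		}
		fallthrough;
	case TR_EL2:
	case TR_EL20:
		if (!(sctlr & SCTLR_ELx_M))
			wr->level = S1_MMU_DISABLED;
		break;
	}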

[...]

> > +	/* Someone was silly enough to encode TG0/TG1 differently */
> > +	if (va55) {
> > +		wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
> > +		tg = FIELD_GET(TCR_TG1_MASK, tcr);
> > +
> > +		switch (tg << TCR_TG1_SHIFT) {
> > +		case TCR_TG1_4K:
> > +			wi->pgshift = 12;	 break;
> > +		case TCR_TG1_16K:
> > +			wi->pgshift = 14;	 break;
> > +		case TCR_TG1_64K:
> > +		default:	    /* IMPDEF: treat any other value as 64k */
> > +			wi->pgshift = 16;	 break;
> > +		}
> 
> Just a thought, wi->pgshift is used in several places to identify the guest page
> size, might be useful to have something like PAGE_SHIFT_{4K,16K,64K}. That would
> also make its usage consistent, because in some places wi->pgshift is compared
> directly to 12, 14 or 16, in other places the page size is computed from
> wi->pgshift and compared to SZ_4K, SZ_16K or SZ_64K.

I only found a single place where we compare wi->pgshift to a
non-symbolic integer (as part of the R_YXNYW handling). Everywhere
else, we use BIT(wi->pgshift) and compare it to SZ_*K. We moved away
from the various PAGE_SHIFT_* macros some years ago, and I don't think
we want them back.

What I can do is to convert the places where we init pgshift to use an
explicit size using const_ilog2():

@@ -185,12 +188,12 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
 
 		switch (tg << TCR_TG1_SHIFT) {
 		case TCR_TG1_4K:
-			wi->pgshift = 12;	 break;
+			wi->pgshift = const_ilog2(SZ_4K);	 break;
 		case TCR_TG1_16K:
-			wi->pgshift = 14;	 break;
+			wi->pgshift = const_ilog2(SZ_16K);	 break;
 		case TCR_TG1_64K:
 		default:	    /* IMPDEF: treat any other value as 64k */
-			wi->pgshift = 16;	 break;
+			wi->pgshift = const_ilog2(SZ_64K);	 break;
 		}
 	} else {
 		wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
@@ -198,12 +201,12 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
 
 		switch (tg << TCR_TG0_SHIFT) {
 		case TCR_TG0_4K:
-			wi->pgshift = 12;	 break;
+			wi->pgshift = const_ilog2(SZ_4K);	 break;
 		case TCR_TG0_16K:
-			wi->pgshift = 14;	 break;
+			wi->pgshift = const_ilog2(SZ_16K);	 break;
 		case TCR_TG0_64K:
 		default:	    /* IMPDEF: treat any other value as 64k */
-			wi->pgshift = 16;	 break;
+			wi->pgshift = const_ilog2(SZ_64K);	 break;
 		}
 	}
 
@@ -212,7 +215,7 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
 		if (wi->txsz > 39)
 			goto transfault_l0;
 	} else {
-		if (wi->txsz > 48 || (wi->pgshift == 16 && wi->txsz > 47))
+		if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47))
 			goto transfault_l0;
 	}


> 
> > +	} else {
> > +		wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
> > +		tg = FIELD_GET(TCR_TG0_MASK, tcr);
> > +
> > +		switch (tg << TCR_TG0_SHIFT) {
> > +		case TCR_TG0_4K:
> > +			wi->pgshift = 12;	 break;
> > +		case TCR_TG0_16K:
> > +			wi->pgshift = 14;	 break;
> > +		case TCR_TG0_64K:
> > +		default:	    /* IMPDEF: treat any other value as 64k */
> > +			wi->pgshift = 16;	 break;
> > +		}
> > +	}
> > +
> > +	/* R_PLCGL, R_YXNYW */
> > +	if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) {
> > +		if (wi->txsz > 39)
> > +			goto transfault_l0;
> > +	} else {
> > +		if (wi->txsz > 48 || (wi->pgshift == 16 && wi->txsz > 47))
> > +			goto transfault_l0;
> > +	}
> > +
> > +	/* R_GTJBY, R_SXWGM */
> > +	switch (BIT(wi->pgshift)) {
> > +	case SZ_4K:
> > +		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT);
> > +		lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS);
> > +		break;
> > +	case SZ_16K:
> > +		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT);
> > +		lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS);
> > +		break;
> > +	case SZ_64K:
> > +		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52);
> > +		break;
> > +	}
> > +
> > +	if ((lva && wi->txsz < 12) || wi->txsz < 16)
> > +		goto transfault_l0;
> 
> Let's assume lva = true, wi->txsz greater than 12, but smaller than 16, which is
> architecturally allowed according to R_GTJBY and AArch64.S1MinTxSZ().
> 
> (lva && wi->txsz < 12) = false
> wi->txsz < 16 = true
> 
> KVM treats it as a fault.

Gah... Fixed with:

@@ -231,7 +234,7 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
 		break;
 	}
 
-	if ((lva && wi->txsz < 12) || wi->txsz < 16)
+	if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16))
 		goto transfault_l0;
 
 	ia_bits = get_ia_size(wi);

Not that it has an impact yet, given that we don't support any of the
52bit stuff yet, but thanks for spotting this!

[...]

> > +	/* R_VPBBF */
> > +	if (check_output_size(wi->baddr, wi))
> > +		goto transfault_l0;
> 
> I think R_VPBBF says that an Address size fault is generated here, not a
> translation fault.

Indeed, another one fixed.
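
Roughly (the exact hunk may differ):

 	/* R_VPBBF */
 	if (check_output_size(wi->baddr, wi))
-		goto transfault_l0;
+		goto addrsz;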

> 
> > +
> > +	wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);
> > +
> > +	return 0;
> > +
> > +addrsz:				/* Address Size Fault level 0 */
> > +	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false, false);
> > +	return -EFAULT;
> > +
> > +transfault_l0:			/* Translation Fault level 0 */
> > +	fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false, false);
> > +	return -EFAULT;
> > +}
> 
> [..]
> 
> > +static bool par_check_s1_perm_fault(u64 par)
> > +{
> > +	u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
> > +
> > +	return  ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM &&
> > +		 !(par & SYS_PAR_EL1_S));
> 
> ESR_ELx_FSC_PERM = 0x0c is a permission fault, level 0, which Arm ARM says can
> only happen when FEAT_LPA2. I think the code should check that the value for
> PAR_EL1.FST is in the interval (ESR_ELx_FSC_PERM_L(0), ESR_ELx_FSC_PERM_L(3)].

I honestly don't want to second-guess the HW. If it reports something
that is the wrong level, why should we trust the FSC at all?

> With the remaining minor issues fixed:
> 
> Reviewed-by: Alexandru Elisei <alexandru.elisei@Arm.com>

Again, many thanks for your patience going through this.

	M.
Alexandru Elisei Aug. 16, 2024, 9:22 a.m. UTC | #4
Hi Marc,

On Thu, Aug 15, 2024 at 07:28:41PM +0100, Marc Zyngier wrote:
> 
> Hi Alex,
> 
> On Thu, 15 Aug 2024 17:44:02 +0100,
> Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> 
> [..]
> 
> > > +static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
> > > +			 struct s1_walk_result *wr, u64 va)
> > > +{
> > > +	u64 sctlr, tcr, tg, ps, ia_bits, ttbr;
> > > +	unsigned int stride, x;
> > > +	bool va55, tbi, lva, as_el0;
> > > +
> > > +	wi->regime = compute_translation_regime(vcpu, op);
> > > +	as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);
> > > +
> > > +	va55 = va & BIT(55);
> > > +
> > > +	if (wi->regime == TR_EL2 && va55)
> > > +		goto addrsz;
> > > +
> > > +	wi->s2 = (wi->regime == TR_EL10 &&
> > > +		  (__vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC)));
> > 
> > This could be written on one line if there were a local variable for the HCR_EL2
> > register (which is already read multiple times in the function).
> 
> Sure thing.
> 
> [...]
> 
> > > +	/* Let's put the MMU disabled case aside immediately */
> > > +	switch (wi->regime) {
> > > +	case TR_EL10:
> > > +		/*
> > > +		 * If dealing with the EL1&0 translation regime, 3 things
> > > +		 * can disable the S1 translation:
> > > +		 *
> > > +		 * - HCR_EL2.DC = 1
> > > +		 * - HCR_EL2.{E2H,TGE} = {0,1}
> > > +		 * - SCTLR_EL1.M = 0
> > > +		 *
> > > +		 * The TGE part is interesting. If we have decided that this
> > > +		 * is EL1&0, then it means that either {E2H,TGE} == {1,0} or
> > > +		 * {0,x}, and we only need to test for TGE == 1.
> > > +		 */
> > > +		if (__vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_DC | HCR_TGE))
> > > +			wr->level = S1_MMU_DISABLED;
> > 
> > There's no need to fallthrough and check SCTLR_ELx.M if the MMU disabled check
> > here is true.
> 
> I'm not sure it makes the code more readable. But if you do, why not.
> 
> [...]
> 
> > > +	/* Someone was silly enough to encode TG0/TG1 differently */
> > > +	if (va55) {
> > > +		wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
> > > +		tg = FIELD_GET(TCR_TG1_MASK, tcr);
> > > +
> > > +		switch (tg << TCR_TG1_SHIFT) {
> > > +		case TCR_TG1_4K:
> > > +			wi->pgshift = 12;	 break;
> > > +		case TCR_TG1_16K:
> > > +			wi->pgshift = 14;	 break;
> > > +		case TCR_TG1_64K:
> > > +		default:	    /* IMPDEF: treat any other value as 64k */
> > > +			wi->pgshift = 16;	 break;
> > > +		}
> > 
> > Just a thought, wi->pgshift is used in several places to identify the guest page
> > size, might be useful to have something like PAGE_SHIFT_{4K,16K,64K}. That would
> > also make its usage consistent, because in some places wi->pgshift is compared
> > directly to 12, 14 or 16, in other places the page size is computed from
> > wi->pgshift and compared to SZ_4K, SZ_16K or SZ_64K.
> 
> I only found a single place where we compare wi->pgshift to a
> non-symbolic integer (as part of the R_YXNYW handling). Everywhere
> else, we use BIT(wi->pgshift) and compare it to SZ_*K. We moved away
> from the various PAGE_SHIFT_* macros some years ago, and I don't think
> we want them back.

Oh, I wasn't aware of that bit of history. No need to change the current code
then, it's readable enough.

> 
> What I can do is to convert the places where we init pgshift to use an
> explicit size using const_ilog2():
> 
> @@ -185,12 +188,12 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
>  
>  		switch (tg << TCR_TG1_SHIFT) {
>  		case TCR_TG1_4K:
> -			wi->pgshift = 12;	 break;
> +			wi->pgshift = const_ilog2(SZ_4K);	 break;
>  		case TCR_TG1_16K:
> -			wi->pgshift = 14;	 break;
> +			wi->pgshift = const_ilog2(SZ_16K);	 break;
>  		case TCR_TG1_64K:
>  		default:	    /* IMPDEF: treat any other value as 64k */
> -			wi->pgshift = 16;	 break;
> +			wi->pgshift = const_ilog2(SZ_64K);	 break;
>  		}
>  	} else {
>  		wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
> @@ -198,12 +201,12 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
>  
>  		switch (tg << TCR_TG0_SHIFT) {
>  		case TCR_TG0_4K:
> -			wi->pgshift = 12;	 break;
> +			wi->pgshift = const_ilog2(SZ_4K);	 break;
>  		case TCR_TG0_16K:
> -			wi->pgshift = 14;	 break;
> +			wi->pgshift = const_ilog2(SZ_16K);	 break;
>  		case TCR_TG0_64K:
>  		default:	    /* IMPDEF: treat any other value as 64k */
> -			wi->pgshift = 16;	 break;
> +			wi->pgshift = const_ilog2(SZ_64K);	 break;
>  		}
>  	}
>  
> @@ -212,7 +215,7 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
>  		if (wi->txsz > 39)
>  			goto transfault_l0;
>  	} else {
> -		if (wi->txsz > 48 || (wi->pgshift == 16 && wi->txsz > 47))
> +		if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47))
>  			goto transfault_l0;
>  	}
> 
> 
> > 
> > > +	} else {
> > > +		wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
> > > +		tg = FIELD_GET(TCR_TG0_MASK, tcr);
> > > +
> > > +		switch (tg << TCR_TG0_SHIFT) {
> > > +		case TCR_TG0_4K:
> > > +			wi->pgshift = 12;	 break;
> > > +		case TCR_TG0_16K:
> > > +			wi->pgshift = 14;	 break;
> > > +		case TCR_TG0_64K:
> > > +		default:	    /* IMPDEF: treat any other value as 64k */
> > > +			wi->pgshift = 16;	 break;
> > > +		}
> > > +	}
> > > +
> > > +	/* R_PLCGL, R_YXNYW */
> > > +	if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) {
> > > +		if (wi->txsz > 39)
> > > +			goto transfault_l0;
> > > +	} else {
> > > +		if (wi->txsz > 48 || (wi->pgshift == 16 && wi->txsz > 47))
> > > +			goto transfault_l0;
> > > +	}
> > > +
> > > +	/* R_GTJBY, R_SXWGM */
> > > +	switch (BIT(wi->pgshift)) {
> > > +	case SZ_4K:
> > > +		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT);
> > > +		lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS);
> > > +		break;
> > > +	case SZ_16K:
> > > +		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT);
> > > +		lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS);
> > > +		break;
> > > +	case SZ_64K:
> > > +		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52);
> > > +		break;
> > > +	}
> > > +
> > > +	if ((lva && wi->txsz < 12) || wi->txsz < 16)
> > > +		goto transfault_l0;
> > 
> > Let's assume lva = true, wi->txsz greater than 12, but smaller than 16, which is
> > architecturally allowed according to R_GTJBY and AArch64.S1MinTxSZ().
> > 
> > (lva && wi->txsz < 12) = false
> > wi->txsz < 16 = true
> > 
> > KVM treats it as a fault.
> 
> Gah... Fixed with:
> 
> @@ -231,7 +234,7 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
>  		break;
>  	}
>  
> -	if ((lva && wi->txsz < 12) || wi->txsz < 16)
> +	if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16))
>  		goto transfault_l0;
>  
>  	ia_bits = get_ia_size(wi);
> 
> Not that it has an impact yet, given that we don't support any of the
> 52bit stuff yet, but thanks for spotting this!

The change looks correct to me.

> 
> [...]
> 
> > > +	/* R_VPBBF */
> > > +	if (check_output_size(wi->baddr, wi))
> > > +		goto transfault_l0;
> > 
> > I think R_VPBBF says that an Address size fault is generated here, not a
> > translation fault.
> 
> Indeed, another one fixed.
> 
> > 
> > > +
> > > +	wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);
> > > +
> > > +	return 0;
> > > +
> > > +addrsz:				/* Address Size Fault level 0 */
> > > +	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false, false);
> > > +	return -EFAULT;
> > > +
> > > +transfault_l0:			/* Translation Fault level 0 */
> > > +	fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false, false);
> > > +	return -EFAULT;
> > > +}
> > 
> > [..]
> > 
> > > +static bool par_check_s1_perm_fault(u64 par)
> > > +{
> > > +	u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
> > > +
> > > +	return  ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM &&
> > > +		 !(par & SYS_PAR_EL1_S));
> > 
> > ESR_ELx_FSC_PERM = 0x0c is a permission fault, level 0, which Arm ARM says can
> > only happen when FEAT_LPA2. I think the code should check that the value for
> > PAR_EL1.FST is in the interval (ESR_ELx_FSC_PERM_L(0), ESR_ELx_FSC_PERM_L(3)].
> 
> I honestly don't want to second-guess the HW. If it reports something
> that is the wrong level, why should we trust the FSC at all?

Sorry, I should have been clearer.

It's not about the hardware reporting a fault on level 0 of the translation
tables, it's about the function returning false if the hardware reports a
permission fault on levels 1, 2 or 3 of the translation tables.

For example, on a permission fault on level 3, PAR_EL1.FST = 0b001111 = 0x0F,
which means that the condition:

(fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM (which is 0x0C) is false and KVM
will fall back to the software walker.

Does that make sense to you?

Thanks,
Alex
Marc Zyngier Aug. 16, 2024, 10:37 a.m. UTC | #5
Hi Alex,

On Fri, 16 Aug 2024 10:22:43 +0100,
Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> 
> Hi Marc,
> 
> On Thu, Aug 15, 2024 at 07:28:41PM +0100, Marc Zyngier wrote:
> > 
> > Hi Alex,
> > 
> > On Thu, 15 Aug 2024 17:44:02 +0100,
> > Alexandru Elisei <alexandru.elisei@arm.com> wrote:

[...]

> > > > +static bool par_check_s1_perm_fault(u64 par)
> > > > +{
> > > > +	u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
> > > > +
> > > > +	return  ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM &&
> > > > +		 !(par & SYS_PAR_EL1_S));
> > > 
> > > ESR_ELx_FSC_PERM = 0x0c is a permission fault, level 0, which Arm ARM says can
> > > only happen when FEAT_LPA2. I think the code should check that the value for
> > > PAR_EL1.FST is in the interval (ESR_ELx_FSC_PERM_L(0), ESR_ELx_FSC_PERM_L(3)].
> > 
> > I honestly don't want to second-guess the HW. If it reports something
> > that is the wrong level, why should we trust the FSC at all?
> 
> Sorry, I should have been clearer.
> 
> It's not about the hardware reporting a fault on level 0 of the translation
> tables, it's about the function returning false if the hardware reports a
> permission fault on levels 1, 2 or 3 of the translation tables.
> 
> For example, on a permssion fault on level 3, PAR_EL1. FST = 0b001111 = 0x0F,
> which means that the condition:
> 
> (fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM (which is 0x0C) is false and KVM
> will fall back to the software walker.
> 
> Does that make sense to you?

I'm afraid I still don't get it.

From the kernel source:

#define ESR_ELx_FSC_TYPE	(0x3C)

This is a mask covering all fault types.

#define ESR_ELx_FSC_PERM	(0x0C)

This is the value for a permission fault, not encoding a level.

Taking your example:

(fst & ESR_ELx_FSC_TYPE) == (0x0F & 0x3C) == 0x0C == ESR_ELx_FSC_PERM

As I read it, the condition is true, as it catches a permission fault
on any level between 0 and 3.

You're obviously seeing something I don't, and I'm starting to
question my own sanity...

Thanks,

	M.
Alexandru Elisei Aug. 16, 2024, 11:02 a.m. UTC | #6
Hi Marc,

On Fri, Aug 16, 2024 at 11:37:24AM +0100, Marc Zyngier wrote:
> Hi Alex,
> 
> On Fri, 16 Aug 2024 10:22:43 +0100,
> Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> > 
> > Hi Marc,
> > 
> > On Thu, Aug 15, 2024 at 07:28:41PM +0100, Marc Zyngier wrote:
> > > 
> > > Hi Alex,
> > > 
> > > On Thu, 15 Aug 2024 17:44:02 +0100,
> > > Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> 
> [...]
> 
> > > > > +static bool par_check_s1_perm_fault(u64 par)
> > > > > +{
> > > > > +	u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
> > > > > +
> > > > > +	return  ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM &&
> > > > > +		 !(par & SYS_PAR_EL1_S));
> > > > 
> > > > ESR_ELx_FSC_PERM = 0x0c is a permission fault, level 0, which Arm ARM says can
> > > > only happen when FEAT_LPA2. I think the code should check that the value for
> > > > PAR_EL1.FST is in the interval (ESR_ELx_FSC_PERM_L(0), ESR_ELx_FSC_PERM_L(3)].
> > > 
> > > I honestly don't want to second-guess the HW. If it reports something
> > > that is the wrong level, why should we trust the FSC at all?
> > 
> > Sorry, I should have been clearer.
> > 
> > It's not about the hardware reporting a fault on level 0 of the translation
> > tables, it's about the function returning false if the hardware reports a
> > permission fault on levels 1, 2 or 3 of the translation tables.
> > 
> > For example, on a permssion fault on level 3, PAR_EL1. FST = 0b001111 = 0x0F,
> > which means that the condition:
> > 
> > (fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM (which is 0x0C) is false and KVM
> > will fall back to the software walker.
> > 
> > Does that make sense to you?
> 
> I'm afraid I still don't get it.
> 
> From the kernel source:
> 
> #define ESR_ELx_FSC_TYPE	(0x3C)
> 
> This is a mask covering all fault types.
> 
> #define ESR_ELx_FSC_PERM	(0x0C)
> 
> This is the value for a permission fault, not encoding a level.
> 
> Taking your example:
> 
> (fst & ESR_ELx_FSC_TYPE) == (0x0F & 0x3C) == 0x0C == ESR_ELx_FSC_PERM
> 
> As I read it, the condition is true, as it catches a permission fault
> on any level between 0 and 3.
> 
> You're obviously seeing something I don't, and I'm starting to
> question my own sanity...

No, no, sorry for leading you on a wild goose chase, I read 0x3F for
ESR_ELx_FSC_TYPE, which is the value for the variable directly above it, instead of
0x3C :(

My bad, the code is correct!

Thanks,
Alex
Marc Zyngier Aug. 16, 2024, 1:44 p.m. UTC | #7
On Fri, 16 Aug 2024 12:02:37 +0100,
Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> 
> Hi Marc,
> 
> On Fri, Aug 16, 2024 at 11:37:24AM +0100, Marc Zyngier wrote:
> > Hi Alex,
> > 
> > On Fri, 16 Aug 2024 10:22:43 +0100,
> > Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> > > 
> > > Hi Marc,
> > > 
> > > On Thu, Aug 15, 2024 at 07:28:41PM +0100, Marc Zyngier wrote:
> > > > 
> > > > Hi Alex,
> > > > 
> > > > On Thu, 15 Aug 2024 17:44:02 +0100,
> > > > Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> > 
> > [...]
> > 
> > > > > > +static bool par_check_s1_perm_fault(u64 par)
> > > > > > +{
> > > > > > +	u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
> > > > > > +
> > > > > > +	return  ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM &&
> > > > > > +		 !(par & SYS_PAR_EL1_S));
> > > > > 
> > > > > ESR_ELx_FSC_PERM = 0x0c is a permission fault, level 0, which Arm ARM says can
> > > > > only happen when FEAT_LPA2. I think the code should check that the value for
> > > > > PAR_EL1.FST is in the interval (ESR_ELx_FSC_PERM_L(0), ESR_ELx_FSC_PERM_L(3)].
> > > > 
> > > > I honestly don't want to second-guess the HW. If it reports something
> > > > that is the wrong level, why should we trust the FSC at all?
> > > 
> > > Sorry, I should have been clearer.
> > > 
> > > It's not about the hardware reporting a fault on level 0 of the translation
> > > tables, it's about the function returning false if the hardware reports a
> > > permission fault on levels 1, 2 or 3 of the translation tables.
> > > 
> > > For example, on a permssion fault on level 3, PAR_EL1. FST = 0b001111 = 0x0F,
> > > which means that the condition:
> > > 
> > > (fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM (which is 0x0C) is false and KVM
> > > will fall back to the software walker.
> > > 
> > > Does that make sense to you?
> > 
> > I'm afraid I still don't get it.
> > 
> > From the kernel source:
> > 
> > #define ESR_ELx_FSC_TYPE	(0x3C)
> > 
> > This is a mask covering all fault types.
> > 
> > #define ESR_ELx_FSC_PERM	(0x0C)
> > 
> > This is the value for a permission fault, not encoding a level.
> > 
> > Taking your example:
> > 
> > (fst & ESR_ELx_FSC_TYPE) == (0x0F & 0x3C) == 0x0C == ESR_ELx_FSC_PERM
> > 
> > As I read it, the condition is true, as it catches a permission fault
> > on any level between 0 and 3.
> > 
> > You're obviously seeing something I don't, and I'm starting to
> > question my own sanity...
> 
> No, no, sorry for leading you on a wild goose chase, I read 0x3F for
> ESR_ELx_FSC_TYPE, which the value for the variable directly above it, instead of
> 0x3C :(
> 
> My bad, the code is correct!

Ah, glad we agree, I was starting to worry!

Thanks,

	M.

Patch

diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index 9865d29b3149..6d5555e98557 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -4,9 +4,405 @@ 
  * Author: Jintack Lim <jintack.lim@linaro.org>
  */
 
+#include <linux/kvm_host.h>
+
+#include <asm/esr.h>
 #include <asm/kvm_hyp.h>
 #include <asm/kvm_mmu.h>
 
+enum trans_regime {
+	TR_EL10,
+	TR_EL20,
+	TR_EL2,
+};
+
+struct s1_walk_info {
+	u64	     		baddr;
+	enum trans_regime	regime;
+	unsigned int		max_oa_bits;
+	unsigned int		pgshift;
+	unsigned int		txsz;
+	int 	     		sl;
+	bool	     		hpd;
+	bool	     		be;
+	bool	     		s2;
+};
+
+struct s1_walk_result {
+	union {
+		struct {
+			u64	desc;
+			u64	pa;
+			s8	level;
+			u8	APTable;
+			bool	UXNTable;
+			bool	PXNTable;
+		};
+		struct {
+			u8	fst;
+			bool	ptw;
+			bool	s2;
+		};
+	};
+	bool	failed;
+};
+
+static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2)
+{
+	wr->fst		= fst;
+	wr->ptw		= ptw;
+	wr->s2		= s2;
+	wr->failed	= true;
+}
+
+#define S1_MMU_DISABLED		(-127)
+
+static int get_ia_size(struct s1_walk_info *wi)
+{
+	return 64 - wi->txsz;
+}
+
+/* Return true if the IPA is out of the OA range */
+static bool check_output_size(u64 ipa, struct s1_walk_info *wi)
+{
+	return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits));
+}
+
+/* Return the translation regime that applies to an AT instruction */
+static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op)
+{
+	/*
+	 * We only get here from guest EL2, so the translation
+	 * regime AT applies to is solely defined by {E2H,TGE}.
+	 */
+	switch (op) {
+	case OP_AT_S1E2R:
+	case OP_AT_S1E2W:
+		return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
+		break;
+	default:
+		return (vcpu_el2_e2h_is_set(vcpu) &&
+			vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10;
+	}
+}
+
+static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
+			 struct s1_walk_result *wr, u64 va)
+{
+	u64 sctlr, tcr, tg, ps, ia_bits, ttbr;
+	unsigned int stride, x;
+	bool va55, tbi, lva, as_el0;
+
+	wi->regime = compute_translation_regime(vcpu, op);
+	as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);
+
+	va55 = va & BIT(55);
+
+	if (wi->regime == TR_EL2 && va55)
+		goto addrsz;
+
+	wi->s2 = (wi->regime == TR_EL10 &&
+		  (__vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC)));
+
+	switch (wi->regime) {
+	case TR_EL10:
+		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL1);
+		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL1);
+		ttbr	= (va55 ?
+			   vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
+			   vcpu_read_sys_reg(vcpu, TTBR0_EL1));
+		break;
+	case TR_EL2:
+	case TR_EL20:
+		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL2);
+		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL2);
+		ttbr	= (va55 ?
+			   vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
+			   vcpu_read_sys_reg(vcpu, TTBR0_EL2));
+		break;
+	default:
+		BUG();
+	}
+
+	tbi = (wi->regime == TR_EL2 ?
+	       FIELD_GET(TCR_EL2_TBI, tcr) :
+	       (va55 ?
+		FIELD_GET(TCR_TBI1, tcr) :
+		FIELD_GET(TCR_TBI0, tcr)));
+
+	if (!tbi && (u64)sign_extend64(va, 55) != va)
+		goto addrsz;
+
+	va = (u64)sign_extend64(va, 55);
+
+	/* Let's put the MMU disabled case aside immediately */
+	switch (wi->regime) {
+	case TR_EL10:
+		/*
+		 * If dealing with the EL1&0 translation regime, 3 things
+		 * can disable the S1 translation:
+		 *
+		 * - HCR_EL2.DC = 1
+		 * - HCR_EL2.{E2H,TGE} = {0,1}
+		 * - SCTLR_EL1.M = 0
+		 *
+		 * The TGE part is interesting. If we have decided that this
+		 * is EL1&0, then it means that either {E2H,TGE} == {1,0} or
+		 * {0,x}, and we only need to test for TGE == 1.
+		 */
+		if (__vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_DC | HCR_TGE))
+			wr->level = S1_MMU_DISABLED;
+		fallthrough;
+	case TR_EL2:
+	case TR_EL20:
+		if (!(sctlr & SCTLR_ELx_M))
+			wr->level = S1_MMU_DISABLED;
+		break;
+	}
+
+	if (wr->level == S1_MMU_DISABLED) {
+		if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
+			goto addrsz;
+
+		wr->pa = va;
+		return 0;
+	}
+
+	wi->be = sctlr & SCTLR_ELx_EE;
+
+	wi->hpd  = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP);
+	wi->hpd &= (wi->regime == TR_EL2 ?
+		    FIELD_GET(TCR_EL2_HPD, tcr) :
+		    (va55 ?
+		     FIELD_GET(TCR_HPD1, tcr) :
+		     FIELD_GET(TCR_HPD0, tcr)));
+
+	/* Someone was silly enough to encode TG0/TG1 differently */
+	if (va55) {
+		wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
+		tg = FIELD_GET(TCR_TG1_MASK, tcr);
+
+		switch (tg << TCR_TG1_SHIFT) {
+		case TCR_TG1_4K:
+			wi->pgshift = 12;	 break;
+		case TCR_TG1_16K:
+			wi->pgshift = 14;	 break;
+		case TCR_TG1_64K:
+		default:	    /* IMPDEF: treat any other value as 64k */
+			wi->pgshift = 16;	 break;
+		}
+	} else {
+		wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
+		tg = FIELD_GET(TCR_TG0_MASK, tcr);
+
+		switch (tg << TCR_TG0_SHIFT) {
+		case TCR_TG0_4K:
+			wi->pgshift = 12;	 break;
+		case TCR_TG0_16K:
+			wi->pgshift = 14;	 break;
+		case TCR_TG0_64K:
+		default:	    /* IMPDEF: treat any other value as 64k */
+			wi->pgshift = 16;	 break;
+		}
+	}
+
+	/* R_PLCGL, R_YXNYW */
+	if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) {
+		if (wi->txsz > 39)
+			goto transfault_l0;
+	} else {
+		if (wi->txsz > 48 || (wi->pgshift == 16 && wi->txsz > 47))
+			goto transfault_l0;
+	}
+
+	/* R_GTJBY, R_SXWGM */
+	switch (BIT(wi->pgshift)) {
+	case SZ_4K:
+		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT);
+		lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS);
+		break;
+	case SZ_16K:
+		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT);
+		lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS);
+		break;
+	case SZ_64K:
+		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52);
+		break;
+	}
+
+	if ((lva && wi->txsz < 12) || wi->txsz < 16)
+		goto transfault_l0;
+
+	ia_bits = get_ia_size(wi);
+
+	/* R_YYVYV, I_THCZK */
+	if ((!va55 && va > GENMASK(ia_bits - 1, 0)) ||
+	    (va55 && va < GENMASK(63, ia_bits)))
+		goto transfault_l0;
+
+	/* I_ZFSYQ */
+	if (wi->regime != TR_EL2 &&
+	    (tcr & ((va55) ? TCR_EPD1_MASK : TCR_EPD0_MASK)))
+		goto transfault_l0;
+
+	/* R_BNDVG and following statements */
+	if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) &&
+	    as_el0 && (tcr & ((va55) ? TCR_E0PD1 : TCR_E0PD0)))
+		goto transfault_l0;
+
+	/* AArch64.S1StartLevel() */
+	stride = wi->pgshift - 3;
+	wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);
+
+	ps = (wi->regime == TR_EL2 ?
+	      FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));
+
+	wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps));
+
+	/* Compute minimal alignment */
+	x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);
+
+	wi->baddr = ttbr & TTBRx_EL1_BADDR;
+
+	/* R_VPBBF */
+	if (check_output_size(wi->baddr, wi))
+		goto transfault_l0;
+
+	wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);
+
+	return 0;
+
+addrsz:				/* Address Size Fault level 0 */
+	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false, false);
+	return -EFAULT;
+
+transfault_l0:			/* Translation Fault level 0 */
+	fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false, false);
+	return -EFAULT;
+}
+
+static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
+		   struct s1_walk_result *wr, u64 va)
+{
+	u64 va_top, va_bottom, baddr, desc;
+	int level, stride, ret;
+
+	level = wi->sl;
+	stride = wi->pgshift - 3;
+	baddr = wi->baddr;
+
+	va_top = get_ia_size(wi) - 1;
+
+	while (1) {
+		u64 index, ipa;
+
+		va_bottom = (3 - level) * stride + wi->pgshift;
+		index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);
+
+		ipa = baddr | index;
+
+		if (wi->s2) {
+			struct kvm_s2_trans s2_trans = {};
+
+			ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans);
+			if (ret) {
+				fail_s1_walk(wr,
+					     (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level,
+					     true, true);
+				return ret;
+			}
+
+			if (!kvm_s2_trans_readable(&s2_trans)) {
+				fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level),
+					     true, true);
+
+				return -EPERM;
+			}
+
+			ipa = kvm_s2_trans_output(&s2_trans);
+		}
+
+		ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc));
+		if (ret) {
+			fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level),
+				     true, false);
+			return ret;
+		}
+
+		if (wi->be)
+			desc = be64_to_cpu((__force __be64)desc);
+		else
+			desc = le64_to_cpu((__force __le64)desc);
+
+		/* Invalid descriptor */
+		if (!(desc & BIT(0)))
+			goto transfault;
+
+		/* Block mapping, check validity down the line */
+		if (!(desc & BIT(1)))
+			break;
+
+		/* Page mapping */
+		if (level == 3)
+			break;
+
+		/* Table handling */
+		if (!wi->hpd) {
+			wr->APTable  |= FIELD_GET(S1_TABLE_AP, desc);
+			wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);
+			wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc);
+		}
+
+		baddr = desc & GENMASK_ULL(47, wi->pgshift);
+
+		/* Check for out-of-range OA */
+		if (check_output_size(baddr, wi))
+			goto addrsz;
+
+		/* Prepare for next round */
+		va_top = va_bottom - 1;
+		level++;
+	}
+
+	/* Block mapping, check the validity of the level */
+	if (!(desc & BIT(1))) {
+		bool valid_block = false;
+
+		switch (BIT(wi->pgshift)) {
+		case SZ_4K:
+			valid_block = level == 1 || level == 2;
+			break;
+		case SZ_16K:
+		case SZ_64K:
+			valid_block = level == 2;
+			break;
+		}
+
+		if (!valid_block)
+			goto transfault;
+	}
+
+	if (check_output_size(desc & GENMASK(47, va_bottom), wi))
+		goto addrsz;
+
+	va_bottom += contiguous_bit_shift(desc, wi, level);
+
+	wr->failed = false;
+	wr->level = level;
+	wr->desc = desc;
+	wr->pa = desc & GENMASK(47, va_bottom);
+	wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0);
+
+	return 0;
+
+addrsz:
+	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), true, false);
+	return -EINVAL;
+transfault:
+	fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), true, false);
+	return -ENOENT;
+}
+
 struct mmu_config {
 	u64	ttbr0;
 	u64	ttbr1;
@@ -188,6 +584,16 @@  static u8 compute_sh(u8 attr, u64 desc)
 	return sh;
 }
 
+static u8 combine_sh(u8 s1_sh, u8 s2_sh)
+{
+	if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH)
+		return ATTR_OSH;
+	if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH)
+		return ATTR_ISH;
+
+	return ATTR_NSH;
+}
+
 static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
 			   struct kvm_s2_trans *tr)
 {
@@ -260,11 +666,185 @@  static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
 	par  = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr);
 	par |= tr->output & GENMASK(47, 12);
 	par |= FIELD_PREP(SYS_PAR_EL1_SH,
-			  compute_sh(final_attr, tr->desc));
+			  combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par),
+				     compute_sh(final_attr, tr->desc)));
 
 	return par;
 }
 
+static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr,
+			  enum trans_regime regime)
+{
+	u64 par;
+
+	if (wr->failed) {
+		par = SYS_PAR_EL1_RES1;
+		par |= SYS_PAR_EL1_F;
+		par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst);
+		par |= wr->ptw ? SYS_PAR_EL1_PTW : 0;
+		par |= wr->s2 ? SYS_PAR_EL1_S : 0;
+	} else if (wr->level == S1_MMU_DISABLED) {
+		/* MMU off or HCR_EL2.DC == 1 */
+		par  = SYS_PAR_EL1_NSE;
+		par |= wr->pa & GENMASK_ULL(47, 12);
+
+		if (regime == TR_EL10 &&
+		    (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
+			par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
+					  MEMATTR(WbRaWa, WbRaWa));
+			par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_NSH);
+		} else {
+			par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */
+			par |= FIELD_PREP(SYS_PAR_EL1_SH, ATTR_OSH);
+		}
+	} else {
+		u64 mair, sctlr;
+		u8 sh;
+
+		par  = SYS_PAR_EL1_NSE;
+
+		mair = (regime == TR_EL10 ?
+			vcpu_read_sys_reg(vcpu, MAIR_EL1) :
+			vcpu_read_sys_reg(vcpu, MAIR_EL2));
+
+		mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
+		mair &= 0xff;
+
+		sctlr = (regime == TR_EL10 ?
+			 vcpu_read_sys_reg(vcpu, SCTLR_EL1) :
+			 vcpu_read_sys_reg(vcpu, SCTLR_EL2));
+
+		/* Force NC for memory if SCTLR_ELx.C is clear */
+		if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair))
+			mair = MEMATTR(NC, NC);
+
+		par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
+		par |= wr->pa & GENMASK_ULL(47, 12);
+
+		sh = compute_sh(mair, wr->desc);
+		par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
+	}
+
+	return par;
+}
+
+static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+{
+	bool perm_fail, ur, uw, ux, pr, pw, px;
+	struct s1_walk_result wr = {};
+	struct s1_walk_info wi = {};
+	int ret, idx;
+
+	ret = setup_s1_walk(vcpu, op, &wi, &wr, vaddr);
+	if (ret)
+		goto compute_par;
+
+	if (wr.level == S1_MMU_DISABLED)
+		goto compute_par;
+
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+	ret = walk_s1(vcpu, &wi, &wr, vaddr);
+
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+	if (ret)
+		goto compute_par;
+
+	/* FIXME: revisit when adding indirect permission support */
+	/* AArch64.S1DirectBasePermissions() */
+	if (wi.regime != TR_EL2) {
+		switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr.desc)) {
+		case 0b00:
+			pr = pw = true;
+			ur = uw = false;
+			break;
+		case 0b01:
+			pr = pw = ur = uw = true;
+			break;
+		case 0b10:
+			pr = true;
+			pw = ur = uw = false;
+			break;
+		case 0b11:
+			pr = ur = true;
+			pw = uw = false;
+			break;
+		}
+
+		switch (wr.APTable) {
+		case 0b00:
+			break;
+		case 0b01:
+			ur = uw = false;
+			break;
+		case 0b10:
+			pw = uw = false;
+			break;
+		case 0b11:
+			pw = ur = uw = false;
+			break;
+		}
+
+		/* We don't use px for anything yet, but hey... */
+		px = !((wr.desc & PTE_PXN) || wr.PXNTable || pw);
+		ux = !((wr.desc & PTE_UXN) || wr.UXNTable);
+
+		if (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) {
+			bool pan;
+
+			pan = *vcpu_cpsr(vcpu) & PSR_PAN_BIT;
+			pan &= ur || uw;
+			pw &= !pan;
+			pr &= !pan;
+		}
+	} else {
+		ur = uw = ux = false;
+
+		if (!(wr.desc & PTE_RDONLY)) {
+			pr = pw = true;
+		} else {
+			pr = true;
+			pw = false;
+		}
+
+		if (wr.APTable & BIT(1))
+			pw = false;
+
+		/* XN maps to UXN */
+		px = !((wr.desc & PTE_UXN) || wr.UXNTable);
+	}
+
+	perm_fail = false;
+
+	switch (op) {
+	case OP_AT_S1E1RP:
+	case OP_AT_S1E1R:
+	case OP_AT_S1E2R:
+		perm_fail = !pr;
+		break;
+	case OP_AT_S1E1WP:
+	case OP_AT_S1E1W:
+	case OP_AT_S1E2W:
+		perm_fail = !pw;
+		break;
+	case OP_AT_S1E0R:
+		perm_fail = !ur;
+		break;
+	case OP_AT_S1E0W:
+		perm_fail = !uw;
+		break;
+	default:
+		BUG();
+	}
+
+	if (perm_fail)
+		fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false, false);
+
+compute_par:
+	return compute_par_s1(vcpu, &wr, wi.regime);
+}
+
 /*
  * Return the PAR_EL1 value as the result of a valid translation.
  *
@@ -352,10 +932,29 @@  static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
 	return par;
 }
 
+static bool par_check_s1_perm_fault(u64 par)
+{
+	u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
+
+	return  ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM &&
+		 !(par & SYS_PAR_EL1_S));
+}
+
 void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
 {
 	u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);
 
+	/*
+	 * If PAR_EL1 reports that AT failed on a S1 permission fault, we
+	 * know for sure that the PTW was able to walk the S1 tables and
+	 * there's nothing else to do.
+	 *
+	 * If AT failed for any other reason, then we must walk the guest S1
+	 * to emulate the instruction.
+	 */
+	if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
+		par = handle_at_slow(vcpu, op, vaddr);
+
 	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
 }
 
@@ -407,6 +1006,10 @@  void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
 		isb();
 	}
 
+	/* We failed the translation, let's replay it in slow motion */
+	if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
+		par = handle_at_slow(vcpu, op, vaddr);
+
 	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
 }
 
@@ -463,7 +1066,7 @@  void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
 	/* Check the access permission */
 	if (!out.esr &&
 	    ((!write && !out.readable) || (write && !out.writable)))
-		out.esr = ESR_ELx_FSC_PERM | (out.level & 0x3);
+		out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3);
 
 	par = compute_par_s12(vcpu, par, &out);
 	vcpu_write_sys_reg(vcpu, par, PAR_EL1);