[v2,13/17] KVM: arm64: nv: Add SW walker for AT S1 emulation

Message ID 20240731194030.1991237-14-maz@kernel.org (mailing list archive)
State New, archived
Series KVM: arm64: nv: Add support for address translation instructions

Commit Message

Marc Zyngier July 31, 2024, 7:40 p.m. UTC
In order to plug the brokenness of our current AT implementation,
we need a SW walker that is going to... err.. walk the S1 tables
and tell us what it finds.

Of course, it builds on top of our S2 walker, and shares similar
concepts. The beauty of it is that since it uses kvm_read_guest(),
it is able to bring back pages that have been otherwise evicted.

This is then plugged into the two AT S1 emulation functions as
a "slow path" fallback. I'm not sure it is that slow, but hey.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/at.c | 567 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 565 insertions(+), 2 deletions(-)

Comments

Alexandru Elisei Aug. 9, 2024, 12:43 p.m. UTC | #1
Hi Marc,

Finally managed to go through the patch. Bunch of nitpicks from me (can be
safely ignored), and some corner cases where KVM deviates from the spec.

On Wed, Jul 31, 2024 at 08:40:26PM +0100, Marc Zyngier wrote:
> In order to plug the brokenness of our current AT implementation,
> we need a SW walker that is going to... err.. walk the S1 tables
> and tell us what it finds.
> 
> Of course, it builds on top of our S2 walker, and shares similar
> concepts. The beauty of it is that since it uses kvm_read_guest(),
> it is able to bring back pages that have been otherwise evicted.
> 
> This is then plugged into the two AT S1 emulation functions as
> a "slow path" fallback. I'm not sure it is that slow, but hey.
> 
> Signed-off-by: Marc Zyngier <maz@kernel.org>
> ---
>  arch/arm64/kvm/at.c | 567 +++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 565 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> index 9865d29b3149..8e1f0837e309 100644
> --- a/arch/arm64/kvm/at.c
> +++ b/arch/arm64/kvm/at.c
> @@ -4,9 +4,372 @@
>   * Author: Jintack Lim <jintack.lim@linaro.org>
>   */
>  
> +#include <linux/kvm_host.h>
> +
> +#include <asm/esr.h>
>  #include <asm/kvm_hyp.h>
>  #include <asm/kvm_mmu.h>
>  
> +enum trans_regime {
> +	TR_EL10,
> +	TR_EL20,
> +	TR_EL2,
> +};
> +
> +struct s1_walk_info {
> +	u64	     		baddr;
> +	enum trans_regime	regime;
> +	unsigned int		max_oa_bits;
> +	unsigned int		pgshift;
> +	unsigned int		txsz;
> +	int 	     		sl;
> +	bool	     		hpd;
> +	bool	     		be;
> +	bool	     		s2;
> +};
> +
> +struct s1_walk_result {
> +	union {
> +		struct {
> +			u64	desc;
> +			u64	pa;
> +			s8	level;
> +			u8	APTable;
> +			bool	UXNTable;
> +			bool	PXNTable;
> +		};
> +		struct {
> +			u8	fst;
> +			bool	ptw;
> +			bool	s2;
> +		};
> +	};
> +	bool	failed;
> +};
> +
> +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2)
> +{
> +	wr->fst		= fst;
> +	wr->ptw		= ptw;
> +	wr->s2		= s2;
> +	wr->failed	= true;
> +}
> +
> +#define S1_MMU_DISABLED		(-127)
> +
> +static int get_ia_size(struct s1_walk_info *wi)
> +{
> +	return 64 - wi->txsz;
> +}
> +
> +/* return true of the IPA is out of the OA range */

*R*eturn true *if* the IPA is out of the OA range?

> +static bool check_output_size(u64 ipa, struct s1_walk_info *wi)
> +{
> +	return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits));
> +}

Matches AArch64.OAOutOfRange(), where KVM supports a maximum oasize of 48 bits,
and AArch64.PAMax() is get_kvm_ipa_limit().

> +
> +/* Return the translation regime that applies to an AT instruction */
> +static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op)
> +{
> +	/*
> +	 * We only get here from guest EL2, so the translation
> +	 * regime AT applies to is solely defined by {E2H,TGE}.
> +	 */
> +	switch (op) {
> +	case OP_AT_S1E2R:
> +	case OP_AT_S1E2W:
> +		return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;

This matches the pseudocode for the instructions, which calls
AArch64.AT(el_in=EL2). AT(el_in=EL2) calls TranslationRegime(el=EL2), which
returns Regime_EL20 if E2H is set (in ELIsInHost(el=EL2)), otherwise Regime_EL2.


> +		break;
> +	default:
> +		return (vcpu_el2_e2h_is_set(vcpu) &&
> +			vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10;

This also looks correct to me. Following the pseudocode was not trivial, so I'm
leaving this here in case I made a mistake somewhere.

For the S1E0* variants: AArch64.AT(el_in=EL0) => TranslationRegime(el=EL0) =>
ELIsInHost(el=EL0); ELIsInHost(el=EL0) is true if {E2H,TGE} = {1,1}, and in this
case TranslationRegime(el=EL0) returns Regime_EL20, otherwise Regime_EL10.

For the S1E1* variants: AArch64.AT(el_in=EL1), where:

- if ELIsInHost(el=EL0) is true, then 'el' is changed to EL2, where
  ELIsInHost(el=EL0) is true if {E2H,TGE} = {1,1}. In this case,
  TranslationRegime(el=EL2) will return Regime_EL20.

- if ELIsInHost(el=EL0) is false, then 'el' remains EL1, and
  TranslationRegime(el=EL1) returns Regime_EL10.
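
To summarise my reading (for reference only, not the architected definition):

	AT S1E2*:        E2H == 1           -> Regime_EL20, otherwise Regime_EL2
	AT S1E0*/S1E1*:  {E2H,TGE} == {1,1} -> Regime_EL20, otherwise Regime_EL10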

> +	}
> +}
> +
> +static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
> +			 struct s1_walk_result *wr, u64 va)
> +{
> +	u64 sctlr, tcr, tg, ps, ia_bits, ttbr;
> +	unsigned int stride, x;
> +	bool va55, tbi, lva, as_el0;
> +
> +	wi->regime = compute_translation_regime(vcpu, op);
> +	as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);
> +
> +	va55 = va & BIT(55);
> +
> +	if (wi->regime == TR_EL2 && va55)
> +		goto addrsz;
> +
> +	wi->s2 = wi->regime == TR_EL10 && (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_VM);

According to AArch64.NSS2TTWParams(), stage 2 is enabled if HCR_EL2.VM or
HCR_EL2.DC.

From the description of HCR_EL2.DC, when the bit is set:

'The PE behaves as if the value of the HCR_EL2.VM field is 1 for all purposes
other than returning the value of a direct read of HCR_EL2.'
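
Perhaps something like this (untested, just to illustrate the suggestion):

	wi->s2 = wi->regime == TR_EL10 &&
		 (__vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC));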

> +
> +	switch (wi->regime) {
> +	case TR_EL10:
> +		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL1);
> +		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL1);
> +		ttbr	= (va55 ?
> +			   vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
> +			   vcpu_read_sys_reg(vcpu, TTBR0_EL1));
> +		break;
> +	case TR_EL2:
> +	case TR_EL20:
> +		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL2);
> +		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL2);
> +		ttbr	= (va55 ?
> +			   vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
> +			   vcpu_read_sys_reg(vcpu, TTBR0_EL2));
> +		break;
> +	default:
> +		BUG();
> +	}
> +
> +	tbi = (wi->regime == TR_EL2 ?
> +	       FIELD_GET(TCR_EL2_TBI, tcr) :
> +	       (va55 ?
> +		FIELD_GET(TCR_TBI1, tcr) :
> +		FIELD_GET(TCR_TBI0, tcr)));
> +
> +	if (!tbi && (u64)sign_extend64(va, 55) != va)
> +		goto addrsz;
> +
> +	va = (u64)sign_extend64(va, 55);
> +
> +	/* Let's put the MMU disabled case aside immediately */
> +	if (!(sctlr & SCTLR_ELx_M) ||
> +	    (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {

I think the condition doesn't match AArch64.S1Enabled():

- if regime is Regime_EL20 or Regime_EL2, then S1 is disabled if and only if
  SCTLR_EL2.M is not set; it doesn't matter if HCR_EL2.DC is set, because "[..]
  this field has no effect on the EL2, EL2&0, and EL3 translation regimes."
  (HCR_EL2.DC bit field description).

- if regime is Regime_EL10, then S1 is disabled if SCTLR_EL1.M == 0 or
  HCR_EL2.TGE = 1 or the effective value of HCR_EL2.DC* is 1.

From the description of HCR_EL2.TGE, when the bit is set:

'If EL1 is using AArch64, the SCTLR_EL1.M field is treated as being 0 for all
purposes other than returning the result of a direct read of SCTLR_EL1.'

From the description of HCR_EL2.DC, when the bit is set:

'When EL1 is using AArch64, the PE behaves as if the value of the SCTLR_EL1.M
field is 0 for all purposes other than returning the value of a direct read of
SCTLR_EL1.'

*The description of HCR_EL2.DC states:

'When the Effective value of HCR_EL2.{E2H,TGE} is {1,1}, this field behaves as
0 for all purposes other than a direct read of the value of this field.'

But if {E2H,TGE} = {1,1} then the regime is Regime_EL20, which ignores the DC
bit.  If we're looking at the DC bit, then that means that the regime is EL10,
({E2H,TGE} != {1,1} in compute_translation_regime()) and the effective value of
HCR_EL2.DC is the same as the DC bit.

> +		if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
> +			goto addrsz;
> +
> +		wr->level = S1_MMU_DISABLED;
> +		wr->pa = va;
> +		return 0;
> +	}
> +
> +	wi->be = sctlr & SCTLR_ELx_EE;
> +
> +	wi->hpd  = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP);
> +	wi->hpd &= (wi->regime == TR_EL2 ?
> +		    FIELD_GET(TCR_EL2_HPD, tcr) :
> +		    (va55 ?
> +		     FIELD_GET(TCR_HPD1, tcr) :
> +		     FIELD_GET(TCR_HPD0, tcr)));
> +
> +	/* Someone was silly enough to encode TG0/TG1 differently */
> +	if (va55) {
> +		wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
> +		tg = FIELD_GET(TCR_TG1_MASK, tcr);
> +
> +		switch (tg << TCR_TG1_SHIFT) {
> +		case TCR_TG1_4K:
> +			wi->pgshift = 12;	 break;
> +		case TCR_TG1_16K:
> +			wi->pgshift = 14;	 break;
> +		case TCR_TG1_64K:
> +		default:	    /* IMPDEF: treat any other value as 64k */
> +			wi->pgshift = 16;	 break;
> +		}
> +	} else {
> +		wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
> +		tg = FIELD_GET(TCR_TG0_MASK, tcr);
> +
> +		switch (tg << TCR_TG0_SHIFT) {
> +		case TCR_TG0_4K:
> +			wi->pgshift = 12;	 break;
> +		case TCR_TG0_16K:
> +			wi->pgshift = 14;	 break;
> +		case TCR_TG0_64K:
> +		default:	    /* IMPDEF: treat any other value as 64k */
> +			wi->pgshift = 16;	 break;
> +		}
> +	}
> +
> +	/* R_PLCGL, R_YXNYW */
> +	if ((!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47) &&
> +	     wi->txsz > 39) ||
> +	    (wi->pgshift == 16 && wi->txsz > 47) || wi->txsz > 48)
> +		goto transfault_l0;

It's probably just me, but I find this hard to parse. There are three cases when
the condition (!FEAT_TTST && TxSZ > 39) evaluates to false. But the other two
conditions make sense to check only if !FEAT_TTST is false and wi->txsz > 39 is
true.

I find this easier to read:

	if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) {
		if (wi->txsz > 39)
			goto transfault_l0;
	} else {
		if (wi->txsz > 48 || (wi->pgshift == 16 && wi->txsz > 47))
			goto transfault_l0;
	}

What do you think?

> +
> +	/* R_GTJBY, R_SXWGM */
> +	switch (BIT(wi->pgshift)) {
> +	case SZ_4K:
> +		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT);
> +		lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS);
> +		break;
> +	case SZ_16K:
> +		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT);
> +		lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS);
> +		break;
> +	case SZ_64K:
> +		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52);
> +		break;
> +	}
> +
> +	if ((lva && wi->txsz < 12) || wi->txsz < 16)
> +		goto transfault_l0;
> +
> +	ia_bits = get_ia_size(wi);
> +
> +	/* R_YYVYV, I_THCZK */
> +	if ((!va55 && va > GENMASK(ia_bits - 1, 0)) ||
> +	    (va55 && va < GENMASK(63, ia_bits)))
> +		goto transfault_l0;
> +
> +	/* R_ZFSYQ */

This is rather pedantic, but I think that should be I_ZFSYQ.

> +	if (wi->regime != TR_EL2 &&
> +	    (tcr & ((va55) ? TCR_EPD1_MASK : TCR_EPD0_MASK)))
> +		goto transfault_l0;
> +
> +	/* R_BNDVG and following statements */
> +	if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) &&
> +	    as_el0 && (tcr & ((va55) ? TCR_E0PD1 : TCR_E0PD0)))
> +		goto transfault_l0;
> +
> +	/* AArch64.S1StartLevel() */
> +	stride = wi->pgshift - 3;
> +	wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);
> +
> +	ps = (wi->regime == TR_EL2 ?
> +	      FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));
> +
> +	wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps));
> +
> +	/* Compute minimal alignment */
> +	x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);
> +
> +	wi->baddr = ttbr & TTBRx_EL1_BADDR;
> +
> +	/* R_VPBBF */
> +	if (check_output_size(wi->baddr, wi))
> +		goto transfault_l0;
> +
> +	wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);
> +
> +	return 0;
> +
> +addrsz:				/* Address Size Fault level 0 */
> +	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false, false);
> +	return -EFAULT;
> +
> +transfault_l0:			/* Translation Fault level 0 */
> +	fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false, false);
> +	return -EFAULT;
> +}
> +
> +static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
> +		   struct s1_walk_result *wr, u64 va)
> +{
> +	u64 va_top, va_bottom, baddr, desc;
> +	int level, stride, ret;
> +
> +	level = wi->sl;
> +	stride = wi->pgshift - 3;
> +	baddr = wi->baddr;
> +
> +	va_top = get_ia_size(wi) - 1;
> +
> +	while (1) {
> +		u64 index, ipa;
> +
> +		va_bottom = (3 - level) * stride + wi->pgshift;
> +		index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);
> +
> +		ipa = baddr | index;
> +
> +		if (wi->s2) {
> +			struct kvm_s2_trans s2_trans = {};
> +
> +			ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans);
> +			if (ret) {
> +				fail_s1_walk(wr,
> +					     (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level,
> +					     true, true);
> +				return ret;
> +			}
> +
> +			if (!kvm_s2_trans_readable(&s2_trans)) {
> +				fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level),
> +					     true, true);
> +
> +				return -EPERM;
> +			}
> +
> +			ipa = kvm_s2_trans_output(&s2_trans);
> +		}
> +
> +		ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc));
> +		if (ret) {
> +			fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level),
> +				     true, false);
> +			return ret;
> +		}
> +
> +		if (wi->be)
> +			desc = be64_to_cpu((__force __be64)desc);
> +		else
> +			desc = le64_to_cpu((__force __le64)desc);
> +
> +		if (!(desc & 1) || ((desc & 3) == 1 && level == 3))
> +			goto transfault;

The check for block mapping at level 3 is replicated below, when the level of
the block is checked for correctness.

Also, would you consider rewriting the valid descriptor check to
(desc & BIT(0)), to match the block level check?

> +
> +		/* We found a leaf, handle that */
> +		if ((desc & 3) == 1 || level == 3)
> +			break;

Here we know that (desc & 1), do you think rewriting the check to !(desc &
BIT(1)), again matching the block level check, would be a good idea?
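
I.e. something like this (with the redundant level 3 check also dropped from
the first test):

	if (!(desc & BIT(0)))
		goto transfault;

	/* We found a leaf, handle that */
	if (!(desc & BIT(1)) || level == 3)
		break;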

> +
> +		if (!wi->hpd) {
> +			wr->APTable  |= FIELD_GET(S1_TABLE_AP, desc);
> +			wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);
> +			wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc);
> +		}
> +
> +		baddr = desc & GENMASK_ULL(47, wi->pgshift);
> +
> +		/* Check for out-of-range OA */
> +		if (check_output_size(baddr, wi))
> +			goto addrsz;
> +
> +		/* Prepare for next round */
> +		va_top = va_bottom - 1;
> +		level++;
> +	}
> +
> +	/* Block mapping, check the validity of the level */
> +	if (!(desc & BIT(1))) {
> +		bool valid_block = false;
> +
> +		switch (BIT(wi->pgshift)) {
> +		case SZ_4K:
> +			valid_block = level == 1 || level == 2;
> +			break;
> +		case SZ_16K:
> +		case SZ_64K:
> +			valid_block = level == 2;
> +			break;
> +		}
> +
> +		if (!valid_block)
> +			goto transfault;
> +	}

Matches AArch64.BlockDescSupported(), with the caveat that KVM doesn't yet
support 52 bit PAs.

> +
> +	if (check_output_size(desc & GENMASK(47, va_bottom), wi))
> +		goto addrsz;
> +
> +	wr->failed = false;
> +	wr->level = level;
> +	wr->desc = desc;
> +	wr->pa = desc & GENMASK(47, va_bottom);
> +	if (va_bottom > 12)
> +		wr->pa |= va & GENMASK_ULL(va_bottom - 1, 12);

I'm looking at StageOA(), and I don't think this matches what happens when the
contiguous bit is set and the contiguous OA isn't aligned as per Table D8-98.
Yes, I know, that's something super niche and unlikely to happen in practice.

Let's assume 4K pages, level = 3 (so va_bottom = 12), the first page in the
contiguous OA range is 0x23_000 (so not aligned to 64K), and the input address
that maps to the first page from the contiguous OA range is 0xf0_000 (aligned to
64K).

According to the present code:

wr->pa = 0x23_000

According to StageOA():

tsize  = 12
csize  = 4
oa     = 0x23_000 & GENMASK_ULL(47, 16) | 0xf0_000 & GENMASK_ULL(15, 0) = 0x20_000
wr->pa = oa & ~GENMASK_ULL(11, 0) = 0x20_000

If the stage 1 is correctly programmed and the OA aligned to the required
alignment, the two outputs match

On a different topic, but still regarding wr->pa. I guess the code aligns wr->pa
to 4K because that's how the OA in PAR_EL1 is reported.

Would it make more sense to have wr->pa represent the real output address, i.e.,
also contain the 12 least significant bits of the input address?  It wouldn't
change how PAR_EL1 is computed (bits 11:0 are already masked out), but it would
make wr->pa consistent with what the output address of a given input address
should be (according to StageOA()).
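
That is, something along the lines of (untested):

	wr->pa = desc & GENMASK(47, va_bottom);
	wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0);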

> +
> +	return 0;
> +
> +addrsz:
> +	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), true, false);
> +	return -EINVAL;
> +transfault:
> +	fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), true, false);
> +	return -ENOENT;
> +}
> +
>  struct mmu_config {
>  	u64	ttbr0;
>  	u64	ttbr1;
> @@ -188,6 +551,16 @@ static u8 compute_sh(u8 attr, u64 desc)
>  	return sh;
>  }
>  
> +static u8 combine_sh(u8 s1_sh, u8 s2_sh)
> +{
> +	if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH)
> +		return ATTR_OSH;
> +	if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH)
> +		return ATTR_ISH;
> +
> +	return ATTR_NSH;
> +}
> +
>  static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
>  			   struct kvm_s2_trans *tr)
>  {
> @@ -260,11 +633,181 @@ static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
>  	par  = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr);
>  	par |= tr->output & GENMASK(47, 12);
>  	par |= FIELD_PREP(SYS_PAR_EL1_SH,
> -			  compute_sh(final_attr, tr->desc));
> +			  combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par),
> +				     compute_sh(final_attr, tr->desc)));
>  
>  	return par;
>  }
>  
> +static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr,
> +			  enum trans_regime regime)
> +{
> +	u64 par;
> +
> +	if (wr->failed) {
> +		par = SYS_PAR_EL1_RES1;
> +		par |= SYS_PAR_EL1_F;
> +		par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst);
> +		par |= wr->ptw ? SYS_PAR_EL1_PTW : 0;
> +		par |= wr->s2 ? SYS_PAR_EL1_S : 0;
> +	} else if (wr->level == S1_MMU_DISABLED) {
> +		/* MMU off or HCR_EL2.DC == 1 */
> +		par = wr->pa & GENMASK_ULL(47, 12);
> +
> +		if (!(__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
> +			par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */
> +			par |= FIELD_PREP(SYS_PAR_EL1_SH, 0b10); /* OS */

s/0b10/ATTR_OSH ?

> +		} else {
> +			par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
> +					  MEMATTR(WbRaWa, WbRaWa));
> +			par |= FIELD_PREP(SYS_PAR_EL1_SH, 0b00); /* NS */

s/0b00/ATTR_NSH ?

> +		}

HCR_EL2.DC applies only to Regime_EL10, I think the bit should be ignored for
the EL2 and EL20 regimes.
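
I.e. something like (untested):

	if (regime != TR_EL10 ||
	    !(__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {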

> +	} else {
> +		u64 mair, sctlr;
> +		u8 sh;
> +
> +		mair = (regime == TR_EL10 ?
> +			vcpu_read_sys_reg(vcpu, MAIR_EL1) :
> +			vcpu_read_sys_reg(vcpu, MAIR_EL2));
> +
> +		mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
> +		mair &= 0xff;
> +
> +		sctlr = (regime == TR_EL10 ?
> +			 vcpu_read_sys_reg(vcpu, SCTLR_EL1) :
> +			 vcpu_read_sys_reg(vcpu, SCTLR_EL2));
> +
> +		/* Force NC for memory if SCTLR_ELx.C is clear */
> +		if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair))
> +			mair = MEMATTR(NC, NC);
> +
> +		par  = FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
> +		par |= wr->pa & GENMASK_ULL(47, 12);
> +
> +		sh = compute_sh(mair, wr->desc);
> +		par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
> +	}

When PAR_EL1.F = 0 and !FEAT_RME, bit 11 (NSE) is RES1, according to the
description of the register and EncodePAR().
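
Probably just a matter of something like the below (assuming there is no
existing define for the NSE bit):

	par |= BIT(11);		/* NSE, RES1 when PAR_EL1.F == 0 and !FEAT_RME */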

> +
> +	return par;
> +}
> +
> +static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
> +{
> +	bool perm_fail, ur, uw, ux, pr, pw, px;
> +	struct s1_walk_result wr = {};
> +	struct s1_walk_info wi = {};
> +	int ret, idx;
> +
> +	ret = setup_s1_walk(vcpu, op, &wi, &wr, vaddr);
> +	if (ret)
> +		goto compute_par;
> +
> +	if (wr.level == S1_MMU_DISABLED)
> +		goto compute_par;
> +
> +	idx = srcu_read_lock(&vcpu->kvm->srcu);
> +
> +	ret = walk_s1(vcpu, &wi, &wr, vaddr);
> +
> +	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> +
> +	if (ret)
> +		goto compute_par;
> +
> +	/* FIXME: revisit when adding indirect permission support */
> +	/* AArch64.S1DirectBasePermissions() */
> +	if (wi.regime != TR_EL2) {
> +		switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr.desc)) {
> +		case 0b00:
> +			pr = pw = true;
> +			ur = uw = false;
> +			break;
> +		case 0b01:
> +			pr = pw = ur = uw = true;
> +			break;
> +		case 0b10:
> +			pr = true;
> +			pw = ur = uw = false;
> +			break;
> +		case 0b11:
> +			pr = ur = true;
> +			pw = uw = false;
> +			break;
> +		}
> +
> +		switch (wr.APTable) {
> +		case 0b00:
> +			break;
> +		case 0b01:
> +			ur = uw = false;
> +			break;
> +		case 0b10:
> +			pw = uw = false;
> +			break;
> +		case 0b11:
> +			pw = ur = uw = false;
> +			break;
> +		}
> +
> +		/* We don't use px for anything yet, but hey... */
> +		px = !((wr.desc & PTE_PXN) || wr.PXNTable || pw);
> +		ux = !((wr.desc & PTE_UXN) || wr.UXNTable);
> +
> +		if (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) {
> +			bool pan;
> +
> +			pan = *vcpu_cpsr(vcpu) & PSR_PAN_BIT;
> +			pan &= ur || uw;
> +			pw &= !pan;
> +			pr &= !pan;
> +		}
> +	} else {
> +		ur = uw = ux = false;
> +
> +		if (!(wr.desc & PTE_RDONLY)) {
> +			pr = pw = true;
> +		} else {
> +			pr = true;
> +			pw = false;
> +		}
> +
> +		if (wr.APTable & BIT(1))
> +			pw = false;
> +
> +		/* XN maps to UXN */
> +		px = !((wr.desc & PTE_UXN) || wr.UXNTable);
> +	}
> +
> +	perm_fail = false;
> +
> +	switch (op) {
> +	case OP_AT_S1E1RP:
> +	case OP_AT_S1E1R:
> +	case OP_AT_S1E2R:
> +		perm_fail = !pr;
> +		break;
> +	case OP_AT_S1E1WP:
> +	case OP_AT_S1E1W:
> +	case OP_AT_S1E2W:
> +		perm_fail = !pw;
> +		break;
> +	case OP_AT_S1E0R:
> +		perm_fail = !ur;
> +		break;
> +	case OP_AT_S1E0W:
> +		perm_fail = !uw;
> +		break;
> +	default:
> +		BUG();
> +	}
> +
> +	if (perm_fail)
> +		fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false, false);
> +
> +compute_par:
> +	return compute_par_s1(vcpu, &wr, wi.regime);
> +}
> +
>  /*
>   * Return the PAR_EL1 value as the result of a valid translation.
>   *
> @@ -352,10 +895,26 @@ static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
>  	return par;
>  }
>  
> +static bool par_check_s1_perm_fault(u64 par)
> +{
> +	u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
> +
> +	return  ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM &&
> +		 !(par & SYS_PAR_EL1_S));
> +}
> +
>  void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
>  {
>  	u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);
>  
> +	/*
> +	 * If we see a permission fault at S1, then the fast path
> +	 * succeedded. By failing. Otherwise, take a walk on the wild
> +	 * side.

This is rather ambiguous. Maybe something along these lines would be more helpful:

'If PAR_EL1 encodes a permission fault, we know for sure that the hardware
translation table walker was able to walk the stage 1 tables and there's nothing
else that KVM needs to do. On the other hand, if the AT instruction failed for
any other reason, then KVM must walk the guest stage 1 tables (and possibly the
virtual stage 2 tables) to emulate the instruction.'

Thanks,
Alex

> +	 */
> +	if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
> +		par = handle_at_slow(vcpu, op, vaddr);
> +
>  	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
>  }
>  
> @@ -407,6 +966,10 @@ void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
>  		isb();
>  	}
>  
> +	/* We failed the translation, let's replay it in slow motion */
> +	if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
> +		par = handle_at_slow(vcpu, op, vaddr);
> +
>  	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
>  }
>  
> @@ -463,7 +1026,7 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
>  	/* Check the access permission */
>  	if (!out.esr &&
>  	    ((!write && !out.readable) || (write && !out.writable)))
> -		out.esr = ESR_ELx_FSC_PERM | (out.level & 0x3);
> +		out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3);
>  
>  	par = compute_par_s12(vcpu, par, &out);
>  	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
> -- 
> 2.39.2
>
Marc Zyngier Aug. 10, 2024, 10:16 a.m. UTC | #2
Hi Alex,

On Fri, 09 Aug 2024 13:43:33 +0100,
Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> 
> Hi Marc,
> 
> Finally managed to go through the patch. Bunch of nitpicks from me (can be
> safely ignored), and some corner cases where KVM deviates from the spec.

Thanks for taking the time to go through this mess.

[...]

> > +		break;
> > +	default:
> > +		return (vcpu_el2_e2h_is_set(vcpu) &&
> > +			vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10;
> 
> This also looks correct to me. Following the pseudocode was not trivial, so I'm
> leaving this here in case I made a mistake somewhere.
> 
> For the S1E0* variants: AArch64.AT(el_in=EL0) => TranslationRegime(el=EL0) =>
> ELIsInHost(el=EL0); ELIsInHost(el=EL0) is true if {E2H,TGE} = {1,1}, and in this
> case TranslationRegime(el=EL0) returns Regime_EL20, otherwise Regime_EL10.
> 
> For the S1E1* variants: AArch64.AT(el_in=EL1), where:
> 
> - if ELIsInHost(el=EL0) is true, then 'el' is changed to EL2, where
>   ELIsInHost(el=EL0) is true if {E2H,TGE} = {1,1}. In this case,
>   TranslationRegime(el=EL2) will return Regime_EL20.
> 
> - if ELIsInHost(el=EL0) is false, then 'el' remains EL1, and
>   TranslationRegime(el=EL1) returns Regime_EL10.

Yup. Somehow, the C version is far easier to understand!

> 
> > +	}
> > +}
> > +
> > +static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
> > +			 struct s1_walk_result *wr, u64 va)
> > +{
> > +	u64 sctlr, tcr, tg, ps, ia_bits, ttbr;
> > +	unsigned int stride, x;
> > +	bool va55, tbi, lva, as_el0;
> > +
> > +	wi->regime = compute_translation_regime(vcpu, op);
> > +	as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);
> > +
> > +	va55 = va & BIT(55);
> > +
> > +	if (wi->regime == TR_EL2 && va55)
> > +		goto addrsz;
> > +
> > +	wi->s2 = wi->regime == TR_EL10 && (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_VM);
> 
> According to AArch64.NSS2TTWParams(), stage 2 is enabled if HCR_EL2.VM or
> HCR_EL2.DC.
> 
> From the description of HCR_EL2.DC, when the bit is set:
> 
> 'The PE behaves as if the value of the HCR_EL2.VM field is 1 for all purposes
> other than returning the value of a direct read of HCR_EL2.'

Yup, good point. And as you noticed further down, the HCR_EL2.DC
handling is currently a mess.

[...]

> > +	/* Let's put the MMU disabled case aside immediately */
> > +	if (!(sctlr & SCTLR_ELx_M) ||
> > +	    (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
> 
> I think the condition doesn't match AArch64.S1Enabled():
> 
> - if regime is Regime_EL20 or Regime_EL2, then S1 is disabled if and only if
>   SCTLR_EL2.M is not set; it doesn't matter if HCR_EL2.DC is set, because "[..]
>   this field has no effect on the EL2, EL2&0, and EL3 translation regimes."
>   (HCR_EL2.DC bit field description).
> 
> - if regime is Regime_EL10, then S1 is disabled if SCTLR_EL1.M == 0 or
>   HCR_EL2.TGE = 1 or the effective value of HCR_EL2.DC* is 1.
> 
> From the description of HCR_EL2.TGE, when the bit is set:
> 
> 'If EL1 is using AArch64, the SCTLR_EL1.M field is treated as being 0 for all
> purposes other than returning the result of a direct read of SCTLR_EL1.'
> 
> From the description of HCR_EL2.DC, when the bit is set:
> 
> 'When EL1 is using AArch64, the PE behaves as if the value of the SCTLR_EL1.M
> field is 0 for all purposes other than returning the value of a direct read of
> SCTLR_EL1.'
> 
> *The description of HCR_EL2.DC states:
> 
> 'When the Effective value of HCR_EL2.{E2H,TGE} is {1,1}, this field behaves as
> 0 for all purposes other than a direct read of the value of this field.'
> 
> But if {E2H,TGE} = {1,1} then the regime is Regime_EL20, which ignores the DC
> bit.  If we're looking at the DC bit, then that means that the regime is EL10,
> ({E2H,TGE} != {1,1} in compute_translation_regime()) and the effective value of
> HCR_EL2.DC is the same as the DC bit.

Yup. That's what I've stashed on top:

@@ -136,12 +137,22 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
 	va = (u64)sign_extend64(va, 55);
 
 	/* Let's put the MMU disabled case aside immediately */
-	if (!(sctlr & SCTLR_ELx_M) ||
-	    (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
+	switch (wi->regime) {
+	case TR_EL10:
+		if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)
+			wr->level = S1_MMU_DISABLED;
+		fallthrough;
+	case TR_EL2:
+	case TR_EL20:
+		if (!(sctlr & SCTLR_ELx_M))
+			wr->level = S1_MMU_DISABLED;
+		break;
+	}
+
+	if (wr->level == S1_MMU_DISABLED) {
 		if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
 			goto addrsz;
 
-		wr->level = S1_MMU_DISABLED;
 		wr->pa = va;
 		return 0;
 	}

[...]

> > +	/* R_PLCGL, R_YXNYW */
> > +	if ((!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47) &&
> > +	     wi->txsz > 39) ||
> > +	    (wi->pgshift == 16 && wi->txsz > 47) || wi->txsz > 48)
> > +		goto transfault_l0;
> 
> It's probably just me, but I find this hard to parse. There are three cases when
> the condition (!FEAT_TTST && TxSZ > 39) evaluates to false. But the other two
> conditions make sense to check only if !FEAT_TTST is false and wi->txsz > 39 is
> true.
> 
> I find this easier to read:
> 
> 	if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) {
> 		if (wi->txsz > 39)
> 			goto transfault_l0;
> 	} else {
> 		if (wi->txsz > 48 || (wi->pgshift == 16 && wi->txsz > 47))
> 			goto transfault_l0;
> 	}
> 
> What do you think?

Sure, happy to rewrite it like this if it helps.

[...]

> > +	/* R_YYVYV, I_THCZK */
> > +	if ((!va55 && va > GENMASK(ia_bits - 1, 0)) ||
> > +	    (va55 && va < GENMASK(63, ia_bits)))
> > +		goto transfault_l0;
> > +
> > +	/* R_ZFSYQ */
> 
> This is rather pedantic, but I think that should be I_ZFSYQ.

Well, these references are supposed to help. If they are incorrect,
they serve no purpose. Thanks for spotting this.

[...]

> > +		if (!(desc & 1) || ((desc & 3) == 1 && level == 3))
> > +			goto transfault;
> 
> The check for block mapping at level 3 is replicated below, when the level of
> the block is checked for correctness.
> 
> Also, would you consider rewriting the valid descriptor check to
> (desc & BIT(0)), to match the block level check?
> 
> > +
> > +		/* We found a leaf, handle that */
> > +		if ((desc & 3) == 1 || level == 3)
> > +			break;
> 
> Here we know that (desc & 1), do you think rewriting the check to !(desc &
> BIT(1)), again matching the block level check, would be a good idea?

Other than the BIT() stuff, I'm not completely clear what you are
asking for. Are you objecting to the fact that the code is slightly
redundant? If so, I may be able to clean things up for you:

@@ -309,13 +323,19 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
 		else
 			desc = le64_to_cpu((__force __le64)desc);
 
-		if (!(desc & 1) || ((desc & 3) == 1 && level == 3))
+		/* Invalid descriptor */
+		if (!(desc & BIT(0)))
 			goto transfault;
 
-		/* We found a leaf, handle that */
-		if ((desc & 3) == 1 || level == 3)
+		/* Block mapping, check validity down the line */
+		if (!(desc & BIT(1)))
+			break;
+
+		/* Page mapping */
+		if (level == 3)
 			break;
 
+		/* Table handling */
 		if (!wi->hpd) {
 			wr->APTable  |= FIELD_GET(S1_TABLE_AP, desc);
 			wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);

Do you like this one better? Each bit only gets tested once.

[...]

> > +
> > +	if (check_output_size(desc & GENMASK(47, va_bottom), wi))
> > +		goto addrsz;
> > +
> > +	wr->failed = false;
> > +	wr->level = level;
> > +	wr->desc = desc;
> > +	wr->pa = desc & GENMASK(47, va_bottom);
> > +	if (va_bottom > 12)
> > +		wr->pa |= va & GENMASK_ULL(va_bottom - 1, 12);
> 
> I'm looking at StageOA(), and I don't think this matches what happens when the
> contiguous bit is set and the contiguous OA isn't aligned as per Table D8-98.
> Yes, I know, that's something super niche and unlikely to happen in practice.
> 
> Let's assume 4K pages, level = 3 (so va_bottom = 12), the first page in the
> contiguous OA range is 0x23_000 (so not aligned to 64K), and the input address
> that maps to the first page from the contiguous OA range is 0xf0_000 (aligned to
> 64K).
> 
> According to the present code:
> 
> wr->pa = 0x23_000
> 
> According to StageOA():
> 
> tsize  = 12
> csize  = 4
> oa     = 0x23_000 & GENMASK_ULL(47, 16) | 0xf0_000 & GENMASK_ULL(15, 0) = 0x20_000
> wr->pa = oa & ~GENMASK_ULL(11, 0) = 0x20_000
> 
> If the stage 1 is correctly programmed and the OA aligned to the required
> alignment, the two outputs match

Huh, that's another nice catch. I had the (dumb) idea that if we
didn't use the Contiguous bit as a TLB hint, we didn't need to do
anything special when it came to the alignment of the OA.

But clearly, the spec says that this alignment must be honoured, and
there is no way out of it. Which means that the S2 walker also has a
bit of a problem on that front.

> On a different topic, but still regarding wr->pa. I guess the code aligns wr->pa
> to 4K because that's how the OA in PAR_EL1 is reported.
> 
> Would it make more sense to have wr->pa represent the real output address, i.e,
> also contain the 12 least significant bits of the input address?  It wouldn't
> change how PAR_EL1 is computed (bits 11:0 are already masked out), but it would
> make wr->pa consistent with what the output address of a given input address
> should be (according to StageOA()).

Yup. That'd be consistent with the way wr->pa is reported when S1 is disabled.

I ended up with this (untested):

@@ -354,12 +374,24 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
 	if (check_output_size(desc & GENMASK(47, va_bottom), wi))
 		goto addrsz;
 
+	if (desc & PTE_CONT) {
+		switch (BIT(wi->pgshift)) {
+		case SZ_4K:
+			va_bottom += 4;
+			break;
+		case SZ_16K:
+			va_bottom += level == 2 ? 5 : 7;
+			break;
+		case SZ_64K:
+			va_bottom += 5;
+		}
+	}
+
 	wr->failed = false;
 	wr->level = level;
 	wr->desc = desc;
 	wr->pa = desc & GENMASK(47, va_bottom);
-	if (va_bottom > 12)
-		wr->pa |= va & GENMASK_ULL(va_bottom - 1, 12);
+	wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0);
 
 	return 0;
 

Note that I'm still checking for the address size before the
contiguous bit alignment, as per R_JHQPP.

[...]

> > +		if (!(__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
> > +			par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */
> > +			par |= FIELD_PREP(SYS_PAR_EL1_SH, 0b10); /* OS */
> 
> s/0b10/ATTR_OSH ?
> 
> > +		} else {
> > +			par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
> > +					  MEMATTR(WbRaWa, WbRaWa));
> > +			par |= FIELD_PREP(SYS_PAR_EL1_SH, 0b00); /* NS */
> 
> s/0b00/ATTR_NSH ?
> 
> > +		}
> 
> HCR_EL2.DC applies only to Regime_EL10, I think the bit should be ignored for
> the EL2 and EL20 regimes.

Yup, now fixed.

> 
> > +	} else {
> > +		u64 mair, sctlr;
> > +		u8 sh;
> > +
> > +		mair = (regime == TR_EL10 ?
> > +			vcpu_read_sys_reg(vcpu, MAIR_EL1) :
> > +			vcpu_read_sys_reg(vcpu, MAIR_EL2));
> > +
> > +		mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
> > +		mair &= 0xff;
> > +
> > +		sctlr = (regime == TR_EL10 ?
> > +			 vcpu_read_sys_reg(vcpu, SCTLR_EL1) :
> > +			 vcpu_read_sys_reg(vcpu, SCTLR_EL2));
> > +
> > +		/* Force NC for memory if SCTLR_ELx.C is clear */
> > +		if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair))
> > +			mair = MEMATTR(NC, NC);
> > +
> > +		par  = FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
> > +		par |= wr->pa & GENMASK_ULL(47, 12);
> > +
> > +		sh = compute_sh(mair, wr->desc);
> > +		par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
> > +	}
> 
> When PAR_EL1.F = 0 and !FEAT_RME, bit 11 (NSE) is RES1, according to the
> description of the register and EncodePAR().

Fixed.

[...]

> >  void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
> >  {
> >  	u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);
> >  
> > +	/*
> > +	 * If we see a permission fault at S1, then the fast path
> > +	 * succeedded. By failing. Otherwise, take a walk on the wild
> > +	 * side.
> 
> This is rather ambiguous. Maybe something along the lines would be more helpful:
> 
> 'If PAR_EL1 encodes a permission fault, we know for sure that the hardware
> translation table walker was able to walk the stage 1 tables and there's nothing
> else that KVM needs to do. On the other hand, if the AT instruction failed for
> any other reason, then KVM must walk the guest stage 1 tables (and possibly the
> virtual stage 2 tables) to emulate the instruction.'

Sure. I've adopted a slightly less verbose version of this:

@@ -930,9 +966,12 @@ void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
 	u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);
 
 	/*
-	 * If we see a permission fault at S1, then the fast path
-	 * succeedded. By failing. Otherwise, take a walk on the wild
-	 * side.
+	 * If PAR_EL1 reports that AT failed on a S1 permission fault, we
+	 * know for sure that the PTW was able to walk the S1 tables and
+	 * there's nothing else to do.
+	 *
+	 * If AT failed for any other reason, then we must walk the guest S1
+	 * to emulate the instruction.
 	 */
 	if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
 		par = handle_at_slow(vcpu, op, vaddr);


I'll retest things over the weekend and post a new version early next
week.

Thanks again for your review, this is awesome work!

	M.
Alexandru Elisei Aug. 12, 2024, 3:11 p.m. UTC | #3
Hi Marc,

On Sat, Aug 10, 2024 at 11:16:15AM +0100, Marc Zyngier wrote:
> Hi Alex,
> 
> On Fri, 09 Aug 2024 13:43:33 +0100,
> Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> > 
> > Hi Marc,
> > 
> > Finally managed to go through the patch. Bunch of nitpicks from me (can be
> > safely ignored), and some corner cases where KVM deviates from the spec.
> 
> Thanks for taking the time to go through this mess.
> 
> [...]
> 
> > > +		break;
> > > +	default:
> > > +		return (vcpu_el2_e2h_is_set(vcpu) &&
> > > +			vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10;
> > 
> > This also looks correct to me. Following the pseudocode was not trivial, so I'm
> > leaving this here in case I made a mistake somewhere.
> > 
> > For the S1E0* variants: AArch64.AT(el_in=EL0) => TranslationRegime(el=EL0) =>
> > ELIsInHost(el=EL0); ELIsInHost(el=EL0) is true if {E2H,TGE} = {1,1}, and in this
> > case TranslationRegime(el=EL0) returns Regime_EL20, otherwise Regime_EL10.
> > 
> > For the S1E1* variants: AArch64.AT(el_in=EL1), where:
> > 
> > - if ELIsInHost(el=EL0) is true, then 'el' is changed to EL2, where
> >   ELIsInHost(el=EL0) is true if {E2H,TGE} = {1,1}. In this case,
> >   TranslationRegime(el=EL2) will return Regime_EL20.
> > 
> > - if ELIsInHost(el=EL0) is false, then 'el' remains EL1, and
> >   TranslationRegime(el=EL1) returns Regime_EL10.
> 
> Yup. Somehow, the C version is far easier to understand!
> 
> > 
> > > +	}
> > > +}
> > > +
> > > +static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
> > > +			 struct s1_walk_result *wr, u64 va)
> > > +{
> > > +	u64 sctlr, tcr, tg, ps, ia_bits, ttbr;
> > > +	unsigned int stride, x;
> > > +	bool va55, tbi, lva, as_el0;
> > > +
> > > +	wi->regime = compute_translation_regime(vcpu, op);
> > > +	as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);
> > > +
> > > +	va55 = va & BIT(55);
> > > +
> > > +	if (wi->regime == TR_EL2 && va55)
> > > +		goto addrsz;
> > > +
> > > +	wi->s2 = wi->regime == TR_EL10 && (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_VM);
> > 
> > According to AArch64.NSS2TTWParams(), stage 2 is enabled if HCR_EL2.VM or
> > HCR_EL2.DC.
> > 
> > From the description of HCR_EL2.DC, when the bit is set:
> > 
> > 'The PE behaves as if the value of the HCR_EL2.VM field is 1 for all purposes
> > other than returning the value of a direct read of HCR_EL2.'
> 
> Yup, good point. And as you noticed further down, the HCR_EL2.DC
> handling is currently a mess.
> 
> [...]
> 
> > > +	/* Let's put the MMU disabled case aside immediately */
> > > +	if (!(sctlr & SCTLR_ELx_M) ||
> > > +	    (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
> > 
> > I think the condition doesn't match AArch64.S1Enabled():
> > 
> > - if regime is Regime_EL20 or Regime_EL2, then S1 is disabled if and only if
> >   SCTLR_EL2.M is not set; it doesn't matter if HCR_EL2.DC is set, because "[..]
> >   this field has no effect on the EL2, EL2&0, and EL3 translation regimes."
> >   (HCR_EL2.DC bit field description).
> > 
> > - if regime is Regime_EL10, then S1 is disabled if SCTLR_EL1.M == 0 or
> >   HCR_EL2.TGE = 1 or the effective value of HCR_EL2.DC* is 1.
> > 
> > From the description of HCR_EL2.TGE, when the bit is set:
> > 
> > 'If EL1 is using AArch64, the SCTLR_EL1.M field is treated as being 0 for all
> > purposes other than returning the result of a direct read of SCTLR_EL1.'
> > 
> > From the description of HCR_EL2.DC, when the bit is set:
> > 
> > 'When EL1 is using AArch64, the PE behaves as if the value of the SCTLR_EL1.M
> > field is 0 for all purposes other than returning the value of a direct read of
> > SCTLR_EL1.'
> > 
> > *The description of HCR_EL2.DC states:
> > 
> > 'When the Effective value of HCR_EL2.{E2H,TGE} is {1,1}, this field behaves as
> > 0 for all purposes other than a direct read of the value of this field.'
> > 
> > But if {E2H,TGE} = {1,1} then the regime is Regime_EL20, which ignores the DC
> > bit.  If we're looking at the DC bit, then that means that the regime is EL10,
> > ({E2H,TGE} != {1,1} in compute_translation_regime()) and the effective value of
> > HCR_EL2.DC is the same as the DC bit.
> 
> Yup. That's what I've stashed on top:
> 
> @@ -136,12 +137,22 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
>  	va = (u64)sign_extend64(va, 55);
>  
>  	/* Let's put the MMU disabled case aside immediately */
> -	if (!(sctlr & SCTLR_ELx_M) ||
> -	    (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
> +	switch (wi->regime) {
> +	case TR_EL10:
> +		if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)
> +			wr->level = S1_MMU_DISABLED;

In compute_translation_regime(), for AT instructions other than AT S1E2*, when
{E2H,TGE} = {0,1}, regime is Regime_EL10. As far as I can tell, when regime is
Regime_EL10 and TGE is set, stage 1 is disabled, according to
AArch64.S1Enabled() and the description of the TGE bit.

> +		fallthrough;
> +	case TR_EL2:
> +	case TR_EL20:
> +		if (!(sctlr & SCTLR_ELx_M))
> +			wr->level = S1_MMU_DISABLED;
> +		break;
> +	}
> +
> +	if (wr->level == S1_MMU_DISABLED) {
>  		if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
>  			goto addrsz;
>  
> -		wr->level = S1_MMU_DISABLED;
>  		wr->pa = va;
>  		return 0;
>  	}
> 
> [...]
> 
> > > +	/* R_PLCGL, R_YXNYW */
> > > +	if ((!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47) &&
> > > +	     wi->txsz > 39) ||
> > > +	    (wi->pgshift == 16 && wi->txsz > 47) || wi->txsz > 48)
> > > +		goto transfault_l0;
> > 
> > It's probably just me, but I find this hard to parse. There are three cases when
> > the condition (!FEAT_TTST && TxSZ > 39) evaluates to false. But the other two
> > conditions make sense to check only if !FEAT_TTST is false and wi->txsz > 39 is
> > true.
> > 
> > I find this easier to read:
> > 
> > 	if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) {
> > 		if (wi->txsz > 39)
> > 			goto transfault_l0;
> > 	} else {
> > 		if (wi->txsz > 48 || (wi->pgshift == 16 && wi->txsz > 47))
> > 			goto transfault_l0;
> > 	}
> > 
> > What do you think?
> 
> Sure, happy to rewrite it like this if it helps.

Cool, thanks.

> 
> [...]
> 
> > > +	/* R_YYVYV, I_THCZK */
> > > +	if ((!va55 && va > GENMASK(ia_bits - 1, 0)) ||
> > > +	    (va55 && va < GENMASK(63, ia_bits)))
> > > +		goto transfault_l0;
> > > +
> > > +	/* R_ZFSYQ */
> > 
> > This is rather pedantic, but I think that should be I_ZFSYQ.
> 
> Well, these references are supposed to help. If they are incorrect,
> they serve no purpose. Thanks for spotting this.
> 
> [...]
> 
> > > +		if (!(desc & 1) || ((desc & 3) == 1 && level == 3))
> > > +			goto transfault;
> > 
> > The check for block mapping at level 3 is replicated below, when the level of
> > the block is checked for correctness.
> > 
> > Also, would you consider rewriting the valid descriptor check to
> > (desc & BIT(0)), to match the block level check?
> > 
> > > +
> > > +		/* We found a leaf, handle that */
> > > +		if ((desc & 3) == 1 || level == 3)
> > > +			break;
> > 
> > Here we know that (desc & 1), do you think rewriting the check to !(desc &
> > BIT(1)), again matching the block level check, would be a good idea?
> 
> Other than the BIT() stuff, I'm not completely clear what you are
> asking for. Are you objecting to the fact that the code is slightly
> redundant? If so, I may be able to clean things up for you:

Yes, I was referring to the fact that the code is slightly redundant.

> 
> @@ -309,13 +323,19 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
>  		else
>  			desc = le64_to_cpu((__force __le64)desc);
>  
> -		if (!(desc & 1) || ((desc & 3) == 1 && level == 3))
> +		/* Invalid descriptor */
> +		if (!(desc & BIT(0)))
>  			goto transfault;
>  
> -		/* We found a leaf, handle that */
> -		if ((desc & 3) == 1 || level == 3)
> +		/* Block mapping, check validity down the line */
> +		if (!(desc & BIT(1)))
> +			break;
> +
> +		/* Page mapping */
> +		if (level == 3)
>  			break;
>  
> +		/* Table handling */
>  		if (!wi->hpd) {
>  			wr->APTable  |= FIELD_GET(S1_TABLE_AP, desc);
>  			wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);
> 
> Do you like this one better? Each bit only gets tested once.

	switch (desc & GENMASK_ULL(1, 0)) {
	case 0b00:
	case 0b10:
		goto transfault;
	case 0b01:
		/* Block mapping */
		break;
	default:
		if (level == 3)
			break;
	}

Is this better? Perhaps slightly easier to match against the descriptor layouts,
but I'm not sure it's an improvement over your suggestion. Up to you, no point
in bikeshedding over it.

> 
> [...]
> 
> > > +
> > > +	if (check_output_size(desc & GENMASK(47, va_bottom), wi))
> > > +		goto addrsz;
> > > +
> > > +	wr->failed = false;
> > > +	wr->level = level;
> > > +	wr->desc = desc;
> > > +	wr->pa = desc & GENMASK(47, va_bottom);
> > > +	if (va_bottom > 12)
> > > +		wr->pa |= va & GENMASK_ULL(va_bottom - 1, 12);
> > 
> > I'm looking at StageOA(), and I don't think this matches what happens when the
> > contiguous bit is set and the contiguous OA isn't aligned as per Table D8-98.
> > Yes, I know, that's something super niche and unlikely to happen in practice.
> > 
> > Let's assume 4K pages, level = 3 (so va_bottom = 12), the first page in the
> > contiguous OA range is 0x23_000 (so not aligned to 64K), and the input address
> > that maps to the first page from the contiguous OA range is 0xf0_000 (aligned to
> > 64K).
> > 
> > According to the present code:
> > 
> > wr->pa = 0x23_000
> > 
> > According to StageOA():
> > 
> > tsize  = 12
> > csize  = 4
> > oa     = 0x23_000 & GENMASK_ULL(47, 16) | 0xf0_000 & GENMASK_ULL(15, 0) = 0x20_000
> > wr->pa = oa & ~GENMASK_ULL(11, 0) = 0x20_000
> > 
> > If the stage 1 is correctly programmed and the OA aligned to the required
> > alignment, the two outputs match
> 
> Huh, that's another nice catch. I had the (dumb) idea that if we
> didn't use the Contiguous bit as a TLB hint, we didn't need to do
> anything special when it came to the alignment of the OA.
> 
> But clearly, the spec says that this alignment must be honoured, and
> there is no way out of it. Which means that the S2 walker also has a
> bit of a problem on that front.
> 
> > On a different topic, but still regarding wr->pa. I guess the code aligns wr->pa
> > to 4K because that's how the OA in PAR_EL1 is reported.
> > 
> > Would it make more sense to have wr->pa represent the real output address, i.e,
> > also contain the 12 least significant bits of the input address?  It wouldn't
> > change how PAR_EL1 is computed (bits 11:0 are already masked out), but it would
> > make wr->pa consistent with what the output address of a given input address
> > should be (according to StageOA()).
> 
> Yup. That'd be consistent with the way wr->pa is reported when S1 is disabled.
> 
> I ended-up with this (untested):
> 
> @@ -354,12 +374,24 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
>  	if (check_output_size(desc & GENMASK(47, va_bottom), wi))
>  		goto addrsz;
>  
> +	if (desc & PTE_CONT) {
> +		switch (BIT(wi->pgshift)) {
> +		case SZ_4K:
> +			va_bottom += 4;
> +			break;
> +		case SZ_16K:
> +			va_bottom += level == 2 ? 5 : 7;
> +			break;
> +		case SZ_64K:
> +			va_bottom += 5;
> +		}
> +	}
> +

Matches ContiguousSize().

>  	wr->failed = false;
>  	wr->level = level;
>  	wr->desc = desc;
>  	wr->pa = desc & GENMASK(47, va_bottom);
> -	if (va_bottom > 12)
> -		wr->pa |= va & GENMASK_ULL(va_bottom - 1, 12);
> +	wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0);

Matches StageOA().

>  
>  	return 0;
>  
> 
> Note that I'm still checking for the address size before the
> contiguous bit alignment, as per R_JHQPP.

Nicely spotted.

> 
> [...]
> 
> > > +		if (!(__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
> > > +			par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */
> > > +			par |= FIELD_PREP(SYS_PAR_EL1_SH, 0b10); /* OS */
> > 
> > s/0b10/ATTR_OSH ?
> > 
> > > +		} else {
> > > +			par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
> > > +					  MEMATTR(WbRaWa, WbRaWa));
> > > +			par |= FIELD_PREP(SYS_PAR_EL1_SH, 0b00); /* NS */
> > 
> > s/0b00/ATTR_NSH ?
> > 
> > > +		}
> > 
> > HCR_EL2.DC applies only to Regime_EL10, I think the bit should be ignored for
> > the EL2 and EL20 regimes.
> 
> Yup, now fixed.
> 
> > 
> > > +	} else {
> > > +		u64 mair, sctlr;
> > > +		u8 sh;
> > > +
> > > +		mair = (regime == TR_EL10 ?
> > > +			vcpu_read_sys_reg(vcpu, MAIR_EL1) :
> > > +			vcpu_read_sys_reg(vcpu, MAIR_EL2));
> > > +
> > > +		mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
> > > +		mair &= 0xff;
> > > +
> > > +		sctlr = (regime == TR_EL10 ?
> > > +			 vcpu_read_sys_reg(vcpu, SCTLR_EL1) :
> > > +			 vcpu_read_sys_reg(vcpu, SCTLR_EL2));
> > > +
> > > +		/* Force NC for memory if SCTLR_ELx.C is clear */
> > > +		if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair))
> > > +			mair = MEMATTR(NC, NC);
> > > +
> > > +		par  = FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
> > > +		par |= wr->pa & GENMASK_ULL(47, 12);
> > > +
> > > +		sh = compute_sh(mair, wr->desc);
> > > +		par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
> > > +	}
> > 
> > When PAR_EL1.F = 0 and !FEAT_RME, bit 11 (NSE) is RES1, according to the
> > description of the register and EncodePAR().
> 
> Fixed.
> 
> [...]
> 
> > >  void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
> > >  {
> > >  	u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);
> > >  
> > > +	/*
> > > +	 * If we see a permission fault at S1, then the fast path
> > > +	 * succeedded. By failing. Otherwise, take a walk on the wild
> > > +	 * side.
> > 
> > This is rather ambiguous. Maybe something along the lines would be more helpful:
> > 
> > 'If PAR_EL1 encodes a permission fault, we know for sure that the hardware
> > translation table walker was able to walk the stage 1 tables and there's nothing
> > else that KVM needs to do. On the other hand, if the AT instruction failed for
> > any other reason, then KVM must walk the guest stage 1 tables (and possibly the
> > virtual stage 2 tables) to emulate the instruction.'
> 
> Sure. I've adopted a slightly less verbose version of this:
> 
> @@ -930,9 +966,12 @@ void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
>  	u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);
>  
>  	/*
> -	 * If we see a permission fault at S1, then the fast path
> -	 * succeedded. By failing. Otherwise, take a walk on the wild
> -	 * side.
> +	 * If PAR_EL1 reports that AT failed on a S1 permission fault, we
> +	 * know for sure that the PTW was able to walk the S1 tables and
> +	 * there's nothing else to do.
> +	 *
> +	 * If AT failed for any other reason, then we must walk the guest S1
> +	 * to emulate the instruction.

Looks good.

Thanks,
Alex

>  	 */
>  	if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
>  		par = handle_at_slow(vcpu, op, vaddr);
> 
> 
> I'll retest things over the weekend and post a new version early next
> week.
> 
> Thanks again for your review, this is awesome work!
> 
> 	M.
> 
> -- 
> Without deviation from the norm, progress is not possible.
Marc Zyngier Aug. 12, 2024, 5:58 p.m. UTC | #4
Hi Alex,

On Mon, 12 Aug 2024 16:11:02 +0100,
Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> 
> Hi Marc,
> 
> On Sat, Aug 10, 2024 at 11:16:15AM +0100, Marc Zyngier wrote:
> > Hi Alex,
> > 
> > @@ -136,12 +137,22 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
> >  	va = (u64)sign_extend64(va, 55);
> >  
> >  	/* Let's put the MMU disabled case aside immediately */
> > -	if (!(sctlr & SCTLR_ELx_M) ||
> > -	    (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
> > +	switch (wi->regime) {
> > +	case TR_EL10:
> > +		if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)
> > +			wr->level = S1_MMU_DISABLED;
> 
> In compute_translation_regime(), for AT instructions other than AT S1E2*, when
> {E2H,TGE} = {0,1}, regime is Regime_EL10. As far as I can tell, when regime is
> Regime_EL10 and TGE is set, stage 1 is disabled, according to
> AArch64.S1Enabled() and the description of the TGE bit.

Grmbl... I really dislike E2H=0. May it die a painful death. How about
this on top?

diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index 10017d990bc3..870e77266f80 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -139,7 +139,19 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
 	/* Let's put the MMU disabled case aside immediately */
 	switch (wi->regime) {
 	case TR_EL10:
-		if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)
+		/*
+		 * If dealing with the EL1&0 translation regime, 3 things
+		 * can disable the S1 translation:
+		 *
+		 * - HCR_EL2.DC = 0
+		 * - HCR_EL2.{E2H,TGE} = {0,1}
+		 * - SCTLR_EL1.M = 0
+		 *
+		 * The TGE part is interesting. If we have decided that this
+		 * is EL1&0, then it means that either {E2H,TGE} == {1,0} or
+		 * {0,x}, and we only need to test for TGE == 1.
+		 */
+		if (__vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_DC | HCR_TGE))
 			wr->level = S1_MMU_DISABLED;
 		fallthrough;
 	case TR_EL2:

[...]

>
> 	switch (desc & GENMASK_ULL(1, 0)) {
> 	case 0b00:
> 	case 0b10:
> 		goto transfault;
> 	case 0b01:
> 		/* Block mapping */
> 		break;
> 	default:
> 		if (level == 3)
> 			break;
> 	}
> 
> Is this better? Perhaps slightly easier to match against the descriptor layouts,
> but I'm not sure it's an improvement over your suggestion. Up to you, no point
> in bikeshedding over it.

I think I'll leave it as is for now. I'm getting sick of this code...

Thanks,

	M.
Marc Zyngier Aug. 12, 2024, 6:04 p.m. UTC | #5
On Mon, 12 Aug 2024 18:58:24 +0100,
Marc Zyngier <maz@kernel.org> wrote:
> 
> Hi Alex,
> 
> On Mon, 12 Aug 2024 16:11:02 +0100,
> Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> > 
> > Hi Marc,
> > 
> > On Sat, Aug 10, 2024 at 11:16:15AM +0100, Marc Zyngier wrote:
> > > Hi Alex,
> > > 
> > > @@ -136,12 +137,22 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
> > >  	va = (u64)sign_extend64(va, 55);
> > >  
> > >  	/* Let's put the MMU disabled case aside immediately */
> > > -	if (!(sctlr & SCTLR_ELx_M) ||
> > > -	    (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
> > > +	switch (wi->regime) {
> > > +	case TR_EL10:
> > > +		if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)
> > > +			wr->level = S1_MMU_DISABLED;
> > 
> > In compute_translation_regime(), for AT instructions other than AT S1E2*, when
> > {E2H,TGE} = {0,1}, regime is Regime_EL10. As far as I can tell, when regime is
> > Regime_EL10 and TGE is set, stage 1 is disabled, according to
> > AArch64.S1Enabled() and the description of the TGE bit.
> 
> Grmbl... I really dislike E2H=0. May it die a painful death. How about
> this on top?
> 
> diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> index 10017d990bc3..870e77266f80 100644
> --- a/arch/arm64/kvm/at.c
> +++ b/arch/arm64/kvm/at.c
> @@ -139,7 +139,19 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
>  	/* Let's put the MMU disabled case aside immediately */
>  	switch (wi->regime) {
>  	case TR_EL10:
> -		if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)
> +		/*
> +		 * If dealing with the EL1&0 translation regime, 3 things
> +		 * can disable the S1 translation:
> +		 *
> +		 * - HCR_EL2.DC = 0

s/0/1/

	M.
Alexandru Elisei Aug. 13, 2024, 9:17 a.m. UTC | #6
Hi Marc,

On Mon, Aug 12, 2024 at 06:58:24PM +0100, Marc Zyngier wrote:
> Hi Alex,
> 
> On Mon, 12 Aug 2024 16:11:02 +0100,
> Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> > 
> > Hi Marc,
> > 
> > On Sat, Aug 10, 2024 at 11:16:15AM +0100, Marc Zyngier wrote:
> > > Hi Alex,
> > > 
> > > @@ -136,12 +137,22 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
> > >  	va = (u64)sign_extend64(va, 55);
> > >  
> > >  	/* Let's put the MMU disabled case aside immediately */
> > > -	if (!(sctlr & SCTLR_ELx_M) ||
> > > -	    (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
> > > +	switch (wi->regime) {
> > > +	case TR_EL10:
> > > +		if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)
> > > +			wr->level = S1_MMU_DISABLED;
> > 
> > In compute_translation_regime(), for AT instructions other than AT S1E2*, when
> > {E2H,TGE} = {0,1}, regime is Regime_EL10. As far as I can tell, when regime is
> > Regime_EL10 and TGE is set, stage 1 is disabled, according to
> > AArch64.S1Enabled() and the description of the TGE bit.
> 
> Grmbl... I really dislike E2H=0. May it die a painful death. How about
> this on top?
> 
> diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> index 10017d990bc3..870e77266f80 100644
> --- a/arch/arm64/kvm/at.c
> +++ b/arch/arm64/kvm/at.c
> @@ -139,7 +139,19 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
>  	/* Let's put the MMU disabled case aside immediately */
>  	switch (wi->regime) {
>  	case TR_EL10:
> -		if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)
> +		/*
> +		 * If dealing with the EL1&0 translation regime, 3 things
> +		 * can disable the S1 translation:
> +		 *
> +		 * - HCR_EL2.DC = 0
> +		 * - HCR_EL2.{E2H,TGE} = {0,1}
> +		 * - SCTLR_EL1.M = 0
> +		 *
> +		 * The TGE part is interesting. If we have decided that this
> +		 * is EL1&0, then it means that either {E2H,TGE} == {1,0} or
> +		 * {0,x}, and we only need to test for TGE == 1.
> +		 */
> +		if (__vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_DC | HCR_TGE))
>  			wr->level = S1_MMU_DISABLED;

The condition looks good now.

>  		fallthrough;
>  	case TR_EL2:
> 
> [...]
> 
> >
> > 	switch (desc & GENMASK_ULL(1, 0)) {
> > 	case 0b00:
> > 	case 0b10:
> > 		goto transfault;
> > 	case 0b01:
> > 		/* Block mapping */
> > 		break;
> > 	default:
> > 		if (level == 3)
> > 			break;
> > 	}
> > 
> > Is this better? Perhaps slightly easier to match against the descriptor layouts,
> > but I'm not sure it's an improvement over your suggestion. Up to you, no point
> > in bikeshedding over it.
> 
> I think I'll leave it as is for now. I'm getting sick of this code...

Agreed!

Thanks,
Alex
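
As a side note on the descriptor decode discussed in the thread above: both
the version in the patch and the suggested switch implement the same
classification of the low two descriptor bits. A self-contained sketch of
that classification (illustrative only, not part of the patch; the
decode_desc() helper and the enum are invented names):

enum desc_type { DESC_INVALID, DESC_BLOCK, DESC_TABLE, DESC_PAGE };

static enum desc_type decode_desc(unsigned long long desc, int level)
{
	switch (desc & 3) {
	case 0b00:
	case 0b10:
		/* bit[0] clear: no mapping, translation fault */
		return DESC_INVALID;
	case 0b01:
		/* 0b01 is a block mapping, which is not valid at level 3 */
		return (level == 3) ? DESC_INVALID : DESC_BLOCK;
	default:
		/* 0b11: table descriptor at levels < 3, page descriptor at level 3 */
		return (level == 3) ? DESC_PAGE : DESC_TABLE;
	}
}

Whether a block is actually permitted at a given level also depends on the
granule size; the patch checks that separately once the walk loop exits.
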
diff mbox series

Patch

diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index 9865d29b3149..8e1f0837e309 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -4,9 +4,372 @@ 
  * Author: Jintack Lim <jintack.lim@linaro.org>
  */
 
+#include <linux/kvm_host.h>
+
+#include <asm/esr.h>
 #include <asm/kvm_hyp.h>
 #include <asm/kvm_mmu.h>
 
+enum trans_regime {
+	TR_EL10,
+	TR_EL20,
+	TR_EL2,
+};
+
+struct s1_walk_info {
+	u64	     		baddr;
+	enum trans_regime	regime;
+	unsigned int		max_oa_bits;
+	unsigned int		pgshift;
+	unsigned int		txsz;
+	int 	     		sl;
+	bool	     		hpd;
+	bool	     		be;
+	bool	     		s2;
+};
+
+struct s1_walk_result {
+	union {
+		struct {
+			u64	desc;
+			u64	pa;
+			s8	level;
+			u8	APTable;
+			bool	UXNTable;
+			bool	PXNTable;
+		};
+		struct {
+			u8	fst;
+			bool	ptw;
+			bool	s2;
+		};
+	};
+	bool	failed;
+};
+
+static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2)
+{
+	wr->fst		= fst;
+	wr->ptw		= ptw;
+	wr->s2		= s2;
+	wr->failed	= true;
+}
+
+#define S1_MMU_DISABLED		(-127)
+
+static int get_ia_size(struct s1_walk_info *wi)
+{
+	return 64 - wi->txsz;
+}
+
+/* Return true if the IPA is out of the OA range */
+static bool check_output_size(u64 ipa, struct s1_walk_info *wi)
+{
+	return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits));
+}
+
+/* Return the translation regime that applies to an AT instruction */
+static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op)
+{
+	/*
+	 * We only get here from guest EL2, so the translation
+	 * regime AT applies to is solely defined by {E2H,TGE}.
+	 */
+	switch (op) {
+	case OP_AT_S1E2R:
+	case OP_AT_S1E2W:
+		return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
+		break;
+	default:
+		return (vcpu_el2_e2h_is_set(vcpu) &&
+			vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10;
+	}
+}
+
+static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
+			 struct s1_walk_result *wr, u64 va)
+{
+	u64 sctlr, tcr, tg, ps, ia_bits, ttbr;
+	unsigned int stride, x;
+	bool va55, tbi, lva, as_el0;
+
+	wi->regime = compute_translation_regime(vcpu, op);
+	as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);
+
+	va55 = va & BIT(55);
+
+	if (wi->regime == TR_EL2 && va55)
+		goto addrsz;
+
+	wi->s2 = wi->regime == TR_EL10 && (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_VM);
+
+	switch (wi->regime) {
+	case TR_EL10:
+		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL1);
+		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL1);
+		ttbr	= (va55 ?
+			   vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
+			   vcpu_read_sys_reg(vcpu, TTBR0_EL1));
+		break;
+	case TR_EL2:
+	case TR_EL20:
+		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL2);
+		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL2);
+		ttbr	= (va55 ?
+			   vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
+			   vcpu_read_sys_reg(vcpu, TTBR0_EL2));
+		break;
+	default:
+		BUG();
+	}
+
+	tbi = (wi->regime == TR_EL2 ?
+	       FIELD_GET(TCR_EL2_TBI, tcr) :
+	       (va55 ?
+		FIELD_GET(TCR_TBI1, tcr) :
+		FIELD_GET(TCR_TBI0, tcr)));
+
+	if (!tbi && (u64)sign_extend64(va, 55) != va)
+		goto addrsz;
+
+	va = (u64)sign_extend64(va, 55);
+
+	/* Let's put the MMU disabled case aside immediately */
+	if (!(sctlr & SCTLR_ELx_M) ||
+	    (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
+		if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
+			goto addrsz;
+
+		wr->level = S1_MMU_DISABLED;
+		wr->pa = va;
+		return 0;
+	}
+
+	wi->be = sctlr & SCTLR_ELx_EE;
+
+	wi->hpd  = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP);
+	wi->hpd &= (wi->regime == TR_EL2 ?
+		    FIELD_GET(TCR_EL2_HPD, tcr) :
+		    (va55 ?
+		     FIELD_GET(TCR_HPD1, tcr) :
+		     FIELD_GET(TCR_HPD0, tcr)));
+
+	/* Someone was silly enough to encode TG0/TG1 differently */
+	if (va55) {
+		wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
+		tg = FIELD_GET(TCR_TG1_MASK, tcr);
+
+		switch (tg << TCR_TG1_SHIFT) {
+		case TCR_TG1_4K:
+			wi->pgshift = 12;	 break;
+		case TCR_TG1_16K:
+			wi->pgshift = 14;	 break;
+		case TCR_TG1_64K:
+		default:	    /* IMPDEF: treat any other value as 64k */
+			wi->pgshift = 16;	 break;
+		}
+	} else {
+		wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
+		tg = FIELD_GET(TCR_TG0_MASK, tcr);
+
+		switch (tg << TCR_TG0_SHIFT) {
+		case TCR_TG0_4K:
+			wi->pgshift = 12;	 break;
+		case TCR_TG0_16K:
+			wi->pgshift = 14;	 break;
+		case TCR_TG0_64K:
+		default:	    /* IMPDEF: treat any other value as 64k */
+			wi->pgshift = 16;	 break;
+		}
+	}
+
+	/* R_PLCGL, R_YXNYW */
+	if ((!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47) &&
+	     wi->txsz > 39) ||
+	    (wi->pgshift == 16 && wi->txsz > 47) || wi->txsz > 48)
+		goto transfault_l0;
+
+	/* R_GTJBY, R_SXWGM */
+	switch (BIT(wi->pgshift)) {
+	case SZ_4K:
+		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT);
+		lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS);
+		break;
+	case SZ_16K:
+		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT);
+		lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS);
+		break;
+	case SZ_64K:
+		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52);
+		break;
+	}
+
+	if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16))
+		goto transfault_l0;
+
+	ia_bits = get_ia_size(wi);
+
+	/* R_YYVYV, I_THCZK */
+	if ((!va55 && va > GENMASK(ia_bits - 1, 0)) ||
+	    (va55 && va < GENMASK(63, ia_bits)))
+		goto transfault_l0;
+
+	/* R_ZFSYQ */
+	if (wi->regime != TR_EL2 &&
+	    (tcr & ((va55) ? TCR_EPD1_MASK : TCR_EPD0_MASK)))
+		goto transfault_l0;
+
+	/* R_BNDVG and following statements */
+	if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) &&
+	    as_el0 && (tcr & ((va55) ? TCR_E0PD1 : TCR_E0PD0)))
+		goto transfault_l0;
+
+	/* AArch64.S1StartLevel() */
+	stride = wi->pgshift - 3;
+	wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);
+
+	ps = (wi->regime == TR_EL2 ?
+	      FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));
+
+	wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps));
+
+	/* Compute minimal alignment */
+	x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);
+
+	wi->baddr = ttbr & TTBRx_EL1_BADDR;
+
+	/* R_VPBBF */
+	if (check_output_size(wi->baddr, wi))
+		goto transfault_l0;
+
+	wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);
+
+	return 0;
+
+addrsz:				/* Address Size Fault level 0 */
+	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false, false);
+	return -EFAULT;
+
+transfault_l0:			/* Translation Fault level 0 */
+	fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false, false);
+	return -EFAULT;
+}
+
+static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
+		   struct s1_walk_result *wr, u64 va)
+{
+	u64 va_top, va_bottom, baddr, desc;
+	int level, stride, ret;
+
+	level = wi->sl;
+	stride = wi->pgshift - 3;
+	baddr = wi->baddr;
+
+	va_top = get_ia_size(wi) - 1;
+
+	while (1) {
+		u64 index, ipa;
+
+		va_bottom = (3 - level) * stride + wi->pgshift;
+		index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);
+
+		ipa = baddr | index;
+
+		if (wi->s2) {
+			struct kvm_s2_trans s2_trans = {};
+
+			ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans);
+			if (ret) {
+				fail_s1_walk(wr,
+					     (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level,
+					     true, true);
+				return ret;
+			}
+
+			if (!kvm_s2_trans_readable(&s2_trans)) {
+				fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level),
+					     true, true);
+
+				return -EPERM;
+			}
+
+			ipa = kvm_s2_trans_output(&s2_trans);
+		}
+
+		ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc));
+		if (ret) {
+			fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level),
+				     true, false);
+			return ret;
+		}
+
+		if (wi->be)
+			desc = be64_to_cpu((__force __be64)desc);
+		else
+			desc = le64_to_cpu((__force __le64)desc);
+
+		if (!(desc & 1) || ((desc & 3) == 1 && level == 3))
+			goto transfault;
+
+		/* We found a leaf, handle that */
+		if ((desc & 3) == 1 || level == 3)
+			break;
+
+		if (!wi->hpd) {
+			wr->APTable  |= FIELD_GET(S1_TABLE_AP, desc);
+			wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);
+			wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc);
+		}
+
+		baddr = desc & GENMASK_ULL(47, wi->pgshift);
+
+		/* Check for out-of-range OA */
+		if (check_output_size(baddr, wi))
+			goto addrsz;
+
+		/* Prepare for next round */
+		va_top = va_bottom - 1;
+		level++;
+	}
+
+	/* Block mapping, check the validity of the level */
+	if (!(desc & BIT(1))) {
+		bool valid_block = false;
+
+		switch (BIT(wi->pgshift)) {
+		case SZ_4K:
+			valid_block = level == 1 || level == 2;
+			break;
+		case SZ_16K:
+		case SZ_64K:
+			valid_block = level == 2;
+			break;
+		}
+
+		if (!valid_block)
+			goto transfault;
+	}
+
+	if (check_output_size(desc & GENMASK(47, va_bottom), wi))
+		goto addrsz;
+
+	wr->failed = false;
+	wr->level = level;
+	wr->desc = desc;
+	wr->pa = desc & GENMASK(47, va_bottom);
+	if (va_bottom > 12)
+		wr->pa |= va & GENMASK_ULL(va_bottom - 1, 12);
+
+	return 0;
+
+addrsz:
+	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), true, false);
+	return -EINVAL;
+transfault:
+	fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), true, false);
+	return -ENOENT;
+}
+
 struct mmu_config {
 	u64	ttbr0;
 	u64	ttbr1;
@@ -188,6 +551,16 @@  static u8 compute_sh(u8 attr, u64 desc)
 	return sh;
 }
 
+static u8 combine_sh(u8 s1_sh, u8 s2_sh)
+{
+	if (s1_sh == ATTR_OSH || s2_sh == ATTR_OSH)
+		return ATTR_OSH;
+	if (s1_sh == ATTR_ISH || s2_sh == ATTR_ISH)
+		return ATTR_ISH;
+
+	return ATTR_NSH;
+}
+
 static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
 			   struct kvm_s2_trans *tr)
 {
@@ -260,11 +633,181 @@  static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
 	par  = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr);
 	par |= tr->output & GENMASK(47, 12);
 	par |= FIELD_PREP(SYS_PAR_EL1_SH,
-			  compute_sh(final_attr, tr->desc));
+			  combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par),
+				     compute_sh(final_attr, tr->desc)));
 
 	return par;
 }
 
+static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr,
+			  enum trans_regime regime)
+{
+	u64 par;
+
+	if (wr->failed) {
+		par = SYS_PAR_EL1_RES1;
+		par |= SYS_PAR_EL1_F;
+		par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst);
+		par |= wr->ptw ? SYS_PAR_EL1_PTW : 0;
+		par |= wr->s2 ? SYS_PAR_EL1_S : 0;
+	} else if (wr->level == S1_MMU_DISABLED) {
+		/* MMU off or HCR_EL2.DC == 1 */
+		par = wr->pa & GENMASK_ULL(47, 12);
+
+		if (!(__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
+			par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */
+			par |= FIELD_PREP(SYS_PAR_EL1_SH, 0b10); /* OS */
+		} else {
+			par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
+					  MEMATTR(WbRaWa, WbRaWa));
+			par |= FIELD_PREP(SYS_PAR_EL1_SH, 0b00); /* NS */
+		}
+	} else {
+		u64 mair, sctlr;
+		u8 sh;
+
+		mair = (regime == TR_EL10 ?
+			vcpu_read_sys_reg(vcpu, MAIR_EL1) :
+			vcpu_read_sys_reg(vcpu, MAIR_EL2));
+
+		mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
+		mair &= 0xff;
+
+		sctlr = (regime == TR_EL10 ?
+			 vcpu_read_sys_reg(vcpu, SCTLR_EL1) :
+			 vcpu_read_sys_reg(vcpu, SCTLR_EL2));
+
+		/* Force NC for memory if SCTLR_ELx.C is clear */
+		if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair))
+			mair = MEMATTR(NC, NC);
+
+		par  = FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
+		par |= wr->pa & GENMASK_ULL(47, 12);
+
+		sh = compute_sh(mair, wr->desc);
+		par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
+	}
+
+	return par;
+}
+
+static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+{
+	bool perm_fail, ur, uw, ux, pr, pw, px;
+	struct s1_walk_result wr = {};
+	struct s1_walk_info wi = {};
+	int ret, idx;
+
+	ret = setup_s1_walk(vcpu, op, &wi, &wr, vaddr);
+	if (ret)
+		goto compute_par;
+
+	if (wr.level == S1_MMU_DISABLED)
+		goto compute_par;
+
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+	ret = walk_s1(vcpu, &wi, &wr, vaddr);
+
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+	if (ret)
+		goto compute_par;
+
+	/* FIXME: revisit when adding indirect permission support */
+	/* AArch64.S1DirectBasePermissions() */
+	if (wi.regime != TR_EL2) {
+		switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr.desc)) {
+		case 0b00:
+			pr = pw = true;
+			ur = uw = false;
+			break;
+		case 0b01:
+			pr = pw = ur = uw = true;
+			break;
+		case 0b10:
+			pr = true;
+			pw = ur = uw = false;
+			break;
+		case 0b11:
+			pr = ur = true;
+			pw = uw = false;
+			break;
+		}
+
+		switch (wr.APTable) {
+		case 0b00:
+			break;
+		case 0b01:
+			ur = uw = false;
+			break;
+		case 0b10:
+			pw = uw = false;
+			break;
+		case 0b11:
+			pw = ur = uw = false;
+			break;
+		}
+
+		/* We don't use px for anything yet, but hey... */
+		px = !((wr.desc & PTE_PXN) || wr.PXNTable || pw);
+		ux = !((wr.desc & PTE_UXN) || wr.UXNTable);
+
+		if (op == OP_AT_S1E1RP || op == OP_AT_S1E1WP) {
+			bool pan;
+
+			pan = *vcpu_cpsr(vcpu) & PSR_PAN_BIT;
+			pan &= ur || uw;
+			pw &= !pan;
+			pr &= !pan;
+		}
+	} else {
+		ur = uw = ux = false;
+
+		if (!(wr.desc & PTE_RDONLY)) {
+			pr = pw = true;
+		} else {
+			pr = true;
+			pw = false;
+		}
+
+		if (wr.APTable & BIT(1))
+			pw = false;
+
+		/* XN maps to UXN */
+		px = !((wr.desc & PTE_UXN) || wr.UXNTable);
+	}
+
+	perm_fail = false;
+
+	switch (op) {
+	case OP_AT_S1E1RP:
+	case OP_AT_S1E1R:
+	case OP_AT_S1E2R:
+		perm_fail = !pr;
+		break;
+	case OP_AT_S1E1WP:
+	case OP_AT_S1E1W:
+	case OP_AT_S1E2W:
+		perm_fail = !pw;
+		break;
+	case OP_AT_S1E0R:
+		perm_fail = !ur;
+		break;
+	case OP_AT_S1E0W:
+		perm_fail = !uw;
+		break;
+	default:
+		BUG();
+	}
+
+	if (perm_fail)
+		fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false, false);
+
+compute_par:
+	return compute_par_s1(vcpu, &wr, wi.regime);
+}
+
 /*
  * Return the PAR_EL1 value as the result of a valid translation.
  *
@@ -352,10 +895,26 @@  static u64 __kvm_at_s1e01_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
 	return par;
 }
 
+static bool par_check_s1_perm_fault(u64 par)
+{
+	u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
+
+	return  ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM &&
+		 !(par & SYS_PAR_EL1_S));
+}
+
 void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
 {
 	u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);
 
+	/*
+	 * If we see a permission fault at S1, then the fast path
+	 * succeeded. By failing. Otherwise, take a walk on the wild
+	 * side.
+	 */
+	if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
+		par = handle_at_slow(vcpu, op, vaddr);
+
 	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
 }
 
@@ -407,6 +966,10 @@  void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
 		isb();
 	}
 
+	/* We failed the translation, let's replay it in slow motion */
+	if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
+		par = handle_at_slow(vcpu, op, vaddr);
+
 	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
 }
 
@@ -463,7 +1026,7 @@  void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
 	/* Check the access permission */
 	if (!out.esr &&
 	    ((!write && !out.readable) || (write && !out.writable)))
-		out.esr = ESR_ELx_FSC_PERM | (out.level & 0x3);
+		out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3);
 
 	par = compute_par_s12(vcpu, par, &out);
 	vcpu_write_sys_reg(vcpu, par, PAR_EL1);