[10/12] KVM: arm64: nv: Add SW walker for AT S1 emulation

Message ID 20240708165800.1220065-1-maz@kernel.org (mailing list archive)
State New, archived
Series KVM: arm64: nv: Add support for address translation instructions

Commit Message

Marc Zyngier July 8, 2024, 4:57 p.m. UTC
In order to plug the brokenness of our current AT implementation,
we need a SW walker that is going to... err.. walk the S1 tables
and tell us what it finds.

Of course, it builds on top of our S2 walker, and shares similar
concepts. The beauty of it is that since it uses kvm_read_guest(),
it is able to bring back pages that have been otherwise evicted.

This is then plugged into the two AT S1 emulation functions as
a "slow path" fallback. I'm not sure it is that slow, but hey.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/at.c | 538 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 520 insertions(+), 18 deletions(-)

Comments

Alexandru Elisei July 10, 2024, 3:12 p.m. UTC | #1
Hi Marc,

On Mon, Jul 08, 2024 at 05:57:58PM +0100, Marc Zyngier wrote:
> In order to plug the brokenness of our current AT implementation,
> we need a SW walker that is going to... err.. walk the S1 tables
> and tell us what it finds.
> 
> Of course, it builds on top of our S2 walker, and shares similar
> concepts. The beauty of it is that since it uses kvm_read_guest(),
> it is able to bring back pages that have been otherwise evicted.
> 
> This is then plugged into the two AT S1 emulation functions as
> a "slow path" fallback. I'm not sure it is that slow, but hey.
> 
> Signed-off-by: Marc Zyngier <maz@kernel.org>
> [..]
> @@ -331,18 +801,17 @@ void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
>  	}
>  
>  	if (!fail)
> -		par = read_sysreg(par_el1);
> +		par = read_sysreg_par();
>  	else
>  		par = SYS_PAR_EL1_F;
>  
> +	retry_slow = !fail;
> +
>  	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
>  
>  	/*
> -	 * Failed? let's leave the building now.
> -	 *
> -	 * FIXME: how about a failed translation because the shadow S2
> -	 * wasn't populated? We may need to perform a SW PTW,
> -	 * populating our shadow S2 and retry the instruction.
> +	 * Failed? let's leave the building now, unless we retry on
> +	 * the slow path.
>  	 */
>  	if (par & SYS_PAR_EL1_F)
>  		goto nopan;

This is what follows after the 'if' statement above, and before the 'switch'
below:

        /* No PAN? No problem. */
        if (!(*vcpu_cpsr(vcpu) & PSR_PAN_BIT))
                goto nopan;

When KVM is executing this statement, the following is true:

1. SYS_PAR_EL1_F is clear => the hardware translation table walk was successful.
2. retry_slow = true;

Then if the PAN bit is not set, the function jumps to the nopan label, and
performs a software translation table walk, even though the hardware walk
performed by AT was successful.
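
Condensing the relevant lines (comments mine), the path I'm worried about is:

        retry_slow = !fail;                     /* true: the HW walk succeeded */
        vcpu_write_sys_reg(vcpu, par, PAR_EL1);

        if (par & SYS_PAR_EL1_F)                /* false: no fault reported */
                goto nopan;

        /* No PAN? No problem. */
        if (!(*vcpu_cpsr(vcpu) & PSR_PAN_BIT))  /* PAN clear */
                goto nopan;                     /* ... with retry_slow still true */
        [..]
nopan:
        [..]
        if (retry_slow)                         /* SW walk despite the HW success */
                par = handle_at_slow(vcpu, op, vaddr);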

> @@ -354,29 +823,58 @@ void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
>  	switch (op) {
>  	case OP_AT_S1E1RP:
>  	case OP_AT_S1E1WP:
> +		retry_slow = false;
>  		fail = check_at_pan(vcpu, vaddr, &par);
>  		break;
>  	default:
>  		goto nopan;
>  	}
>  
> +	if (fail) {
> +		vcpu_write_sys_reg(vcpu, SYS_PAR_EL1_F, PAR_EL1);
> +		goto nopan;
> +	}
> +
>  	/*
>  	 * If the EL0 translation has succeeded, we need to pretend
>  	 * the AT operation has failed, as the PAN setting forbids
>  	 * such a translation.
> -	 *
> -	 * FIXME: we hardcode a Level-3 permission fault. We really
> -	 * should return the real fault level.
>  	 */
> -	if (fail || !(par & SYS_PAR_EL1_F))
> -		vcpu_write_sys_reg(vcpu, (0xf << 1) | SYS_PAR_EL1_F, PAR_EL1);
> -
> +	if (par & SYS_PAR_EL1_F) {
> +		u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
> +
> +		/*
> +		 * If we get something other than a permission fault, we
> +		 * need to retry, as we're likely to have missed in the PTs.
> +		 */
> +		if ((fst & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_PERM)
> +			retry_slow = true;

Shouldn't the VCPU's PAR_EL1 register be updated here? As far as I can tell, at this
point the VCPU's PAR_EL1 register has the result from the successful walk
performed by AT S1E1R or AT S1E1W in the first 'switch' statement.

Thanks,
Alex

> +	} else {
> +		/*
> +		 * The EL0 access succeeded, but we don't have the full
> +		 * syndrome information to synthesize the failure. Go slow.
> +		 */
> +		retry_slow = true;
> +	}
>  nopan:
>  	__mmu_config_restore(&config);
>  out:
>  	local_irq_restore(flags);
>  
>  	write_unlock(&vcpu->kvm->mmu_lock);
> +
> +	/*
> +	 * If retry_slow is true, then we either are missing shadow S2
> +	 * entries, have paged out guest S1, or something is inconsistent.
> +	 *
> +	 * Either way, we need to walk the PTs by hand so that we can either
> +	 * fault things back in, or record accurate fault information along
> +	 * the way.
> +	 */
> +	if (retry_slow) {
> +		par = handle_at_slow(vcpu, op, vaddr);
> +		vcpu_write_sys_reg(vcpu, par, PAR_EL1);
> +	}
>  }
Marc Zyngier July 11, 2024, 8:05 a.m. UTC | #2
On Wed, 10 Jul 2024 16:12:53 +0100,
Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> 
> Hi Marc,
> 
> On Mon, Jul 08, 2024 at 05:57:58PM +0100, Marc Zyngier wrote:
> > In order to plug the brokenness of our current AT implementation,
> > we need a SW walker that is going to... err.. walk the S1 tables
> > and tell us what it finds.
> > 
> > Of course, it builds on top of our S2 walker, and shares similar
> > concepts. The beauty of it is that since it uses kvm_read_guest(),
> > it is able to bring back pages that have been otherwise evicted.
> > 
> > This is then plugged into the two AT S1 emulation functions as
> > a "slow path" fallback. I'm not sure it is that slow, but hey.
> > 
> > Signed-off-by: Marc Zyngier <maz@kernel.org>
> > [..]
> > @@ -331,18 +801,17 @@ void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
> >  	}
> >  
> >  	if (!fail)
> > -		par = read_sysreg(par_el1);
> > +		par = read_sysreg_par();
> >  	else
> >  		par = SYS_PAR_EL1_F;
> >  
> > +	retry_slow = !fail;
> > +
> >  	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
> >  
> >  	/*
> > -	 * Failed? let's leave the building now.
> > -	 *
> > -	 * FIXME: how about a failed translation because the shadow S2
> > -	 * wasn't populated? We may need to perform a SW PTW,
> > -	 * populating our shadow S2 and retry the instruction.
> > +	 * Failed? let's leave the building now, unless we retry on
> > +	 * the slow path.
> >  	 */
> >  	if (par & SYS_PAR_EL1_F)
> >  		goto nopan;
> 
> This is what follows after the 'if' statement above, and before the 'switch'
> below:
> 
>         /* No PAN? No problem. */
>         if (!(*vcpu_cpsr(vcpu) & PSR_PAN_BIT))
>                 goto nopan;
> 
> When KVM is executing this statement, the following is true:
> 
> 1. SYS_PAR_EL1_F is clear => the hardware translation table walk was successful.
> 2. retry_slow = true;
>
> Then if the PAN bit is not set, the function jumps to the nopan label, and
> performs a software translation table walk, even though the hardware walk
> performed by AT was successful.

Hmmm. Are you being polite and trying to avoid saying that this code
is broken and that I should look for a retirement home instead?
There, I've said it for you! ;-)

The more I stare at this code, the more I hate it. Trying to
interleave the replay condition with the many potential failure modes
of the HW walker feels completely wrong, and I feel that I'd better
split the whole thing in two:

void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	__kvm_at_s1e01_hw(vcpu, vaddr);
	if (vcpu_read_sys_reg(vcpu, PAR_EL1) & SYS_PAR_F)
		__kvm_at_s1e01_sw(vcpu, vaddr);
}

and completely stop messing with things. This is AT S1 we're talking
about, not something that runs at any sort of high frequency. Apart from
Xen. But as Butch said: "Xen's dead, baby. Xen's dead.".

> 
> > @@ -354,29 +823,58 @@ void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
> >  	switch (op) {
> >  	case OP_AT_S1E1RP:
> >  	case OP_AT_S1E1WP:
> > +		retry_slow = false;
> >  		fail = check_at_pan(vcpu, vaddr, &par);
> >  		break;
> >  	default:
> >  		goto nopan;
> >  	}
> >  
> > +	if (fail) {
> > +		vcpu_write_sys_reg(vcpu, SYS_PAR_EL1_F, PAR_EL1);
> > +		goto nopan;
> > +	}
> > +
> >  	/*
> >  	 * If the EL0 translation has succeeded, we need to pretend
> >  	 * the AT operation has failed, as the PAN setting forbids
> >  	 * such a translation.
> > -	 *
> > -	 * FIXME: we hardcode a Level-3 permission fault. We really
> > -	 * should return the real fault level.
> >  	 */
> > -	if (fail || !(par & SYS_PAR_EL1_F))
> > -		vcpu_write_sys_reg(vcpu, (0xf << 1) | SYS_PAR_EL1_F, PAR_EL1);
> > -
> > +	if (par & SYS_PAR_EL1_F) {
> > +		u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
> > +
> > +		/*
> > +		 * If we get something other than a permission fault, we
> > +		 * need to retry, as we're likely to have missed in the PTs.
> > +		 */
> > +		if ((fst & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_PERM)
> > +			retry_slow = true;
> 
> Shouldn't the VCPU's PAR_EL1 register be updated here? As far as I can tell, at this
> point the VCPU's PAR_EL1 register has the result from the successful walk
> performed by AT S1E1R or AT S1E1W in the first 'switch' statement.

Yup, yet another sign that this flow is broken. I'll apply my last few
grey cells to it, and hopefully the next iteration will be a bit
better.

Thanks a lot for having a look!

	M.
Alexandru Elisei July 11, 2024, 10:56 a.m. UTC | #3
Hi,

On Mon, Jul 08, 2024 at 05:57:58PM +0100, Marc Zyngier wrote:
> In order to plug the brokenness of our current AT implementation,
> we need a SW walker that is going to... err.. walk the S1 tables
> and tell us what it finds.
> 
> Of course, it builds on top of our S2 walker, and shares similar
> concepts. The beauty of it is that since it uses kvm_read_guest(),
> it is able to bring back pages that have been otherwise evicted.
> 
> This is then plugged into the two AT S1 emulation functions as
> a "slow path" fallback. I'm not sure it is that slow, but hey.
> [..]
>  	switch (op) {
>  	case OP_AT_S1E1RP:
>  	case OP_AT_S1E1WP:
> +		retry_slow = false;
>  		fail = check_at_pan(vcpu, vaddr, &par);
>  		break;
>  	default:
>  		goto nopan;
>  	}

For context, this is what check_at_pan() does:

static int check_at_pan(struct kvm_vcpu *vcpu, u64 vaddr, u64 *res)
{
        u64 par_e0;
        int error;

        /*
         * For PAN-involved AT operations, perform the same translation,
         * using EL0 this time. Twice. Much fun.
         */
        error = __kvm_at(OP_AT_S1E0R, vaddr);
        if (error)
                return error;

        par_e0 = read_sysreg_par();
        if (!(par_e0 & SYS_PAR_EL1_F))
                goto out;

        error = __kvm_at(OP_AT_S1E0W, vaddr);
        if (error)
                return error;

        par_e0 = read_sysreg_par();
out:
        *res = par_e0;
        return 0;
}

I'm having a hard time understanding why KVM is doing both AT S1E0R and AT S1E0W
regardless of the type of the access (read/write) in the PAN-aware AT
instruction. Would you mind elaborating on that?

> +	if (fail) {
> +		vcpu_write_sys_reg(vcpu, SYS_PAR_EL1_F, PAR_EL1);
> +		goto nopan;
> +	}
> [..]
> +	if (par & SYS_PAR_EL1_F) {
> +		u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
> +
> +		/*
> +		 * If we get something other than a permission fault, we
> +		 * need to retry, as we're likely to have missed in the PTs.
> +		 */
> +		if ((fst & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_PERM)
> +			retry_slow = true;
> +	} else {
> +		/*
> +		 * The EL0 access succeeded, but we don't have the full
> +		 * syndrome information to synthesize the failure. Go slow.
> +		 */
> +		retry_slow = true;
> +	}

This is what PSTATE.PAN controls:

If the Effective value of PSTATE.PAN is 1, then a privileged data access from
any of the following Exception levels to a virtual memory address that is
accessible to data accesses at EL0 generates a stage 1 Permission fault:

- A privileged data access from EL1.
- If HCR_EL2.E2H is 1, then a privileged data access from EL2.

With that in mind, I am really struggling to understand the logic.

If AT S1E0{R,W} (from check_at_pan()) failed, doesn't that mean that the virtual
memory address is not accessible to EL0? Add that to the fact that the AT
S1E1{R,W} (from the beginning of __kvm_at_s1e01()) succeeded, doesn't that mean
that AT S1E1{R,W}P should succeed, and furthermore the PAR_EL1 value should be
the one KVM got from AT S1E1{R,W}?

Thanks,
Alex

>  nopan:
>  	__mmu_config_restore(&config);
>  out:
>  	local_irq_restore(flags);
>  
>  	write_unlock(&vcpu->kvm->mmu_lock);
> +
> +	/*
> +	 * If retry_slow is true, then we either are missing shadow S2
> +	 * entries, have paged out guest S1, or something is inconsistent.
> +	 *
> +	 * Either way, we need to walk the PTs by hand so that we can either
> +	 * fault things back in, or record accurate fault information along
> +	 * the way.
> +	 */
> +	if (retry_slow) {
> +		par = handle_at_slow(vcpu, op, vaddr);
> +		vcpu_write_sys_reg(vcpu, par, PAR_EL1);
> +	}
>  }
>  
>  void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
> @@ -433,6 +931,10 @@ void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
>  
>  	write_unlock(&vcpu->kvm->mmu_lock);
>  
> +	/* We failed the translation, let's replay it in slow motion */
> +	if (!fail && (par & SYS_PAR_EL1_F))
> +		par = handle_at_slow(vcpu, op, vaddr);
> +
>  	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
>  }
>  
> -- 
> 2.39.2
> 
>
Marc Zyngier July 11, 2024, 12:16 p.m. UTC | #4
On Thu, 11 Jul 2024 11:56:13 +0100,
Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> 
> Hi,
> 
> On Mon, Jul 08, 2024 at 05:57:58PM +0100, Marc Zyngier wrote:
> > In order to plug the brokenness of our current AT implementation,
> > we need a SW walker that is going to... err.. walk the S1 tables
> > and tell us what it finds.
> > 
> > Of course, it builds on top of our S2 walker, and shares similar
> > concepts. The beauty of it is that since it uses kvm_read_guest(),
> > it is able to bring back pages that have been otherwise evicted.
> > 
> > This is then plugged into the two AT S1 emulation functions as
> > a "slow path" fallback. I'm not sure it is that slow, but hey.
> > [..]
> >  	switch (op) {
> >  	case OP_AT_S1E1RP:
> >  	case OP_AT_S1E1WP:
> > +		retry_slow = false;
> >  		fail = check_at_pan(vcpu, vaddr, &par);
> >  		break;
> >  	default:
> >  		goto nopan;
> >  	}
> 
> For context, this is what check_at_pan() does:
> 
> static int check_at_pan(struct kvm_vcpu *vcpu, u64 vaddr, u64 *res)
> {
>         u64 par_e0;
>         int error;
> 
>         /*
>          * For PAN-involved AT operations, perform the same translation,
>          * using EL0 this time. Twice. Much fun.
>          */
>         error = __kvm_at(OP_AT_S1E0R, vaddr);
>         if (error)
>                 return error;
> 
>         par_e0 = read_sysreg_par();
>         if (!(par_e0 & SYS_PAR_EL1_F))
>                 goto out;
> 
>         error = __kvm_at(OP_AT_S1E0W, vaddr);
>         if (error)
>                 return error;
> 
>         par_e0 = read_sysreg_par();
> out:
>         *res = par_e0;
>         return 0;
> }
> 
> I'm having a hard time understanding why KVM is doing both AT S1E0R and AT S1E0W
> regardless of the type of the access (read/write) in the PAN-aware AT
> instruction. Would you mind elaborating on that?

Because that's the very definition of an AT S1E1{W,R}P instruction
when PAN is set. If *any* EL0 permission is set, then the translation
must equally fail. Just like a load or a store from EL1 would fail if
any EL0 permission is set when PSTATE.PAN is set.

Since we cannot check for both permissions at once, we do it twice.
It is worth noting that we don't quite handle the PAN3 case correctly
(because we can't retrieve the *execution* property using AT). I'll
add that to the list of stuff to fix.

> 
> > +	if (fail) {
> > +		vcpu_write_sys_reg(vcpu, SYS_PAR_EL1_F, PAR_EL1);
> > +		goto nopan;
> > +	}
> > [..]
> > +	if (par & SYS_PAR_EL1_F) {
> > +		u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
> > +
> > +		/*
> > +		 * If we get something other than a permission fault, we
> > +		 * need to retry, as we're likely to have missed in the PTs.
> > +		 */
> > +		if ((fst & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_PERM)
> > +			retry_slow = true;
> > +	} else {
> > +		/*
> > +		 * The EL0 access succeeded, but we don't have the full
> > +		 * syndrome information to synthesize the failure. Go slow.
> > +		 */
> > +		retry_slow = true;
> > +	}
> 
> This is what PSTATE.PAN controls:
> 
> If the Effective value of PSTATE.PAN is 1, then a privileged data access from
> any of the following Exception levels to a virtual memory address that is
> accessible to data accesses at EL0 generates a stage 1 Permission fault:
> 
> - A privileged data access from EL1.
> - If HCR_EL2.E2H is 1, then a privileged data access from EL2.
> 
> With that in mind, I am really struggling to understand the logic.

I don't quite see what you don't understand, you'll have to be more
precise. Are you worried about the page tables we're looking at, the
value of PSTATE.PAN, the permission fault, or something else?

It also doesn't help that you're looking at the patch that contains
the integration with the slow-path, which is pretty hard to read (I
have a reworked version that's a bit better). You probably want to
look at the "fast" path alone.

> 
> If AT S1E0{R,W} (from check_at_pan()) failed, doesn't that mean that the virtual
> memory address is not accessible to EL0? Add that to the fact that the AT
> S1E1{R,W} (from the beginning of __kvm_at_s1e01()) succeeded, doesn't that mean
> that AT S1E1{R,W}P should succeed, and furthermore the PAR_EL1 value should be
> the one KVM got from AT S1E1{R,W}?

There are plenty of ways for AT S1E0 to fail when AT S1E1 succeeded:

- no EL0 permission: that's the best case, and the PAR_EL1 obtained
  from the AT S1E1 is the correct one. That's what we return.

- The EL0 access failed, but for a reason other than a permission
  fault. This contradicts the EL1 walk, and is a sure sign that
  someone is playing behind our back. We fail.

- exception from AT S1E0: something went wrong (again the guest
  playing with the PTs behind our back). We fail as well.

Do you at least agree with these as goals? If you do, what in
the implementation does not satisfy these goals? If you don't, what in
these goals seems improper to you?

Thanks,

	M.
Alexandru Elisei July 15, 2024, 3:30 p.m. UTC | #5
Hi Marc,

On Thu, Jul 11, 2024 at 01:16:42PM +0100, Marc Zyngier wrote:
> On Thu, 11 Jul 2024 11:56:13 +0100,
> Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> > 
> > Hi,
> > 
> > On Mon, Jul 08, 2024 at 05:57:58PM +0100, Marc Zyngier wrote:
> > > In order to plug the brokenness of our current AT implementation,
> > > we need a SW walker that is going to... err.. walk the S1 tables
> > > and tell us what it finds.
> > > 
> > > Of course, it builds on top of our S2 walker, and shares similar
> > > concepts. The beauty of it is that since it uses kvm_read_guest(),
> > > it is able to bring back pages that have been otherwise evicted.
> > > 
> > > This is then plugged into the two AT S1 emulation functions as
> > > a "slow path" fallback. I'm not sure it is that slow, but hey.
> > > [..]
> > >  	switch (op) {
> > >  	case OP_AT_S1E1RP:
> > >  	case OP_AT_S1E1WP:
> > > +		retry_slow = false;
> > >  		fail = check_at_pan(vcpu, vaddr, &par);
> > >  		break;
> > >  	default:
> > >  		goto nopan;
> > >  	}
> > 
> > For context, this is what check_at_pan() does:
> > 
> > static int check_at_pan(struct kvm_vcpu *vcpu, u64 vaddr, u64 *res)
> > {
> >         u64 par_e0;
> >         int error;
> > 
> >         /*
> >          * For PAN-involved AT operations, perform the same translation,
> >          * using EL0 this time. Twice. Much fun.
> >          */
> >         error = __kvm_at(OP_AT_S1E0R, vaddr);
> >         if (error)
> >                 return error;
> > 
> >         par_e0 = read_sysreg_par();
> >         if (!(par_e0 & SYS_PAR_EL1_F))
> >                 goto out;
> > 
> >         error = __kvm_at(OP_AT_S1E0W, vaddr);
> >         if (error)
> >                 return error;
> > 
> >         par_e0 = read_sysreg_par();
> > out:
> >         *res = par_e0;
> >         return 0;
> > }
> > 
> > I'm having a hard time understanding why KVM is doing both AT S1E0R and AT S1E0W
> > regardless of the type of the access (read/write) in the PAN-aware AT
> > instruction. Would you mind elaborating on that?
> 
> Because that's the very definition of an AT S1E1{W,R}P instruction
> when PAN is set. If *any* EL0 permission is set, then the translation
> must equally fail. Just like a load or a store from EL1 would fail if
> any EL0 permission is set when PSTATE.PAN is set.
> 
> Since we cannot check for both permissions at once, we do it twice.
> It is worth noting that we don't quite handle the PAN3 case correctly
> (because we can't retrieve the *execution* property using AT). I'll
> add that to the list of stuff to fix.
> 
> > 
> > > +	if (fail) {
> > > +		vcpu_write_sys_reg(vcpu, SYS_PAR_EL1_F, PAR_EL1);
> > > +		goto nopan;
> > > +	}
> > > [..]
> > > +	if (par & SYS_PAR_EL1_F) {
> > > +		u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
> > > +
> > > +		/*
> > > +		 * If we get something other than a permission fault, we
> > > +		 * need to retry, as we're likely to have missed in the PTs.
> > > +		 */
> > > +		if ((fst & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_PERM)
> > > +			retry_slow = true;
> > > +	} else {
> > > +		/*
> > > +		 * The EL0 access succeeded, but we don't have the full
> > > +		 * syndrome information to synthesize the failure. Go slow.
> > > +		 */
> > > +		retry_slow = true;
> > > +	}
> > 
> > This is what PSTATE.PAN controls:
> > 
> > If the Effective value of PSTATE.PAN is 1, then a privileged data access from
> > any of the following Exception levels to a virtual memory address that is
> > accessible to data accesses at EL0 generates a stage 1 Permission fault:
> > 
> > - A privileged data access from EL1.
> > - If HCR_EL2.E2H is 1, then a privileged data access from EL2.
> > 
> > With that in mind, I am really struggling to understand the logic.
> 
> I don't quite see what you don't understand, you'll have to be more
> precise. Are you worried about the page tables we're looking at, the
> value of PSTATE.PAN, the permission fault, or something else?
> 
> It also doesn't help that you're looking at the patch that contains
> the integration with the slow-path, which is pretty hard to read (I
> have a reworked version that's a bit better). You probably want to
> look at the "fast" path alone.

I was referring to checking both unprivileged read and write permissions.

And you are right, sorry, I managed to get myself terribly confused. For
completeness' sake, this matches AArch64.S1DirectBasePermissions(), where if PAN
&& (UnprivRead || UnprivWrite) then PrivRead = False and PrivWrite = False. So
you need to check that both UnprivRead and UnprivWrite are false for the PAN
variants of AT to succeed.
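
Or, to restate the rule as code with the names used in this patch (direct
permissions only, leaving EPAN aside):

        /* AArch64.S1DirectBasePermissions(): PAN vs direct permissions */
        if (pan && (ur || uw)) {
                pr = false;
                pw = false;
        }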

> 
> > 
> > If AT S1E0{R,W} (from check_at_pan()) failed, doesn't that mean that the virtual
> > memory address is not accessible to EL0? Add that to the fact that the AT
> > S1E1{R,W} (from the beginning of __kvm_at_s1e01()) succeeded, doesn't that mean
> > that AT S1E1{R,W}P should succeed, and furthermore the PAR_EL1 value should be
> > the one KVM got from AT S1E1{R,W}?
> 
> There are plenty of ways for AT S1E0 to fail when AT S1E1 succeeded:
> 
> - no EL0 permission: that's the best case, and the PAR_EL1 obtained
>   from the AT S1E1 is the correct one. That's what we return.

Yes, that is correct; the place where the VCPU's PAR_EL1 register is set is far
enough from this code that I didn't make the connection.

> 
> - The EL0 access failed, but for a reason other than a permission
>   fault. This contradicts the EL1 walk, and is a sure sign that
>   someone is playing behind our back. We fail.
> 
> - exception from AT S1E0: something went wrong (again the guest
>   playing with the PTs behind our back). We fail as well.
> 
> Do you at least agree with these as goals? If you do, what in
> the implementation does not satisfy these goals? If you don't, what in
> these goals seems improper to you?

I agree with the goals.

In this patch, if I'm reading the code right (and I'm starting to doubt myself),
if PAR_EL1.F is set and PAR_EL1 doesn't indicate a permission fault, then KVM
falls back to walking the S1 tables:

        if (par & SYS_PAR_EL1_F) {
                u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);

                /*
                 * If we get something other than a permission fault, we
                 * need to retry, as we're likely to have missed in the PTs.
                 */
                if ((fst & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_PERM)
                        retry_slow = true;
	}

I suppose that's because KVM cannot distinguish between two very different
reasons for AT failing: 1, because of something being wrong with the stage 1
tables when the AT S1E0* instruction was executed and 2, because of missing
entries at stage 2, as per the comment. Is that correct?

Thanks,
Alex
Marc Zyngier July 18, 2024, 11:37 a.m. UTC | #6
Hi Alex,

On Mon, 15 Jul 2024 16:30:19 +0100,
Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> 
> In this patch, if I'm reading the code right (and I'm starting to doubt myself),
> if PAR_EL1.F is set and PAR_EL1 doesn't indicate a permission fault, then KVM
> falls back to walking the S1 tables:
> 
>         if (par & SYS_PAR_EL1_F) {
>                 u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
> 
>                 /*
>                  * If we get something other than a permission fault, we
>                  * need to retry, as we're likely to have missed in the PTs.
>                  */
>                 if ((fst & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_PERM)
>                         retry_slow = true;
> 	}
> 
> I suppose that's because KVM cannot distinguish between two very different
> reasons for AT failing: 1, because of something being wrong with the stage 1
> tables when the AT S1E0* instruction was executed and 2, because of missing
> entries at stage 2, as per the comment. Is that correct?

Exactly. It doesn't help that I'm using 3 AT instructions to implement
a single one, and that makes the window of opportunity for things to
go wrong rather large.

Now, I've been thinking about this some more, and I came to the
conclusion that we can actually implement the FEAT_PAN2 instructions
using the PAN2 instructions themselves, which would greatly simplify
the code. We just need to switch PSTATE.PAN so that it reflects the
guest's state around the AT instruction.

With that scheme, the process becomes slightly clearer (and applies to
all AT instructions except for FEAT_ATS1A):

- either we have a successful translation and all is good

- or we have a failure for permission fault: all is good as well, as
  this is simply a "normal" failure

- or we have a failure for any other reason, and we must fall back to
  a SW walk to work things out properly.

I'll try to capture this reasoning as a comment in the next version.
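
Roughly something like this, strictly as an untested sketch (set_pstate_pan()
standing in for the usual arm64 PSTATE.PAN accessor, and assuming the host
otherwise runs with PAN set):

static bool at_s1e1p_fast(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
	bool fail;

	/* Make PSTATE.PAN reflect the guest's PAN bit around the AT */
	set_pstate_pan(!!(*vcpu_cpsr(vcpu) & PSR_PAN_BIT));

	/* Let the HW PAN2 instruction (S1E1RP/S1E1WP) do the work */
	fail = __kvm_at(op, vaddr);

	/* Back to the host's default */
	set_pstate_pan(1);

	return fail;
}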

Thanks,

	M.
Alexandru Elisei July 18, 2024, 3:16 p.m. UTC | #7
Hi,

Managed to have a look at AT handling for stage 1; I've been comparing it with
AArch64.AT().

On Mon, Jul 08, 2024 at 05:57:58PM +0100, Marc Zyngier wrote:
> In order to plug the brokenness of our current AT implementation,
> we need a SW walker that is going to... err.. walk the S1 tables
> and tell us what it finds.
> 
> Of course, it builds on top of our S2 walker, and shares similar
> concepts. The beauty of it is that since it uses kvm_read_guest(),
> it is able to bring back pages that have been otherwise evicted.
> 
> This is then plugged into the two AT S1 emulation functions as
> a "slow path" fallback. I'm not sure it is that slow, but hey.
> 
> Signed-off-by: Marc Zyngier <maz@kernel.org>
> ---
>  arch/arm64/kvm/at.c | 538 ++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 520 insertions(+), 18 deletions(-)
> 
> diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> index 71e3390b43b4c..8452273cbff6d 100644
> --- a/arch/arm64/kvm/at.c
> +++ b/arch/arm64/kvm/at.c
> @@ -4,9 +4,305 @@
>   * Author: Jintack Lim <jintack.lim@linaro.org>
>   */
>  
> +#include <linux/kvm_host.h>
> +
> +#include <asm/esr.h>
>  #include <asm/kvm_hyp.h>
>  #include <asm/kvm_mmu.h>
>  
> +struct s1_walk_info {
> +	u64	     baddr;
> +	unsigned int max_oa_bits;
> +	unsigned int pgshift;
> +	unsigned int txsz;
> +	int 	     sl;
> +	bool	     hpd;
> +	bool	     be;
> +	bool	     nvhe;
> +	bool	     s2;
> +};
> +
> +struct s1_walk_result {
> +	union {
> +		struct {
> +			u64	desc;
> +			u64	pa;
> +			s8	level;
> +			u8	APTable;
> +			bool	UXNTable;
> +			bool	PXNTable;
> +		};
> +		struct {
> +			u8	fst;
> +			bool	ptw;
> +			bool	s2;
> +		};
> +	};
> +	bool	failed;
> +};
> +
> +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2)
> +{
> +	wr->fst		= fst;
> +	wr->ptw		= ptw;
> +	wr->s2		= s2;
> +	wr->failed	= true;
> +}
> +
> +#define S1_MMU_DISABLED		(-127)
> +
> +static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
> +			 struct s1_walk_result *wr, const u64 va, const int el)
> +{
> +	u64 sctlr, tcr, tg, ps, ia_bits, ttbr;
> +	unsigned int stride, x;
> +	bool va55, tbi;
> +
> +	wi->nvhe = el == 2 && !vcpu_el2_e2h_is_set(vcpu);
> +
> +	va55 = va & BIT(55);
> +
> +	if (wi->nvhe && va55)
> +		goto addrsz;
> +
> +	wi->s2 = el < 2 && (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_VM);
> +
> +	switch (el) {
> +	case 1:
> +		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL1);
> +		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL1);
> +		ttbr	= (va55 ?
> +			   vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
> +			   vcpu_read_sys_reg(vcpu, TTBR0_EL1));
> +		break;
> +	case 2:
> +		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL2);
> +		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL2);
> +		ttbr	= (va55 ?
> +			   vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
> +			   vcpu_read_sys_reg(vcpu, TTBR0_EL2));
> +		break;
> +	default:
> +		BUG();
> +	}
> +
> +	/* Let's put the MMU disabled case aside immediately */
> +	if (!(sctlr & SCTLR_ELx_M) ||
> +	    (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
> +		if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
> +			goto addrsz;
> +
> +		wr->level = S1_MMU_DISABLED;
> +		wr->desc = va;
> +		return 0;
> +	}
> +
> +	wi->be = sctlr & SCTLR_ELx_EE;
> +
> +	wi->hpd  = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP);
> +	wi->hpd &= (wi->nvhe ?
> +		    FIELD_GET(TCR_EL2_HPD, tcr) :
> +		    (va55 ?
> +		     FIELD_GET(TCR_HPD1, tcr) :
> +		     FIELD_GET(TCR_HPD0, tcr)));
> +
> +	tbi = (wi->nvhe ?
> +	       FIELD_GET(TCR_EL2_TBI, tcr) :
> +	       (va55 ?
> +		FIELD_GET(TCR_TBI1, tcr) :
> +		FIELD_GET(TCR_TBI0, tcr)));
> +
> +	if (!tbi && sign_extend64(va, 55) != (s64)va)
> +		goto addrsz;
> +
> +	/* Someone was silly enough to encode TG0/TG1 differently */
> +	if (va55) {
> +		wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
> +		tg = FIELD_GET(TCR_TG1_MASK, tcr);
> +
> +		switch (tg << TCR_TG1_SHIFT) {
> +		case TCR_TG1_4K:
> +			wi->pgshift = 12;	 break;
> +		case TCR_TG1_16K:
> +			wi->pgshift = 14;	 break;
> +		case TCR_TG1_64K:
> +		default:	    /* IMPDEF: treat any other value as 64k */
> +			wi->pgshift = 16;	 break;
> +		}
> +	} else {
> +		wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
> +		tg = FIELD_GET(TCR_TG0_MASK, tcr);
> +
> +		switch (tg << TCR_TG0_SHIFT) {
> +		case TCR_TG0_4K:
> +			wi->pgshift = 12;	 break;
> +		case TCR_TG0_16K:
> +			wi->pgshift = 14;	 break;
> +		case TCR_TG0_64K:
> +		default:	    /* IMPDEF: treat any other value as 64k */
> +			wi->pgshift = 16;	 break;
> +		}
> +	}
> +
> +	ia_bits = 64 - wi->txsz;

get_ia_size()?

> +
> +	/* AArch64.S1StartLevel() */
> +	stride = wi->pgshift - 3;
> +	wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);
> +
> +	/* Check for SL mandating LPA2 (which we don't support yet) */
> +	switch (BIT(wi->pgshift)) {
> +	case SZ_4K:
> +		if (wi->sl == -1 &&
> +		    !kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT))
> +			goto addrsz;
> +		break;
> +	case SZ_16K:
> +		if (wi->sl == 0 &&
> +		    !kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT))
> +			goto addrsz;
> +		break;
> +	}
> +
> +	ps = (wi->nvhe ?
> +	      FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));
> +
> +	wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps));
> +
> +	/* Compute minimal alignment */
> +	x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);
> +
> +	wi->baddr = ttbr & TTBRx_EL1_BADDR;
> +	wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);
> +
> +	return 0;
> +
> +addrsz:	/* Address Size Fault level 0 */
> +	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ, false, false);
> +
> +	return -EFAULT;
> +}

The function seems to be missing checks for:

- valid TxSZ
- VA is not larger than the maximum input size, as defined by TxSZ
- EPD{0,1}
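
Roughly something like the below, I think (only a sketch: the TxSZ bounds
assume no FEAT_TTST/FEAT_LPA2, 'transfault' stands for a level-0 Translation
fault exit path next to the existing addrsz one, and I'm assuming the usual
TCR_EPD{0,1}_MASK definitions):

        /* Out-of-range TxSZ: fault for simplicity (clamping is also allowed) */
        if (wi->txsz > 39 || wi->txsz < 16)
                goto transfault;

        ia_bits = 64 - wi->txsz;

        /* The VA must fit within the configured input size (tag bits aside) */
        if ((va55 && (va & GENMASK_ULL(55, 0)) < GENMASK_ULL(55, ia_bits)) ||
            (!va55 && (va & GENMASK_ULL(55, ia_bits))))
                goto transfault;

        /* EPD{0,1}: walks disabled for this half of the VA space */
        if (!wi->nvhe &&
            (va55 ? FIELD_GET(TCR_EPD1_MASK, tcr) : FIELD_GET(TCR_EPD0_MASK, tcr)))
                goto transfault;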

> +
> +static int get_ia_size(struct s1_walk_info *wi)
> +{
> +	return 64 - wi->txsz;
> +}

This looks a lot like get_ia_size() from nested.c.

> +
> +static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
> +		   struct s1_walk_result *wr, u64 va)
> +{
> +	u64 va_top, va_bottom, baddr, desc;
> +	int level, stride, ret;
> +
> +	level = wi->sl;
> +	stride = wi->pgshift - 3;
> +	baddr = wi->baddr;

AArch64.S1Walk() also checks that baddr is not larger than the OA size.
check_output_size() from nested.c looks almost like what you want here.

> +
> +	va_top = get_ia_size(wi) - 1;
> +
> +	while (1) {
> +		u64 index, ipa;
> +
> +		va_bottom = (3 - level) * stride + wi->pgshift;
> +		index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);
> +
> +		ipa = baddr | index;
> +
> +		if (wi->s2) {
> +			struct kvm_s2_trans s2_trans = {};
> +
> +			ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans);
> +			if (ret) {
> +				fail_s1_walk(wr,
> +					     (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level,
> +					     true, true);
> +				return ret;
> +			}
> +
> +			if (!kvm_s2_trans_readable(&s2_trans)) {
> +				fail_s1_walk(wr, ESR_ELx_FSC_PERM | level,
> +					     true, true);
> +
> +				return -EPERM;
> +			}
> +
> +			ipa = kvm_s2_trans_output(&s2_trans);
> +		}
> +
> +		ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc));
> +		if (ret) {
> +			fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level),
> +				     true, false);
> +			return ret;
> +		}
> +
> +		if (wi->be)
> +			desc = be64_to_cpu((__force __be64)desc);
> +		else
> +			desc = le64_to_cpu((__force __le64)desc);
> +
> +		if (!(desc & 1) || ((desc & 3) == 1 && level == 3)) {
> +			fail_s1_walk(wr, ESR_ELx_FSC_FAULT | level,
> +				     true, false);
> +			return -ENOENT;
> +		}
> +
> +		/* We found a leaf, handle that */
> +		if ((desc & 3) == 1 || level == 3)
> +			break;
> +
> +		if (!wi->hpd) {
> +			wr->APTable  |= FIELD_GET(PMD_TABLE_AP, desc);
> +			wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);
> +			wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc);
> +		}
> +
> +		baddr = GENMASK_ULL(47, wi->pgshift);

Where is baddr updated with the value read from the descriptor? Am I missing
something obvious here?

> +
> +		/* Check for out-of-range OA */
> +		if (wi->max_oa_bits < 48 &&
> +		    (baddr & GENMASK_ULL(47, wi->max_oa_bits))) {
> +			fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ | level,
> +				     true, false);
> +			return -EINVAL;
> +		}

This looks very much like check_output_size() from nested.c.

> +
> +		/* Prepare for next round */
> +		va_top = va_bottom - 1;
> +		level++;
> +	}
> +
> +	/* Block mapping, check the validity of the level */
> +	if (!(desc & BIT(1))) {
> +		bool valid_block = false;
> +
> +		switch (BIT(wi->pgshift)) {
> +		case SZ_4K:
> +			valid_block = level == 1 || level == 2;
> +			break;
> +		case SZ_16K:
> +		case SZ_64K:
> +			valid_block = level == 2;
> +			break;
> +		}
> +
> +		if (!valid_block) {
> +			fail_s1_walk(wr, ESR_ELx_FSC_FAULT | level,
> +				     true, false);
> +			return -EINVAL;
> +		}
> +	}

Matches AArch64.BlockDescSupported(), with the caveat that the walker currently
doesn't support 52 bit PAs.

> +
> +	wr->failed = false;
> +	wr->level = level;
> +	wr->desc = desc;
> +	wr->pa = desc & GENMASK(47, va_bottom);

No output size check for final PA.

> +	if (va_bottom > 12)
> +		wr->pa |= va & GENMASK_ULL(va_bottom - 1, 12);
> +
> +	return 0;
> +}
> +
>  struct mmu_config {
>  	u64	ttbr0;
>  	u64	ttbr1;
> @@ -234,6 +530,177 @@ static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
>  	return par;
>  }
>  
> +static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr)
> +{
> +	u64 par;
> +
> +	if (wr->failed) {
> +		par = SYS_PAR_EL1_RES1;
> +		par |= SYS_PAR_EL1_F;
> +		par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst);
> +		par |= wr->ptw ? SYS_PAR_EL1_PTW : 0;
> +		par |= wr->s2 ? SYS_PAR_EL1_S : 0;
> +	} else if (wr->level == S1_MMU_DISABLED) {
> +		/* MMU off or HCR_EL2.DC == 1 */
> +		par = wr->pa & GENMASK_ULL(47, 12);

That's interesting, setup_s1_walk() sets wr->desc = va and leaves wr->pa
unchanged (it's 0 from initialization in handle_at_slow()).

> +
> +		if (!(__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
> +			par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */
> +			par |= FIELD_PREP(SYS_PAR_EL1_SH, 0b10); /* OS */
> +		} else {
> +			par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
> +					  MEMATTR(WbRaWa, WbRaWa));
> +			par |= FIELD_PREP(SYS_PAR_EL1_SH, 0b00); /* NS */
> +		}

This matches AArch64.S1DisabledOutput().

> +	} else {
> +		u64 mair, sctlr;
> +		int el;
> +		u8 sh;
> +
> +		el = (vcpu_el2_e2h_is_set(vcpu) &&
> +		      vcpu_el2_tge_is_set(vcpu)) ? 2 : 1;
> +
> +		mair = ((el == 2) ?
> +			vcpu_read_sys_reg(vcpu, MAIR_EL2) :
> +			vcpu_read_sys_reg(vcpu, MAIR_EL1));
> +
> +		mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
> +		mair &= 0xff;
> +
> +		sctlr = ((el == 2) ?
> +			vcpu_read_sys_reg(vcpu, SCTLR_EL2) :
> +			vcpu_read_sys_reg(vcpu, SCTLR_EL1));
> +
> +		/* Force NC for memory if SCTLR_ELx.C is clear */
> +		if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair))
> +			mair = MEMATTR(NC, NC);

This matches the compute memory attributes part of AArch64.S1Translate().

> +
> +		par  = FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
> +		par |= wr->pa & GENMASK_ULL(47, 12);
> +
> +		sh = compute_sh(mair, wr->desc);
> +		par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
> +	}
> +
> +	return par;
> +}
> +
> +static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
> +{
> +	bool perm_fail, ur, uw, ux, pr, pw, pan;
> +	struct s1_walk_result wr = {};
> +	struct s1_walk_info wi = {};
> +	int ret, idx, el;
> +
> +	/*
> +	 * We only get here from guest EL2, so the translation regime
> +	 * AT applies to is solely defined by {E2H,TGE}.
> +	 */
> +	el = (vcpu_el2_e2h_is_set(vcpu) &&
> +	      vcpu_el2_tge_is_set(vcpu)) ? 2 : 1;
> +
> +	ret = setup_s1_walk(vcpu, &wi, &wr, vaddr, el);
> +	if (ret)
> +		goto compute_par;
> +
> +	if (wr.level == S1_MMU_DISABLED)
> +		goto compute_par;
> +
> +	idx = srcu_read_lock(&vcpu->kvm->srcu);
> +
> +	ret = walk_s1(vcpu, &wi, &wr, vaddr);
> +
> +	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> +
> +	if (ret)
> +		goto compute_par;
> +
> +	/* FIXME: revisit when adding indirect permission support */
> +	if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3) &&
> +	    !wi.nvhe) {

Just FYI, the 'if' statement fits on one line without going over the old 80
character limit.

> +		u64 sctlr;
> +
> +		if (el == 1)
> +			sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
> +		else
> +			sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
> +
> +		ux = (sctlr & SCTLR_EL1_EPAN) && !(wr.desc & PTE_UXN);

I don't understand this. UnprivExecute is true for the memory location if and
only if **SCTLR_ELx.EPAN** && !UXN?

> +	} else {
> +		ux = false;
> +	}
> +
> +	pw = !(wr.desc & PTE_RDONLY);
> +
> +	if (wi.nvhe) {
> +		ur = uw = false;
> +		pr = true;
> +	} else {
> +		if (wr.desc & PTE_USER) {
> +			ur = pr = true;
> +			uw = pw;
> +		} else {
> +			ur = uw = false;
> +			pr = true;
> +		}
> +	}
> +
> +	/* Apply the Hierarchical Permission madness */
> +	if (wi.nvhe) {
> +		wr.APTable &= BIT(1);
> +		wr.PXNTable = wr.UXNTable;
> +	}
> +
> +	ur &= !(wr.APTable & BIT(0));
> +	uw &= !(wr.APTable != 0);
> +	ux &= !wr.UXNTable;
> +
> +	pw &= !(wr.APTable & BIT(1));

Would it make sense here to compute the resulting permissions like in
AArch64.S1DirectBasePermissions()? I.e, look at the AP bits first, have
a switch statement for all 4 values (also makes it very easy to cross-reference
with Table D8-60), then apply hierarchical permissions/pan/epan. I do admit
that I have a very selfish reason to propose this - it makes reviewing easier.
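
Something along these lines is what I have in mind (sketch only, the decode
is from my reading of Table D8-60, reusing the names from this patch):

        /* AArch64.S1DirectBasePermissions(): direct AP[2:1] decode */
        switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr.desc)) {
        case 0b00:                      /* AP[2:1] == 00: priv RW, no EL0 access */
                pr = pw = true;
                ur = uw = false;
                break;
        case 0b01:                      /* AP[2:1] == 01: RW at EL1 and EL0 */
                pr = pw = ur = uw = true;
                break;
        case 0b10:                      /* AP[2:1] == 10: priv RO, no EL0 access */
                pr = true;
                pw = ur = uw = false;
                break;
        case 0b11:                      /* AP[2:1] == 11: RO at EL1 and EL0 */
                pr = ur = true;
                pw = uw = false;
                break;
        }

        /* ... then apply APTable/UXNTable/PXNTable, PAN and EPAN on top */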

> +
> +	pan = *vcpu_cpsr(vcpu) & PSR_PAN_BIT;
> +
> +	perm_fail = false;
> +
> +	switch (op) {
> +	case OP_AT_S1E1RP:
> +		perm_fail |= pan && (ur || uw || ux);

I had a very hard time understanding what the code is trying to do here.  How
about rewriting it to something like the pseudocode below:

  // ux = !(desc and UXN) and !UXNTable
  perm_fail |= pan && (ur || uw || ((sctlr & SCTLR_EL1_EPAN) && ux));

... which maps more closely to AArch64.S1DirectBasePermissions().

Thanks,
Alex

> +		fallthrough;
> +	case OP_AT_S1E1R:
> +	case OP_AT_S1E2R:
> +		perm_fail |= !pr;
> +		break;
> +	case OP_AT_S1E1WP:
> +		perm_fail |= pan && (ur || uw || ux);
> +		fallthrough;
> +	case OP_AT_S1E1W:
> +	case OP_AT_S1E2W:
> +		perm_fail |= !pw;
> +		break;
> +	case OP_AT_S1E0R:
> +		perm_fail |= !ur;
> +		break;
> +	case OP_AT_S1E0W:
> +		perm_fail |= !uw;
> +		break;
> +	default:
> +		BUG();
> +	}
> +
> +	if (perm_fail) {
> +		struct s1_walk_result tmp;
> +
> +		tmp.failed = true;
> +		tmp.fst = ESR_ELx_FSC_PERM | wr.level;
> +		tmp.s2 = false;
> +		tmp.ptw = false;
> +
> +		wr = tmp;
> +	}
> +
> +compute_par:
> +	return compute_par_s1(vcpu, &wr);
> +}
> [..]
Marc Zyngier July 20, 2024, 1:49 p.m. UTC | #8
On Thu, 18 Jul 2024 16:16:19 +0100,
Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> 
> Hi,
> 
> Managed to have a look at AT handling for stage 1; I've been comparing it with
> AArch64.AT().
> 
> On Mon, Jul 08, 2024 at 05:57:58PM +0100, Marc Zyngier wrote:
> > In order to plug the brokenness of our current AT implementation,
> > we need a SW walker that is going to... err.. walk the S1 tables
> > and tell us what it finds.
> > 
> > Of course, it builds on top of our S2 walker, and shares similar
> > concepts. The beauty of it is that since it uses kvm_read_guest(),
> > it is able to bring back pages that have been otherwise evicted.
> > 
> > This is then plugged into the two AT S1 emulation functions as
> > a "slow path" fallback. I'm not sure it is that slow, but hey.
> > 
> > Signed-off-by: Marc Zyngier <maz@kernel.org>
> > ---
> >  arch/arm64/kvm/at.c | 538 ++++++++++++++++++++++++++++++++++++++++++--
> >  1 file changed, 520 insertions(+), 18 deletions(-)
> > 
> > diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> > index 71e3390b43b4c..8452273cbff6d 100644
> > --- a/arch/arm64/kvm/at.c
> > +++ b/arch/arm64/kvm/at.c
> > @@ -4,9 +4,305 @@
> >   * Author: Jintack Lim <jintack.lim@linaro.org>
> >   */
> >  
> > +#include <linux/kvm_host.h>
> > +
> > +#include <asm/esr.h>
> >  #include <asm/kvm_hyp.h>
> >  #include <asm/kvm_mmu.h>
> >  
> > +struct s1_walk_info {
> > +	u64	     baddr;
> > +	unsigned int max_oa_bits;
> > +	unsigned int pgshift;
> > +	unsigned int txsz;
> > +	int 	     sl;
> > +	bool	     hpd;
> > +	bool	     be;
> > +	bool	     nvhe;
> > +	bool	     s2;
> > +};
> > +
> > +struct s1_walk_result {
> > +	union {
> > +		struct {
> > +			u64	desc;
> > +			u64	pa;
> > +			s8	level;
> > +			u8	APTable;
> > +			bool	UXNTable;
> > +			bool	PXNTable;
> > +		};
> > +		struct {
> > +			u8	fst;
> > +			bool	ptw;
> > +			bool	s2;
> > +		};
> > +	};
> > +	bool	failed;
> > +};
> > +
> > +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2)
> > +{
> > +	wr->fst		= fst;
> > +	wr->ptw		= ptw;
> > +	wr->s2		= s2;
> > +	wr->failed	= true;
> > +}
> > +
> > +#define S1_MMU_DISABLED		(-127)
> > +
> > +static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
> > +			 struct s1_walk_result *wr, const u64 va, const int el)
> > +{
> > +	u64 sctlr, tcr, tg, ps, ia_bits, ttbr;
> > +	unsigned int stride, x;
> > +	bool va55, tbi;
> > +
> > +	wi->nvhe = el == 2 && !vcpu_el2_e2h_is_set(vcpu);
> > +
> > +	va55 = va & BIT(55);
> > +
> > +	if (wi->nvhe && va55)
> > +		goto addrsz;
> > +
> > +	wi->s2 = el < 2 && (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_VM);
> > +
> > +	switch (el) {
> > +	case 1:
> > +		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL1);
> > +		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL1);
> > +		ttbr	= (va55 ?
> > +			   vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
> > +			   vcpu_read_sys_reg(vcpu, TTBR0_EL1));
> > +		break;
> > +	case 2:
> > +		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL2);
> > +		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL2);
> > +		ttbr	= (va55 ?
> > +			   vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
> > +			   vcpu_read_sys_reg(vcpu, TTBR0_EL2));
> > +		break;
> > +	default:
> > +		BUG();
> > +	}
> > +
> > +	/* Let's put the MMU disabled case aside immediately */
> > +	if (!(sctlr & SCTLR_ELx_M) ||
> > +	    (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
> > +		if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
> > +			goto addrsz;
> > +
> > +		wr->level = S1_MMU_DISABLED;
> > +		wr->desc = va;
> > +		return 0;
> > +	}
> > +
> > +	wi->be = sctlr & SCTLR_ELx_EE;
> > +
> > +	wi->hpd  = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP);
> > +	wi->hpd &= (wi->nvhe ?
> > +		    FIELD_GET(TCR_EL2_HPD, tcr) :
> > +		    (va55 ?
> > +		     FIELD_GET(TCR_HPD1, tcr) :
> > +		     FIELD_GET(TCR_HPD0, tcr)));
> > +
> > +	tbi = (wi->nvhe ?
> > +	       FIELD_GET(TCR_EL2_TBI, tcr) :
> > +	       (va55 ?
> > +		FIELD_GET(TCR_TBI1, tcr) :
> > +		FIELD_GET(TCR_TBI0, tcr)));
> > +
> > +	if (!tbi && sign_extend64(va, 55) != (s64)va)
> > +		goto addrsz;
> > +
> > +	/* Someone was silly enough to encode TG0/TG1 differently */
> > +	if (va55) {
> > +		wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
> > +		tg = FIELD_GET(TCR_TG1_MASK, tcr);
> > +
> > +		switch (tg << TCR_TG1_SHIFT) {
> > +		case TCR_TG1_4K:
> > +			wi->pgshift = 12;	 break;
> > +		case TCR_TG1_16K:
> > +			wi->pgshift = 14;	 break;
> > +		case TCR_TG1_64K:
> > +		default:	    /* IMPDEF: treat any other value as 64k */
> > +			wi->pgshift = 16;	 break;
> > +		}
> > +	} else {
> > +		wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
> > +		tg = FIELD_GET(TCR_TG0_MASK, tcr);
> > +
> > +		switch (tg << TCR_TG0_SHIFT) {
> > +		case TCR_TG0_4K:
> > +			wi->pgshift = 12;	 break;
> > +		case TCR_TG0_16K:
> > +			wi->pgshift = 14;	 break;
> > +		case TCR_TG0_64K:
> > +		default:	    /* IMPDEF: treat any other value as 64k */
> > +			wi->pgshift = 16;	 break;
> > +		}
> > +	}
> > +
> > +	ia_bits = 64 - wi->txsz;
> 
> get_ia_size()?

yeah, fair enough. I wasn't sold on using any helper while the
walk_info struct is incomplete, but that doesn't change much.

> 
> > +
> > +	/* AArch64.S1StartLevel() */
> > +	stride = wi->pgshift - 3;
> > +	wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);
> > +
> > +	/* Check for SL mandating LPA2 (which we don't support yet) */
> > +	switch (BIT(wi->pgshift)) {
> > +	case SZ_4K:
> > +		if (wi->sl == -1 &&
> > +		    !kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT))
> > +			goto addrsz;
> > +		break;
> > +	case SZ_16K:
> > +		if (wi->sl == 0 &&
> > +		    !kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT))
> > +			goto addrsz;
> > +		break;
> > +	}
> > +
> > +	ps = (wi->nvhe ?
> > +	      FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));
> > +
> > +	wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps));
> > +
> > +	/* Compute minimal alignment */
> > +	x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);
> > +
> > +	wi->baddr = ttbr & TTBRx_EL1_BADDR;
> > +	wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);
> > +
> > +	return 0;
> > +
> > +addrsz:	/* Address Size Fault level 0 */
> > +	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ, false, false);
> > +
> > +	return -EFAULT;
> > +}
> 
> The function seems to be missing checks for:
> 
> - valid TxSZ
> - VA is not larger than the maximum input size, as defined by TxSZ
> - EPD{0,1}

Yup, all fixed now, with E0PD{0,1} as an added bonus. The number of
ways a translation can fail is amazing.

> 
> > +
> > +static int get_ia_size(struct s1_walk_info *wi)
> > +{
> > +	return 64 - wi->txsz;
> > +}
> 
> This looks a lot like get_ia_size() from nested.c.

Indeed. Except that the *type* is different. And I really like the
fact that they are separate for now. I may end up merging some of the
attributes at some point though.

> 
> > +
> > +static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
> > +		   struct s1_walk_result *wr, u64 va)
> > +{
> > +	u64 va_top, va_bottom, baddr, desc;
> > +	int level, stride, ret;
> > +
> > +	level = wi->sl;
> > +	stride = wi->pgshift - 3;
> > +	baddr = wi->baddr;
> 
> AArch64.S1Walk() also checks that baddr is not larger than the OA size.
> check_output_size() from nested.c looks almost like what you want
> here.

At this stage, it is too late, as wi->baddr has already been sanitised
by the setup phase. I'll add a check over there.

> 
> > +
> > +	va_top = get_ia_size(wi) - 1;
> > +
> > +	while (1) {
> > +		u64 index, ipa;
> > +
> > +		va_bottom = (3 - level) * stride + wi->pgshift;
> > +		index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);
> > +
> > +		ipa = baddr | index;
> > +
> > +		if (wi->s2) {
> > +			struct kvm_s2_trans s2_trans = {};
> > +
> > +			ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans);
> > +			if (ret) {
> > +				fail_s1_walk(wr,
> > +					     (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level,
> > +					     true, true);
> > +				return ret;
> > +			}
> > +
> > +			if (!kvm_s2_trans_readable(&s2_trans)) {
> > +				fail_s1_walk(wr, ESR_ELx_FSC_PERM | level,
> > +					     true, true);
> > +
> > +				return -EPERM;
> > +			}
> > +
> > +			ipa = kvm_s2_trans_output(&s2_trans);
> > +		}
> > +
> > +		ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc));
> > +		if (ret) {
> > +			fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level),
> > +				     true, false);
> > +			return ret;
> > +		}
> > +
> > +		if (wi->be)
> > +			desc = be64_to_cpu((__force __be64)desc);
> > +		else
> > +			desc = le64_to_cpu((__force __le64)desc);
> > +
> > +		if (!(desc & 1) || ((desc & 3) == 1 && level == 3)) {
> > +			fail_s1_walk(wr, ESR_ELx_FSC_FAULT | level,
> > +				     true, false);
> > +			return -ENOENT;
> > +		}
> > +
> > +		/* We found a leaf, handle that */
> > +		if ((desc & 3) == 1 || level == 3)
> > +			break;
> > +
> > +		if (!wi->hpd) {
> > +			wr->APTable  |= FIELD_GET(PMD_TABLE_AP, desc);
> > +			wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);
> > +			wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc);
> > +		}
> > +
> > +		baddr = GENMASK_ULL(47, wi->pgshift);
> 
> Where is baddr updated with the value read from the descriptor? Am I missing
> something obvious here?

Huh. Something has gone very wrong, and I have no idea how.
This should read:

		baddr = desc & GENMASK_ULL(47, wi->pgshift);

because otherwise nothing makes sense. I must have done a last minute
cleanup and somehow broken it. Time to retest everything!

> 
> > +
> > +		/* Check for out-of-range OA */
> > +		if (wi->max_oa_bits < 48 &&
> > +		    (baddr & GENMASK_ULL(47, wi->max_oa_bits))) {
> > +			fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ | level,
> > +				     true, false);
> > +			return -EINVAL;
> > +		}
> 
> This looks very much like check_output_size() from nested.c.

Yup. I'll fold that into a helper -- still separate from the S2
version though.

> 
> > +
> > +		/* Prepare for next round */
> > +		va_top = va_bottom - 1;
> > +		level++;
> > +	}
> > +
> > +	/* Block mapping, check the validity of the level */
> > +	if (!(desc & BIT(1))) {
> > +		bool valid_block = false;
> > +
> > +		switch (BIT(wi->pgshift)) {
> > +		case SZ_4K:
> > +			valid_block = level == 1 || level == 2;
> > +			break;
> > +		case SZ_16K:
> > +		case SZ_64K:
> > +			valid_block = level == 2;
> > +			break;
> > +		}
> > +
> > +		if (!valid_block) {
> > +			fail_s1_walk(wr, ESR_ELx_FSC_FAULT | level,
> > +				     true, false);
> > +			return -EINVAL;
> > +		}
> > +	}
> 
> Matches AArch64.BlockDescSupported(), with the caveat that the walker currently
> doesn't support 52 bit PAs.
> 
> > +
> > +	wr->failed = false;
> > +	wr->level = level;
> > +	wr->desc = desc;
> > +	wr->pa = desc & GENMASK(47, va_bottom);
> 
> No output size check for final PA.

Now fixed.

> 
> > +	if (va_bottom > 12)
> > +		wr->pa |= va & GENMASK_ULL(va_bottom - 1, 12);
> > +
> > +	return 0;
> > +}
> > +
> >  struct mmu_config {
> >  	u64	ttbr0;
> >  	u64	ttbr1;
> > @@ -234,6 +530,177 @@ static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
> >  	return par;
> >  }
> >  
> > +static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr)
> > +{
> > +	u64 par;
> > +
> > +	if (wr->failed) {
> > +		par = SYS_PAR_EL1_RES1;
> > +		par |= SYS_PAR_EL1_F;
> > +		par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst);
> > +		par |= wr->ptw ? SYS_PAR_EL1_PTW : 0;
> > +		par |= wr->s2 ? SYS_PAR_EL1_S : 0;
> > +	} else if (wr->level == S1_MMU_DISABLED) {
> > +		/* MMU off or HCR_EL2.DC == 1 */
> > +		par = wr->pa & GENMASK_ULL(47, 12);
> 
> That's interesting, setup_s1_walk() sets wr->desc = va and leaves wr->pa
> unchanged (it's 0 from initialization in handle_at_slow()).

If by "interesting" you mean broken, then I agree!

> 
> > +
> > +		if (!(__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
> > +			par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */
> > +			par |= FIELD_PREP(SYS_PAR_EL1_SH, 0b10); /* OS */
> > +		} else {
> > +			par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
> > +					  MEMATTR(WbRaWa, WbRaWa));
> > +			par |= FIELD_PREP(SYS_PAR_EL1_SH, 0b00); /* NS */
> > +		}
> 
> This matches AArch64.S1DisabledOutput().
> 
> > +	} else {
> > +		u64 mair, sctlr;
> > +		int el;
> > +		u8 sh;
> > +
> > +		el = (vcpu_el2_e2h_is_set(vcpu) &&
> > +		      vcpu_el2_tge_is_set(vcpu)) ? 2 : 1;
> > +
> > +		mair = ((el == 2) ?
> > +			vcpu_read_sys_reg(vcpu, MAIR_EL2) :
> > +			vcpu_read_sys_reg(vcpu, MAIR_EL1));
> > +
> > +		mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
> > +		mair &= 0xff;
> > +
> > +		sctlr = ((el == 2) ?
> > +			vcpu_read_sys_reg(vcpu, SCTLR_EL2) :
> > +			vcpu_read_sys_reg(vcpu, SCTLR_EL1));
> > +
> > +		/* Force NC for memory if SCTLR_ELx.C is clear */
> > +		if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair))
> > +			mair = MEMATTR(NC, NC);
> 
> This matches the compute memory attributes part of AArch64.S1Translate().
> 
> > +
> > +		par  = FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
> > +		par |= wr->pa & GENMASK_ULL(47, 12);
> > +
> > +		sh = compute_sh(mair, wr->desc);
> > +		par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
> > +	}
> > +
> > +	return par;
> > +}
> > +
> > +static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
> > +{
> > +	bool perm_fail, ur, uw, ux, pr, pw, pan;
> > +	struct s1_walk_result wr = {};
> > +	struct s1_walk_info wi = {};
> > +	int ret, idx, el;
> > +
> > +	/*
> > +	 * We only get here from guest EL2, so the translation regime
> > +	 * AT applies to is solely defined by {E2H,TGE}.
> > +	 */
> > +	el = (vcpu_el2_e2h_is_set(vcpu) &&
> > +	      vcpu_el2_tge_is_set(vcpu)) ? 2 : 1;
> > +
> > +	ret = setup_s1_walk(vcpu, &wi, &wr, vaddr, el);
> > +	if (ret)
> > +		goto compute_par;
> > +
> > +	if (wr.level == S1_MMU_DISABLED)
> > +		goto compute_par;
> > +
> > +	idx = srcu_read_lock(&vcpu->kvm->srcu);
> > +
> > +	ret = walk_s1(vcpu, &wi, &wr, vaddr);
> > +
> > +	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> > +
> > +	if (ret)
> > +		goto compute_par;
> > +
> > +	/* FIXME: revisit when adding indirect permission support */
> > +	if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3) &&
> > +	    !wi.nvhe) {
> 
> Just FYI, the 'if' statement fits on one line without going over the old 80
> character limit.

All that code has now been reworked.

> 
> > +		u64 sctlr;
> > +
> > +		if (el == 1)
> > +			sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
> > +		else
> > +			sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
> > +
> > +		ux = (sctlr & SCTLR_EL1_EPAN) && !(wr.desc & PTE_UXN);
> 
> I don't understand this. UnprivExecute is true for the memory location if and
> only if **SCTLR_ELx.EPAN** && !UXN?

Well, it is the only case we actually care about. And from what I read
below, you *have* understood it.

> 
> > +	} else {
> > +		ux = false;
> > +	}
> > +
> > +	pw = !(wr.desc & PTE_RDONLY);
> > +
> > +	if (wi.nvhe) {
> > +		ur = uw = false;
> > +		pr = true;
> > +	} else {
> > +		if (wr.desc & PTE_USER) {
> > +			ur = pr = true;
> > +			uw = pw;
> > +		} else {
> > +			ur = uw = false;
> > +			pr = true;
> > +		}
> > +	}
> > +
> > +	/* Apply the Hierarchical Permission madness */
> > +	if (wi.nvhe) {
> > +		wr.APTable &= BIT(1);
> > +		wr.PXNTable = wr.UXNTable;
> > +	}
> > +
> > +	ur &= !(wr.APTable & BIT(0));
> > +	uw &= !(wr.APTable != 0);
> > +	ux &= !wr.UXNTable;
> > +
> > +	pw &= !(wr.APTable & BIT(1));
> 
> Would it make sense here to compute the resulting permisisons like in
> AArch64.S1DirectBasePermissions()? I.e, look at the AP bits first, have
> a switch statement for all 4 values (also makes it very easy to cross-reference
> with Table D8-60), then apply hierarchical permissions/pan/epan. I do admit
> that I have a very selfish reason to propose this - it makes reviewing easier.
>

Fair enough. I usually try to distance myself from the pseudocode and
implement what I understand, but I appreciate this is just hard to
read. It definitely results in something larger, but it probably
doesn't matter much.
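
To give an idea of the shape (only a sketch, the actual rework may end
up looking different), for the EL1&0/EL2&0 regimes that would be
something like:

	/* Direct permissions from AP[2:1], as per Table D8-60 */
	switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr.desc)) {
	case 0b00:		/* Privileged RW, no unprivileged access */
		pr = pw = true;
		ur = uw = false;
		break;
	case 0b01:		/* RW at both */
		pr = pw = ur = uw = true;
		break;
	case 0b10:		/* Privileged RO, no unprivileged access */
		pr = true;
		pw = ur = uw = false;
		break;
	case 0b11:		/* RO at both */
		pr = ur = true;
		pw = uw = false;
		break;
	}

with the hierarchical bits and PAN/EPAN applied on top of that, and the
EL2 regime still forcing ur = uw = false as it does today.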

> > +
> > +	pan = *vcpu_cpsr(vcpu) & PSR_PAN_BIT;
> > +
> > +	perm_fail = false;
> > +
> > +	switch (op) {
> > +	case OP_AT_S1E1RP:
> > +		perm_fail |= pan && (ur || uw || ux);
> 
> I had a very hard time understanding what the code is trying to do here.  How
> about rewriting it to something like the pseudocode below:
> 
>   // ux = !(desc and UXN) and !UXNTable
>   perm_fail |= pan && (ur || uw || ((sctlr & SCTLR_EL1_EPAN) && ux));
> 
> ... which maps more closely to AArch64.S1DirectBasePermissions().

Yup, I got there by virtue of adopting the same flow as the
pseudocode.

Thanks a lot for the thorough review, much appreciated.

	M.
Alexandru Elisei July 22, 2024, 10:53 a.m. UTC | #9
Hi Marc,

I would like to use the S1 walker for KVM SPE, and I was planning to move it to
a separate file, where it would be shared between nested KVM and SPE. I think
this is also good for NV, since the walker would get more testing.

Do you think moving it to a shared location is a good approach? Or do you have
something else in mind?

Also, do you know when you'll be able to send an updated version of this
series? I'm asking because I want to decide between using this code (with fixes
on top) and waiting for the next iteration. Please don't feel that you need to
send the next iteration too soon.

And please CC me on the series, so I don't miss it by mistake :)

Thanks,
Alex

On Mon, Jul 08, 2024 at 05:57:58PM +0100, Marc Zyngier wrote:
> In order to plug the brokenness of our current AT implementation,
> we need a SW walker that is going to... err.. walk the S1 tables
> and tell us what it finds.
> 
> Of course, it builds on top of our S2 walker, and share similar
> concepts. The beauty of it is that since it uses kvm_read_guest(),
> it is able to bring back pages that have been otherwise evicted.
> 
> This is then plugged in the two AT S1 emulation functions as
> a "slow path" fallback. I'm not sure it is that slow, but hey.
> 
> Signed-off-by: Marc Zyngier <maz@kernel.org>
> ---
>  arch/arm64/kvm/at.c | 538 ++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 520 insertions(+), 18 deletions(-)
> 
> diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> index 71e3390b43b4c..8452273cbff6d 100644
> --- a/arch/arm64/kvm/at.c
> +++ b/arch/arm64/kvm/at.c
> @@ -4,9 +4,305 @@
>   * Author: Jintack Lim <jintack.lim@linaro.org>
>   */
>  
> +#include <linux/kvm_host.h>
> +
> +#include <asm/esr.h>
>  #include <asm/kvm_hyp.h>
>  #include <asm/kvm_mmu.h>
>  
> +struct s1_walk_info {
> +	u64	     baddr;
> +	unsigned int max_oa_bits;
> +	unsigned int pgshift;
> +	unsigned int txsz;
> +	int 	     sl;
> +	bool	     hpd;
> +	bool	     be;
> +	bool	     nvhe;
> +	bool	     s2;
> +};
> +
> +struct s1_walk_result {
> +	union {
> +		struct {
> +			u64	desc;
> +			u64	pa;
> +			s8	level;
> +			u8	APTable;
> +			bool	UXNTable;
> +			bool	PXNTable;
> +		};
> +		struct {
> +			u8	fst;
> +			bool	ptw;
> +			bool	s2;
> +		};
> +	};
> +	bool	failed;
> +};
> +
> +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2)
> +{
> +	wr->fst		= fst;
> +	wr->ptw		= ptw;
> +	wr->s2		= s2;
> +	wr->failed	= true;
> +}
> +
> +#define S1_MMU_DISABLED		(-127)
> +
> +static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
> +			 struct s1_walk_result *wr, const u64 va, const int el)
> +{
> +	u64 sctlr, tcr, tg, ps, ia_bits, ttbr;
> +	unsigned int stride, x;
> +	bool va55, tbi;
> +
> +	wi->nvhe = el == 2 && !vcpu_el2_e2h_is_set(vcpu);
> +
> +	va55 = va & BIT(55);
> +
> +	if (wi->nvhe && va55)
> +		goto addrsz;
> +
> +	wi->s2 = el < 2 && (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_VM);
> +
> +	switch (el) {
> +	case 1:
> +		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL1);
> +		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL1);
> +		ttbr	= (va55 ?
> +			   vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
> +			   vcpu_read_sys_reg(vcpu, TTBR0_EL1));
> +		break;
> +	case 2:
> +		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL2);
> +		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL2);
> +		ttbr	= (va55 ?
> +			   vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
> +			   vcpu_read_sys_reg(vcpu, TTBR0_EL2));
> +		break;
> +	default:
> +		BUG();
> +	}
> +
> +	/* Let's put the MMU disabled case aside immediately */
> +	if (!(sctlr & SCTLR_ELx_M) ||
> +	    (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
> +		if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
> +			goto addrsz;
> +
> +		wr->level = S1_MMU_DISABLED;
> +		wr->desc = va;
> +		return 0;
> +	}
> +
> +	wi->be = sctlr & SCTLR_ELx_EE;
> +
> +	wi->hpd  = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP);
> +	wi->hpd &= (wi->nvhe ?
> +		    FIELD_GET(TCR_EL2_HPD, tcr) :
> +		    (va55 ?
> +		     FIELD_GET(TCR_HPD1, tcr) :
> +		     FIELD_GET(TCR_HPD0, tcr)));
> +
> +	tbi = (wi->nvhe ?
> +	       FIELD_GET(TCR_EL2_TBI, tcr) :
> +	       (va55 ?
> +		FIELD_GET(TCR_TBI1, tcr) :
> +		FIELD_GET(TCR_TBI0, tcr)));
> +
> +	if (!tbi && sign_extend64(va, 55) != (s64)va)
> +		goto addrsz;
> +
> +	/* Someone was silly enough to encode TG0/TG1 differently */
> +	if (va55) {
> +		wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
> +		tg = FIELD_GET(TCR_TG1_MASK, tcr);
> +
> +		switch (tg << TCR_TG1_SHIFT) {
> +		case TCR_TG1_4K:
> +			wi->pgshift = 12;	 break;
> +		case TCR_TG1_16K:
> +			wi->pgshift = 14;	 break;
> +		case TCR_TG1_64K:
> +		default:	    /* IMPDEF: treat any other value as 64k */
> +			wi->pgshift = 16;	 break;
> +		}
> +	} else {
> +		wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
> +		tg = FIELD_GET(TCR_TG0_MASK, tcr);
> +
> +		switch (tg << TCR_TG0_SHIFT) {
> +		case TCR_TG0_4K:
> +			wi->pgshift = 12;	 break;
> +		case TCR_TG0_16K:
> +			wi->pgshift = 14;	 break;
> +		case TCR_TG0_64K:
> +		default:	    /* IMPDEF: treat any other value as 64k */
> +			wi->pgshift = 16;	 break;
> +		}
> +	}
> +
> +	ia_bits = 64 - wi->txsz;
> +
> +	/* AArch64.S1StartLevel() */
> +	stride = wi->pgshift - 3;
> +	wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);
> +
> +	/* Check for SL mandating LPA2 (which we don't support yet) */
> +	switch (BIT(wi->pgshift)) {
> +	case SZ_4K:
> +		if (wi->sl == -1 &&
> +		    !kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT))
> +			goto addrsz;
> +		break;
> +	case SZ_16K:
> +		if (wi->sl == 0 &&
> +		    !kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT))
> +			goto addrsz;
> +		break;
> +	}
> +
> +	ps = (wi->nvhe ?
> +	      FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));
> +
> +	wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps));
> +
> +	/* Compute minimal alignment */
> +	x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);
> +
> +	wi->baddr = ttbr & TTBRx_EL1_BADDR;
> +	wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);
> +
> +	return 0;
> +
> +addrsz:	/* Address Size Fault level 0 */
> +	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ, false, false);
> +
> +	return -EFAULT;
> +}
> +
> +static int get_ia_size(struct s1_walk_info *wi)
> +{
> +	return 64 - wi->txsz;
> +}
> +
> +static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
> +		   struct s1_walk_result *wr, u64 va)
> +{
> +	u64 va_top, va_bottom, baddr, desc;
> +	int level, stride, ret;
> +
> +	level = wi->sl;
> +	stride = wi->pgshift - 3;
> +	baddr = wi->baddr;
> +
> +	va_top = get_ia_size(wi) - 1;
> +
> +	while (1) {
> +		u64 index, ipa;
> +
> +		va_bottom = (3 - level) * stride + wi->pgshift;
> +		index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);
> +
> +		ipa = baddr | index;
> +
> +		if (wi->s2) {
> +			struct kvm_s2_trans s2_trans = {};
> +
> +			ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans);
> +			if (ret) {
> +				fail_s1_walk(wr,
> +					     (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level,
> +					     true, true);
> +				return ret;
> +			}
> +
> +			if (!kvm_s2_trans_readable(&s2_trans)) {
> +				fail_s1_walk(wr, ESR_ELx_FSC_PERM | level,
> +					     true, true);
> +
> +				return -EPERM;
> +			}
> +
> +			ipa = kvm_s2_trans_output(&s2_trans);
> +		}
> +
> +		ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc));
> +		if (ret) {
> +			fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level),
> +				     true, false);
> +			return ret;
> +		}
> +
> +		if (wi->be)
> +			desc = be64_to_cpu((__force __be64)desc);
> +		else
> +			desc = le64_to_cpu((__force __le64)desc);
> +
> +		if (!(desc & 1) || ((desc & 3) == 1 && level == 3)) {
> +			fail_s1_walk(wr, ESR_ELx_FSC_FAULT | level,
> +				     true, false);
> +			return -ENOENT;
> +		}
> +
> +		/* We found a leaf, handle that */
> +		if ((desc & 3) == 1 || level == 3)
> +			break;
> +
> +		if (!wi->hpd) {
> +			wr->APTable  |= FIELD_GET(PMD_TABLE_AP, desc);
> +			wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);
> +			wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc);
> +		}
> +
> +		baddr = GENMASK_ULL(47, wi->pgshift);
> +
> +		/* Check for out-of-range OA */
> +		if (wi->max_oa_bits < 48 &&
> +		    (baddr & GENMASK_ULL(47, wi->max_oa_bits))) {
> +			fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ | level,
> +				     true, false);
> +			return -EINVAL;
> +		}
> +
> +		/* Prepare for next round */
> +		va_top = va_bottom - 1;
> +		level++;
> +	}
> +
> +	/* Block mapping, check the validity of the level */
> +	if (!(desc & BIT(1))) {
> +		bool valid_block = false;
> +
> +		switch (BIT(wi->pgshift)) {
> +		case SZ_4K:
> +			valid_block = level == 1 || level == 2;
> +			break;
> +		case SZ_16K:
> +		case SZ_64K:
> +			valid_block = level == 2;
> +			break;
> +		}
> +
> +		if (!valid_block) {
> +			fail_s1_walk(wr, ESR_ELx_FSC_FAULT | level,
> +				     true, false);
> +			return -EINVAL;
> +		}
> +	}
> +
> +	wr->failed = false;
> +	wr->level = level;
> +	wr->desc = desc;
> +	wr->pa = desc & GENMASK(47, va_bottom);
> +	if (va_bottom > 12)
> +		wr->pa |= va & GENMASK_ULL(va_bottom - 1, 12);
> +
> +	return 0;
> +}
> +
>  struct mmu_config {
>  	u64	ttbr0;
>  	u64	ttbr1;
> @@ -234,6 +530,177 @@ static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
>  	return par;
>  }
>  
> +static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr)
> +{
> +	u64 par;
> +
> +	if (wr->failed) {
> +		par = SYS_PAR_EL1_RES1;
> +		par |= SYS_PAR_EL1_F;
> +		par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst);
> +		par |= wr->ptw ? SYS_PAR_EL1_PTW : 0;
> +		par |= wr->s2 ? SYS_PAR_EL1_S : 0;
> +	} else if (wr->level == S1_MMU_DISABLED) {
> +		/* MMU off or HCR_EL2.DC == 1 */
> +		par = wr->pa & GENMASK_ULL(47, 12);
> +
> +		if (!(__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
> +			par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */
> +			par |= FIELD_PREP(SYS_PAR_EL1_SH, 0b10); /* OS */
> +		} else {
> +			par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
> +					  MEMATTR(WbRaWa, WbRaWa));
> +			par |= FIELD_PREP(SYS_PAR_EL1_SH, 0b00); /* NS */
> +		}
> +	} else {
> +		u64 mair, sctlr;
> +		int el;
> +		u8 sh;
> +
> +		el = (vcpu_el2_e2h_is_set(vcpu) &&
> +		      vcpu_el2_tge_is_set(vcpu)) ? 2 : 1;
> +
> +		mair = ((el == 2) ?
> +			vcpu_read_sys_reg(vcpu, MAIR_EL2) :
> +			vcpu_read_sys_reg(vcpu, MAIR_EL1));
> +
> +		mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
> +		mair &= 0xff;
> +
> +		sctlr = ((el == 2) ?
> +			vcpu_read_sys_reg(vcpu, SCTLR_EL2) :
> +			vcpu_read_sys_reg(vcpu, SCTLR_EL1));
> +
> +		/* Force NC for memory if SCTLR_ELx.C is clear */
> +		if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair))
> +			mair = MEMATTR(NC, NC);
> +
> +		par  = FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
> +		par |= wr->pa & GENMASK_ULL(47, 12);
> +
> +		sh = compute_sh(mair, wr->desc);
> +		par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
> +	}
> +
> +	return par;
> +}
> +
> +static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
> +{
> +	bool perm_fail, ur, uw, ux, pr, pw, pan;
> +	struct s1_walk_result wr = {};
> +	struct s1_walk_info wi = {};
> +	int ret, idx, el;
> +
> +	/*
> +	 * We only get here from guest EL2, so the translation regime
> +	 * AT applies to is solely defined by {E2H,TGE}.
> +	 */
> +	el = (vcpu_el2_e2h_is_set(vcpu) &&
> +	      vcpu_el2_tge_is_set(vcpu)) ? 2 : 1;
> +
> +	ret = setup_s1_walk(vcpu, &wi, &wr, vaddr, el);
> +	if (ret)
> +		goto compute_par;
> +
> +	if (wr.level == S1_MMU_DISABLED)
> +		goto compute_par;
> +
> +	idx = srcu_read_lock(&vcpu->kvm->srcu);
> +
> +	ret = walk_s1(vcpu, &wi, &wr, vaddr);
> +
> +	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> +
> +	if (ret)
> +		goto compute_par;
> +
> +	/* FIXME: revisit when adding indirect permission support */
> +	if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3) &&
> +	    !wi.nvhe) {
> +		u64 sctlr;
> +
> +		if (el == 1)
> +			sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
> +		else
> +			sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
> +
> +		ux = (sctlr & SCTLR_EL1_EPAN) && !(wr.desc & PTE_UXN);
> +	} else {
> +		ux = false;
> +	}
> +
> +	pw = !(wr.desc & PTE_RDONLY);
> +
> +	if (wi.nvhe) {
> +		ur = uw = false;
> +		pr = true;
> +	} else {
> +		if (wr.desc & PTE_USER) {
> +			ur = pr = true;
> +			uw = pw;
> +		} else {
> +			ur = uw = false;
> +			pr = true;
> +		}
> +	}
> +
> +	/* Apply the Hierarchical Permission madness */
> +	if (wi.nvhe) {
> +		wr.APTable &= BIT(1);
> +		wr.PXNTable = wr.UXNTable;
> +	}
> +
> +	ur &= !(wr.APTable & BIT(0));
> +	uw &= !(wr.APTable != 0);
> +	ux &= !wr.UXNTable;
> +
> +	pw &= !(wr.APTable & BIT(1));
> +
> +	pan = *vcpu_cpsr(vcpu) & PSR_PAN_BIT;
> +
> +	perm_fail = false;
> +
> +	switch (op) {
> +	case OP_AT_S1E1RP:
> +		perm_fail |= pan && (ur || uw || ux);
> +		fallthrough;
> +	case OP_AT_S1E1R:
> +	case OP_AT_S1E2R:
> +		perm_fail |= !pr;
> +		break;
> +	case OP_AT_S1E1WP:
> +		perm_fail |= pan && (ur || uw || ux);
> +		fallthrough;
> +	case OP_AT_S1E1W:
> +	case OP_AT_S1E2W:
> +		perm_fail |= !pw;
> +		break;
> +	case OP_AT_S1E0R:
> +		perm_fail |= !ur;
> +		break;
> +	case OP_AT_S1E0W:
> +		perm_fail |= !uw;
> +		break;
> +	default:
> +		BUG();
> +	}
> +
> +	if (perm_fail) {
> +		struct s1_walk_result tmp;
> +
> +		tmp.failed = true;
> +		tmp.fst = ESR_ELx_FSC_PERM | wr.level;
> +		tmp.s2 = false;
> +		tmp.ptw = false;
> +
> +		wr = tmp;
> +	}
> +
> +compute_par:
> +	return compute_par_s1(vcpu, &wr);
> +}
> +
>  static bool check_at_pan(struct kvm_vcpu *vcpu, u64 vaddr, u64 *res)
>  {
>  	u64 par_e0;
> @@ -266,9 +733,11 @@ void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
>  	struct mmu_config config;
>  	struct kvm_s2_mmu *mmu;
>  	unsigned long flags;
> -	bool fail;
> +	bool fail, retry_slow;
>  	u64 par;
>  
> +	retry_slow = false;
> +
>  	write_lock(&vcpu->kvm->mmu_lock);
>  
>  	/*
> @@ -288,14 +757,15 @@ void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
>  		goto skip_mmu_switch;
>  
>  	/*
> -	 * FIXME: Obtaining the S2 MMU for a L2 is horribly racy, and
> -	 * we may not find it (recycled by another vcpu, for example).
> -	 * See the other FIXME comment below about the need for a SW
> -	 * PTW in this case.
> +	 * Obtaining the S2 MMU for a L2 is horribly racy, and we may not
> +	 * find it (recycled by another vcpu, for example). When this
> +	 * happens, use the SW (slow) path.
>  	 */
>  	mmu = lookup_s2_mmu(vcpu);
> -	if (WARN_ON(!mmu))
> +	if (!mmu) {
> +		retry_slow = true;
>  		goto out;
> +	}
>  
>  	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1),	SYS_TTBR0);
>  	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1),	SYS_TTBR1);
> @@ -331,18 +801,17 @@ void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
>  	}
>  
>  	if (!fail)
> -		par = read_sysreg(par_el1);
> +		par = read_sysreg_par();
>  	else
>  		par = SYS_PAR_EL1_F;
>  
> +	retry_slow = !fail;
> +
>  	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
>  
>  	/*
> -	 * Failed? let's leave the building now.
> -	 *
> -	 * FIXME: how about a failed translation because the shadow S2
> -	 * wasn't populated? We may need to perform a SW PTW,
> -	 * populating our shadow S2 and retry the instruction.
> +	 * Failed? let's leave the building now, unless we retry on
> +	 * the slow path.
>  	 */
>  	if (par & SYS_PAR_EL1_F)
>  		goto nopan;
> @@ -354,29 +823,58 @@ void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
>  	switch (op) {
>  	case OP_AT_S1E1RP:
>  	case OP_AT_S1E1WP:
> +		retry_slow = false;
>  		fail = check_at_pan(vcpu, vaddr, &par);
>  		break;
>  	default:
>  		goto nopan;
>  	}
>  
> +	if (fail) {
> +		vcpu_write_sys_reg(vcpu, SYS_PAR_EL1_F, PAR_EL1);
> +		goto nopan;
> +	}
> +
>  	/*
>  	 * If the EL0 translation has succeeded, we need to pretend
>  	 * the AT operation has failed, as the PAN setting forbids
>  	 * such a translation.
> -	 *
> -	 * FIXME: we hardcode a Level-3 permission fault. We really
> -	 * should return the real fault level.
>  	 */
> -	if (fail || !(par & SYS_PAR_EL1_F))
> -		vcpu_write_sys_reg(vcpu, (0xf << 1) | SYS_PAR_EL1_F, PAR_EL1);
> -
> +	if (par & SYS_PAR_EL1_F) {
> +		u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
> +
> +		/*
> +		 * If we get something other than a permission fault, we
> +		 * need to retry, as we're likely to have missed in the PTs.
> +		 */
> +		if ((fst & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_PERM)
> +			retry_slow = true;
> +	} else {
> +		/*
> +		 * The EL0 access succeeded, but we don't have the full
> +		 * syndrome information to synthesize the failure. Go slow.
> +		 */
> +		retry_slow = true;
> +	}
>  nopan:
>  	__mmu_config_restore(&config);
>  out:
>  	local_irq_restore(flags);
>  
>  	write_unlock(&vcpu->kvm->mmu_lock);
> +
> +	/*
> +	 * If retry_slow is true, then we either are missing shadow S2
> +	 * entries, have paged out guest S1, or something is inconsistent.
> +	 *
> +	 * Either way, we need to walk the PTs by hand so that we can either
> +	 * fault things back in, or record accurate fault information along
> +	 * the way.
> +	 */
> +	if (retry_slow) {
> +		par = handle_at_slow(vcpu, op, vaddr);
> +		vcpu_write_sys_reg(vcpu, par, PAR_EL1);
> +	}
>  }
>  
>  void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
> @@ -433,6 +931,10 @@ void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
>  
>  	write_unlock(&vcpu->kvm->mmu_lock);
>  
> +	/* We failed the translation, let's replay it in slow motion */
> +	if (!fail && (par & SYS_PAR_EL1_F))
> +		par = handle_at_slow(vcpu, op, vaddr);
> +
>  	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
>  }
>  
> -- 
> 2.39.2
> 
>
Marc Zyngier July 22, 2024, 3:25 p.m. UTC | #10
Hi Alex,

On Mon, 22 Jul 2024 11:53:13 +0100,
Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> 
> Hi Marc,
> 
> I would like to use the S1 walker for KVM SPE, and I was planning to move it to
> a separate file, where it would be shared between nested KVM and SPE. I think
> this is also good for NV, since the walker would get more testing.
> 
> Do you think moving it to a shared location is a good approach? Or do you have
> something else in mind?

I'm definitely open to moving it somewhere else if that helps, though
the location doesn't matter much, TBH, and it is the boundary of the
interface I'm more interested in. It may need some work though, as the
current design is solely written with AT in mind.
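
Something with a boundary along these lines, maybe (completely made up,
just to give an idea of the shape -- no such prototype exists anywhere
yet):

	/* Hypothetical shared entry point for the S1 walker */
	int __kvm_walk_s1(struct kvm_vcpu *vcpu, u64 va,
			  struct s1_walk_info *wi, struct s1_walk_result *wr);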

> Also, do you know when you'll be able to send an updated version of this
> series? I'm asking because I want to decide between using this code (with fixes
> on top) and waiting for the next iteration. Please don't feel that you need to
> send the next iteration too soon.

The current state of the branch is at [1], which I plan to send once
-rc1 is out. Note that this isn't a stable branch, so things can
change without any warning!

> And please CC me on the series, so I don't miss it by mistake :)

Of course!

Thanks,

	M.

[1] https://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git/log/?h=kvm-arm64/nv-at-pan-WIP
Alexandru Elisei July 23, 2024, 8:57 a.m. UTC | #11
Hi Marc,

On Mon, Jul 22, 2024 at 04:25:00PM +0100, Marc Zyngier wrote:
> Hi Alex,
> 
> On Mon, 22 Jul 2024 11:53:13 +0100,
> Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> > 
> > Hi Marc,
> > 
> > I would like to use the S1 walker for KVM SPE, and I was planning to move it to
> > a separate file, where it would be shared between nested KVM and SPE. I think
> > this is also good for NV, since the walker would get more testing.
> > 
> > Do you think moving it to a shared location is a good approach? Or do you have
> > something else in mind?
> 
> I'm definitely open to moving it somewhere else if that helps, though
> the location doesn't matter much, TBH, and it is the boundary of the
> interface I'm more interested in. It may need some work though, as the
> current design is solely written with AT in mind.

Looks that way to me too.

> 
> > Also, do you know when you'll be able to send an updated version of this
> > series? I'm asking because I want to decide between using this code (with fixes
> > on top) and waiting for the next iteration. Please don't feel that you need to
> > send the next iteration too soon.
> 
> The current state of the branch is at [1], which I plan to send once
> -rc1 is out. Note that this isn't a stable branch, so things can
> change without any warning!
> 
> > And please CC me on the series, so I don't miss it by mistake :)
> 
> Of course!

Sounds great, thanks!

Alex

> 
> Thanks,
> 
> 	M.
> 
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git/log/?h=kvm-arm64/nv-at-pan-WIP
> 
> -- 
> Without deviation from the norm, progress is not possible.
>
Alexandru Elisei July 25, 2024, 2:16 p.m. UTC | #12
Hi Marc,

On Mon, Jul 08, 2024 at 05:57:58PM +0100, Marc Zyngier wrote:
> In order to plug the brokenness of our current AT implementation,
> we need a SW walker that is going to... err.. walk the S1 tables
> and tell us what it finds.
> 
> Of course, it builds on top of our S2 walker, and share similar
> concepts. The beauty of it is that since it uses kvm_read_guest(),
> it is able to bring back pages that have been otherwise evicted.
> 
> This is then plugged in the two AT S1 emulation functions as
> a "slow path" fallback. I'm not sure it is that slow, but hey.
> 
> Signed-off-by: Marc Zyngier <maz@kernel.org>
> [..]
> +static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
> +{
> +	bool perm_fail, ur, uw, ux, pr, pw, pan;
> +	struct s1_walk_result wr = {};
> +	struct s1_walk_info wi = {};
> +	int ret, idx, el;
> +
> +	/*
> +	 * We only get here from guest EL2, so the translation regime
> +	 * AT applies to is solely defined by {E2H,TGE}.
> +	 */
> +	el = (vcpu_el2_e2h_is_set(vcpu) &&
> +	      vcpu_el2_tge_is_set(vcpu)) ? 2 : 1;
> +
> +	ret = setup_s1_walk(vcpu, &wi, &wr, vaddr, el);
> +	if (ret)
> +		goto compute_par;
> +
> +	if (wr.level == S1_MMU_DISABLED)
> +		goto compute_par;
> +
> +	idx = srcu_read_lock(&vcpu->kvm->srcu);
> +
> +	ret = walk_s1(vcpu, &wi, &wr, vaddr);
> +
> +	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> +
> +	if (ret)
> +		goto compute_par;
> +
> +	/* FIXME: revisit when adding indirect permission support */
> +	if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3) &&
> +	    !wi.nvhe) {
> +		u64 sctlr;
> +
> +		if (el == 1)
> +			sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
> +		else
> +			sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
> +
> +		ux = (sctlr & SCTLR_EL1_EPAN) && !(wr.desc & PTE_UXN);
> +	} else {
> +		ux = false;
> +	}
> +
> +	pw = !(wr.desc & PTE_RDONLY);
> +
> +	if (wi.nvhe) {
> +		ur = uw = false;
> +		pr = true;
> +	} else {
> +		if (wr.desc & PTE_USER) {
> +			ur = pr = true;
> +			uw = pw;
> +		} else {
> +			ur = uw = false;
> +			pr = true;
> +		}
> +	}
> +
> +	/* Apply the Hierarchical Permission madness */
> +	if (wi.nvhe) {
> +		wr.APTable &= BIT(1);
> +		wr.PXNTable = wr.UXNTable;
> +	}
> +
> +	ur &= !(wr.APTable & BIT(0));
> +	uw &= !(wr.APTable != 0);
> +	ux &= !wr.UXNTable;
> +
> +	pw &= !(wr.APTable & BIT(1));
> +
> +	pan = *vcpu_cpsr(vcpu) & PSR_PAN_BIT;
> +
> +	perm_fail = false;
> +
> +	switch (op) {
> +	case OP_AT_S1E1RP:
> +		perm_fail |= pan && (ur || uw || ux);
> +		fallthrough;
> +	case OP_AT_S1E1R:
> +	case OP_AT_S1E2R:
> +		perm_fail |= !pr;
> +		break;
> +	case OP_AT_S1E1WP:
> +		perm_fail |= pan && (ur || uw || ux);
> +		fallthrough;
> +	case OP_AT_S1E1W:
> +	case OP_AT_S1E2W:
> +		perm_fail |= !pw;
> +		break;
> +	case OP_AT_S1E0R:
> +		perm_fail |= !ur;
> +		break;
> +	case OP_AT_S1E0W:
> +		perm_fail |= !uw;
> +		break;
> +	default:
> +		BUG();
> +	}
> +
> +	if (perm_fail) {
> +		struct s1_walk_result tmp;

I was wondering if you would consider initializing 'tmp' to the empty struct
here. That makes it consistent with the initialization of 'wr' in the !perm_fail
case and I think it will make the code more robust wrt to changes to
compute_par_s1() and what fields it accesses.

Thanks,
Alex

> +
> +		tmp.failed = true;
> +		tmp.fst = ESR_ELx_FSC_PERM | wr.level;
> +		tmp.s2 = false;
> +		tmp.ptw = false;
> +
> +		wr = tmp;
> +	}
> +
> +compute_par:
> +	return compute_par_s1(vcpu, &wr);
> +}
Marc Zyngier July 25, 2024, 2:30 p.m. UTC | #13
On Thu, 25 Jul 2024 15:16:12 +0100,
Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> 
> Hi Marc,
> 
> On Mon, Jul 08, 2024 at 05:57:58PM +0100, Marc Zyngier wrote:
> > +	if (perm_fail) {
> > +		struct s1_walk_result tmp;
> 
> I was wondering if you would consider initializing 'tmp' to the empty struct
> here. That makes it consistent with the initialization of 'wr' in the !perm_fail
> case and I think it will make the code more robust wrt to changes to
> compute_par_s1() and what fields it accesses.

I think there is a slightly better way, with something like this:

diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index b02d8dbffd209..36fa2801ab4ef 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -803,12 +803,12 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
 	}
 
 	if (perm_fail) {
-		struct s1_walk_result tmp;
-
-		tmp.failed = true;
-		tmp.fst = ESR_ELx_FSC_PERM | wr.level;
-		tmp.s2 = false;
-		tmp.ptw = false;
+		struct s1_walk_result tmp = (struct s1_walk_result){
+			.failed	= true,
+			.fst	= ESR_ELx_FSC_PERM | wr.level,
+			.s2	= false,
+			.ptw	= false,
+		};
 
 		wr = tmp;
 	}

Thoughts?

	M.
Alexandru Elisei July 25, 2024, 3:13 p.m. UTC | #14
Hi,

On Thu, Jul 25, 2024 at 03:30:00PM +0100, Marc Zyngier wrote:
> On Thu, 25 Jul 2024 15:16:12 +0100,
> Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> > 
> > Hi Marc,
> > 
> > On Mon, Jul 08, 2024 at 05:57:58PM +0100, Marc Zyngier wrote:
> > > +	if (perm_fail) {
> > > +		struct s1_walk_result tmp;
> > 
> > I was wondering if you would consider initializing 'tmp' to the empty struct
> > here. That makes it consistent with the initialization of 'wr' in the !perm_fail
> > case and I think it will make the code more robust wrt to changes to
> > compute_par_s1() and what fields it accesses.
> 
> I think there is a slightly better way, with something like this:
> 
> diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> index b02d8dbffd209..36fa2801ab4ef 100644
> --- a/arch/arm64/kvm/at.c
> +++ b/arch/arm64/kvm/at.c
> @@ -803,12 +803,12 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
>  	}
>  
>  	if (perm_fail) {
> -		struct s1_walk_result tmp;
> -
> -		tmp.failed = true;
> -		tmp.fst = ESR_ELx_FSC_PERM | wr.level;
> -		tmp.s2 = false;
> -		tmp.ptw = false;
> +		struct s1_walk_result tmp = (struct s1_walk_result){
> +			.failed	= true,
> +			.fst	= ESR_ELx_FSC_PERM | wr.level,
> +			.s2	= false,
> +			.ptw	= false,
> +		};
>  
>  		wr = tmp;
>  	}
> 
> Thoughts?

How about (diff against your kvm-arm64/nv-at-pan-WIP branch, in case something
looks off):

diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index b02d8dbffd20..74ebe3223a13 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -802,16 +802,8 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
                BUG();
        }

-       if (perm_fail) {
-               struct s1_walk_result tmp;
-
-               tmp.failed = true;
-               tmp.fst = ESR_ELx_FSC_PERM | wr.level;
-               tmp.s2 = false;
-               tmp.ptw = false;
-
-               wr = tmp;
-       }
+       if (perm_fail)
+               fail_s1_walk(&wr, ESR_ELx_FSC_PERM | wr.level, false, false);

 compute_par:
        return compute_par_s1(vcpu, &wr);

Thanks,
Alex
Marc Zyngier July 25, 2024, 3:33 p.m. UTC | #15
On Thu, 25 Jul 2024 16:13:25 +0100,
Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> 
> Hi,
> 
> On Thu, Jul 25, 2024 at 03:30:00PM +0100, Marc Zyngier wrote:
> > On Thu, 25 Jul 2024 15:16:12 +0100,
> > Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> > > 
> > > Hi Marc,
> > > 
> > > On Mon, Jul 08, 2024 at 05:57:58PM +0100, Marc Zyngier wrote:
> > > > +	if (perm_fail) {
> > > > +		struct s1_walk_result tmp;
> > > 
> > > I was wondering if you would consider initializing 'tmp' to the empty struct
> > > here. That makes it consistent with the initialization of 'wr' in the !perm_fail
> > > case and I think it will make the code more robust wrt to changes to
> > > compute_par_s1() and what fields it accesses.
> > 
> > I think there is a slightly better way, with something like this:
> > 
> > diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> > index b02d8dbffd209..36fa2801ab4ef 100644
> > --- a/arch/arm64/kvm/at.c
> > +++ b/arch/arm64/kvm/at.c
> > @@ -803,12 +803,12 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
> >  	}
> >  
> >  	if (perm_fail) {
> > -		struct s1_walk_result tmp;
> > -
> > -		tmp.failed = true;
> > -		tmp.fst = ESR_ELx_FSC_PERM | wr.level;
> > -		tmp.s2 = false;
> > -		tmp.ptw = false;
> > +		struct s1_walk_result tmp = (struct s1_walk_result){
> > +			.failed	= true,
> > +			.fst	= ESR_ELx_FSC_PERM | wr.level,
> > +			.s2	= false,
> > +			.ptw	= false,
> > +		};
> >  
> >  		wr = tmp;
> >  	}
> > 
> > Thoughts?
> 
> How about (diff against your kvm-arm64/nv-at-pan-WIP branch, in case something
> looks off):
> 
> diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> index b02d8dbffd20..74ebe3223a13 100644
> --- a/arch/arm64/kvm/at.c
> +++ b/arch/arm64/kvm/at.c
> @@ -802,16 +802,8 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
>                 BUG();
>         }
> 
> -       if (perm_fail) {
> -               struct s1_walk_result tmp;
> -
> -               tmp.failed = true;
> -               tmp.fst = ESR_ELx_FSC_PERM | wr.level;
> -               tmp.s2 = false;
> -               tmp.ptw = false;
> -
> -               wr = tmp;
> -       }
> +       if (perm_fail)
> +               fail_s1_walk(&wr, ESR_ELx_FSC_PERM | wr.level, false, false);
> 
>  compute_par:
>         return compute_par_s1(vcpu, &wr);
> 

Ah, much nicer indeed!

Thanks,

	M.
Alexandru Elisei July 29, 2024, 3:26 p.m. UTC | #16
Hi Marc,

On Mon, Jul 08, 2024 at 05:57:58PM +0100, Marc Zyngier wrote:
> In order to plug the brokenness of our current AT implementation,
> we need a SW walker that is going to... err.. walk the S1 tables
> and tell us what it finds.
> 
> Of course, it builds on top of our S2 walker, and share similar
> concepts. The beauty of it is that since it uses kvm_read_guest(),
> it is able to bring back pages that have been otherwise evicted.
> 
> This is then plugged in the two AT S1 emulation functions as
> a "slow path" fallback. I'm not sure it is that slow, but hey.
> 
> Signed-off-by: Marc Zyngier <maz@kernel.org>
> ---
>  arch/arm64/kvm/at.c | 538 ++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 520 insertions(+), 18 deletions(-)
> 
> diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> index 71e3390b43b4c..8452273cbff6d 100644
> --- a/arch/arm64/kvm/at.c
> +++ b/arch/arm64/kvm/at.c
> @@ -4,9 +4,305 @@
>   * Author: Jintack Lim <jintack.lim@linaro.org>
>   */
>  
> +#include <linux/kvm_host.h>
> +
> +#include <asm/esr.h>
>  #include <asm/kvm_hyp.h>
>  #include <asm/kvm_mmu.h>
>  
> +struct s1_walk_info {
> +	u64	     baddr;
> +	unsigned int max_oa_bits;
> +	unsigned int pgshift;
> +	unsigned int txsz;
> +	int 	     sl;
> +	bool	     hpd;
> +	bool	     be;
> +	bool	     nvhe;
> +	bool	     s2;
> +};
> +
> +struct s1_walk_result {
> +	union {
> +		struct {
> +			u64	desc;
> +			u64	pa;
> +			s8	level;
> +			u8	APTable;
> +			bool	UXNTable;
> +			bool	PXNTable;
> +		};
> +		struct {
> +			u8	fst;
> +			bool	ptw;
> +			bool	s2;
> +		};
> +	};
> +	bool	failed;
> +};
> +
> +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2)
> +{
> +	wr->fst		= fst;
> +	wr->ptw		= ptw;
> +	wr->s2		= s2;
> +	wr->failed	= true;
> +}
> +
> +#define S1_MMU_DISABLED		(-127)
> +
> +static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
> +			 struct s1_walk_result *wr, const u64 va, const int el)
> +{
> +	u64 sctlr, tcr, tg, ps, ia_bits, ttbr;
> +	unsigned int stride, x;
> +	bool va55, tbi;
> +
> +	wi->nvhe = el == 2 && !vcpu_el2_e2h_is_set(vcpu);

Where 'el' is computed in handle_at_slow() as:

	/*
	 * We only get here from guest EL2, so the translation regime
	 * AT applies to is solely defined by {E2H,TGE}.
	 */
	el = (vcpu_el2_e2h_is_set(vcpu) &&
	      vcpu_el2_tge_is_set(vcpu)) ? 2 : 1;

I think 'nvhe' will always be false ('el' is 2 only when E2H is set).

I'm curious about what 'el' represents. The translation regime for the AT
instruction?

Thanks,
Alex
Marc Zyngier July 31, 2024, 8:55 a.m. UTC | #17
On Mon, 29 Jul 2024 16:26:00 +0100,
Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> 
> Hi Marc,
> 
> On Mon, Jul 08, 2024 at 05:57:58PM +0100, Marc Zyngier wrote:
> > In order to plug the brokenness of our current AT implementation,
> > we need a SW walker that is going to... err.. walk the S1 tables
> > and tell us what it finds.
> > 
> > Of course, it builds on top of our S2 walker, and share similar
> > concepts. The beauty of it is that since it uses kvm_read_guest(),
> > it is able to bring back pages that have been otherwise evicted.
> > 
> > This is then plugged in the two AT S1 emulation functions as
> > a "slow path" fallback. I'm not sure it is that slow, but hey.
> > 
> > Signed-off-by: Marc Zyngier <maz@kernel.org>
> > ---
> >  arch/arm64/kvm/at.c | 538 ++++++++++++++++++++++++++++++++++++++++++--
> >  1 file changed, 520 insertions(+), 18 deletions(-)
> > 
> > diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> > index 71e3390b43b4c..8452273cbff6d 100644
> > --- a/arch/arm64/kvm/at.c
> > +++ b/arch/arm64/kvm/at.c
> > @@ -4,9 +4,305 @@
> >   * Author: Jintack Lim <jintack.lim@linaro.org>
> >   */
> >  
> > +#include <linux/kvm_host.h>
> > +
> > +#include <asm/esr.h>
> >  #include <asm/kvm_hyp.h>
> >  #include <asm/kvm_mmu.h>
> >  
> > +struct s1_walk_info {
> > +	u64	     baddr;
> > +	unsigned int max_oa_bits;
> > +	unsigned int pgshift;
> > +	unsigned int txsz;
> > +	int 	     sl;
> > +	bool	     hpd;
> > +	bool	     be;
> > +	bool	     nvhe;
> > +	bool	     s2;
> > +};
> > +
> > +struct s1_walk_result {
> > +	union {
> > +		struct {
> > +			u64	desc;
> > +			u64	pa;
> > +			s8	level;
> > +			u8	APTable;
> > +			bool	UXNTable;
> > +			bool	PXNTable;
> > +		};
> > +		struct {
> > +			u8	fst;
> > +			bool	ptw;
> > +			bool	s2;
> > +		};
> > +	};
> > +	bool	failed;
> > +};
> > +
> > +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2)
> > +{
> > +	wr->fst		= fst;
> > +	wr->ptw		= ptw;
> > +	wr->s2		= s2;
> > +	wr->failed	= true;
> > +}
> > +
> > +#define S1_MMU_DISABLED		(-127)
> > +
> > +static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
> > +			 struct s1_walk_result *wr, const u64 va, const int el)
> > +{
> > +	u64 sctlr, tcr, tg, ps, ia_bits, ttbr;
> > +	unsigned int stride, x;
> > +	bool va55, tbi;
> > +
> > +	wi->nvhe = el == 2 && !vcpu_el2_e2h_is_set(vcpu);
> 
> Where 'el' is computed in handle_at_slow() as:
> 
> 	/*
> 	 * We only get here from guest EL2, so the translation regime
> 	 * AT applies to is solely defined by {E2H,TGE}.
> 	 */
> 	el = (vcpu_el2_e2h_is_set(vcpu) &&
> 	      vcpu_el2_tge_is_set(vcpu)) ? 2 : 1;
> 
> I think 'nvhe' will always be false ('el' is 2 only when E2H is
> set).

Yeah, there is a number of problems here. el should depend on both the
instruction (some are EL2-specific) and the HCR control bits. I'll
tackle that now.
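
Roughly this sort of thing (a sketch of the direction only, not the
actual fix):

	switch (op) {
	case OP_AT_S1E2R:
	case OP_AT_S1E2W:
		/* The EL2 variants always target the EL2 regime */
		el = 2;
		break;
	default:
		/* S1E1x/S1E0x: EL2&0 if {E2H,TGE} == {1,1}, EL1&0 otherwise */
		el = (vcpu_el2_e2h_is_set(vcpu) &&
		      vcpu_el2_tge_is_set(vcpu)) ? 2 : 1;
		break;
	}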

> I'm curious about what 'el' represents. The translation regime for the AT
> instruction?

Exactly that.

Thanks,

	M.
Alexandru Elisei July 31, 2024, 9:53 a.m. UTC | #18
Hi,

On Wed, Jul 31, 2024 at 09:55:28AM +0100, Marc Zyngier wrote:
> On Mon, 29 Jul 2024 16:26:00 +0100,
> Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> > 
> > Hi Marc,
> > 
> > On Mon, Jul 08, 2024 at 05:57:58PM +0100, Marc Zyngier wrote:
> > > In order to plug the brokenness of our current AT implementation,
> > > we need a SW walker that is going to... err.. walk the S1 tables
> > > and tell us what it finds.
> > > 
> > > Of course, it builds on top of our S2 walker, and share similar
> > > concepts. The beauty of it is that since it uses kvm_read_guest(),
> > > it is able to bring back pages that have been otherwise evicted.
> > > 
> > > This is then plugged in the two AT S1 emulation functions as
> > > a "slow path" fallback. I'm not sure it is that slow, but hey.
> > > 
> > > Signed-off-by: Marc Zyngier <maz@kernel.org>
> > > ---
> > >  arch/arm64/kvm/at.c | 538 ++++++++++++++++++++++++++++++++++++++++++--
> > >  1 file changed, 520 insertions(+), 18 deletions(-)
> > > 
> > > diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> > > index 71e3390b43b4c..8452273cbff6d 100644
> > > --- a/arch/arm64/kvm/at.c
> > > +++ b/arch/arm64/kvm/at.c
> > > @@ -4,9 +4,305 @@
> > >   * Author: Jintack Lim <jintack.lim@linaro.org>
> > >   */
> > >  
> > > +#include <linux/kvm_host.h>
> > > +
> > > +#include <asm/esr.h>
> > >  #include <asm/kvm_hyp.h>
> > >  #include <asm/kvm_mmu.h>
> > >  
> > > +struct s1_walk_info {
> > > +	u64	     baddr;
> > > +	unsigned int max_oa_bits;
> > > +	unsigned int pgshift;
> > > +	unsigned int txsz;
> > > +	int 	     sl;
> > > +	bool	     hpd;
> > > +	bool	     be;
> > > +	bool	     nvhe;
> > > +	bool	     s2;
> > > +};
> > > +
> > > +struct s1_walk_result {
> > > +	union {
> > > +		struct {
> > > +			u64	desc;
> > > +			u64	pa;
> > > +			s8	level;
> > > +			u8	APTable;
> > > +			bool	UXNTable;
> > > +			bool	PXNTable;
> > > +		};
> > > +		struct {
> > > +			u8	fst;
> > > +			bool	ptw;
> > > +			bool	s2;
> > > +		};
> > > +	};
> > > +	bool	failed;
> > > +};
> > > +
> > > +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2)
> > > +{
> > > +	wr->fst		= fst;
> > > +	wr->ptw		= ptw;
> > > +	wr->s2		= s2;
> > > +	wr->failed	= true;
> > > +}
> > > +
> > > +#define S1_MMU_DISABLED		(-127)
> > > +
> > > +static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
> > > +			 struct s1_walk_result *wr, const u64 va, const int el)
> > > +{
> > > +	u64 sctlr, tcr, tg, ps, ia_bits, ttbr;
> > > +	unsigned int stride, x;
> > > +	bool va55, tbi;
> > > +
> > > +	wi->nvhe = el == 2 && !vcpu_el2_e2h_is_set(vcpu);
> > 
> > Where 'el' is computed in handle_at_slow() as:
> > 
> > 	/*
> > 	 * We only get here from guest EL2, so the translation regime
> > 	 * AT applies to is solely defined by {E2H,TGE}.
> > 	 */
> > 	el = (vcpu_el2_e2h_is_set(vcpu) &&
> > 	      vcpu_el2_tge_is_set(vcpu)) ? 2 : 1;
> > 
> > I think 'nvhe' will always be false ('el' is 2 only when E2H is
> > set).
> 
> Yeah, there is a number of problems here. el should depend on both the
> instruction (some are EL2-specific) and the HCR control bits. I'll
> tackle that now.

Yeah, also noticed that how sctlr, tcr and ttbr are chosen in setup_s1_walk()
doesn't look quite right for the nvhe case.

> 
> > I'm curious about what 'el' represents. The translation regime for the AT
> > instruction?
> 
> Exactly that.

Might I make a suggestion here? I was thinking about dropping the (el, wi->nvhe*)
tuple to represent the translation regime and have a wi->regime (or similar) to
unambiguously encode the regime. The value can be an enum with three values to
represent the three possible regimes (REGIME_EL10, REGIME_EL2, REGIME_EL20).
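
For illustration, the kind of thing I had in mind (the enum tag and the
exact spellings are just placeholders, nothing from the actual branch):

	enum trans_regime {
		REGIME_EL10,	/* EL1&0, possibly subject to stage 2 */
		REGIME_EL2,	/* EL2, HCR_EL2.E2H == 0 */
		REGIME_EL20,	/* EL2&0, HCR_EL2.E2H == 1 */
	};

with struct s1_walk_info carrying a single 'regime' field instead of the
(el, nvhe) pair.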

Just a thought though, feel free to ignore at your leisure.

*wi->single_range on the kvm-arm64/nv-at-pan-WIP branch.

Thanks,
Alex
Marc Zyngier July 31, 2024, 10:18 a.m. UTC | #19
On Wed, 31 Jul 2024 10:53:14 +0100,
Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> 
> Hi,
> 
> On Wed, Jul 31, 2024 at 09:55:28AM +0100, Marc Zyngier wrote:
> > On Mon, 29 Jul 2024 16:26:00 +0100,
> > Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> > > 
> > > Hi Marc,
> > > 
> > > On Mon, Jul 08, 2024 at 05:57:58PM +0100, Marc Zyngier wrote:
> > > > In order to plug the brokenness of our current AT implementation,
> > > > we need a SW walker that is going to... err.. walk the S1 tables
> > > > and tell us what it finds.
> > > > 
> > > > Of course, it builds on top of our S2 walker, and share similar
> > > > concepts. The beauty of it is that since it uses kvm_read_guest(),
> > > > it is able to bring back pages that have been otherwise evicted.
> > > > 
> > > > This is then plugged in the two AT S1 emulation functions as
> > > > a "slow path" fallback. I'm not sure it is that slow, but hey.
> > > > 
> > > > Signed-off-by: Marc Zyngier <maz@kernel.org>
> > > > ---
> > > >  arch/arm64/kvm/at.c | 538 ++++++++++++++++++++++++++++++++++++++++++--
> > > >  1 file changed, 520 insertions(+), 18 deletions(-)
> > > > 
> > > > diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> > > > index 71e3390b43b4c..8452273cbff6d 100644
> > > > --- a/arch/arm64/kvm/at.c
> > > > +++ b/arch/arm64/kvm/at.c
> > > > @@ -4,9 +4,305 @@
> > > >   * Author: Jintack Lim <jintack.lim@linaro.org>
> > > >   */
> > > >  
> > > > +#include <linux/kvm_host.h>
> > > > +
> > > > +#include <asm/esr.h>
> > > >  #include <asm/kvm_hyp.h>
> > > >  #include <asm/kvm_mmu.h>
> > > >  
> > > > +struct s1_walk_info {
> > > > +	u64	     baddr;
> > > > +	unsigned int max_oa_bits;
> > > > +	unsigned int pgshift;
> > > > +	unsigned int txsz;
> > > > +	int 	     sl;
> > > > +	bool	     hpd;
> > > > +	bool	     be;
> > > > +	bool	     nvhe;
> > > > +	bool	     s2;
> > > > +};
> > > > +
> > > > +struct s1_walk_result {
> > > > +	union {
> > > > +		struct {
> > > > +			u64	desc;
> > > > +			u64	pa;
> > > > +			s8	level;
> > > > +			u8	APTable;
> > > > +			bool	UXNTable;
> > > > +			bool	PXNTable;
> > > > +		};
> > > > +		struct {
> > > > +			u8	fst;
> > > > +			bool	ptw;
> > > > +			bool	s2;
> > > > +		};
> > > > +	};
> > > > +	bool	failed;
> > > > +};
> > > > +
> > > > +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2)
> > > > +{
> > > > +	wr->fst		= fst;
> > > > +	wr->ptw		= ptw;
> > > > +	wr->s2		= s2;
> > > > +	wr->failed	= true;
> > > > +}
> > > > +
> > > > +#define S1_MMU_DISABLED		(-127)
> > > > +
> > > > +static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
> > > > +			 struct s1_walk_result *wr, const u64 va, const int el)
> > > > +{
> > > > +	u64 sctlr, tcr, tg, ps, ia_bits, ttbr;
> > > > +	unsigned int stride, x;
> > > > +	bool va55, tbi;
> > > > +
> > > > +	wi->nvhe = el == 2 && !vcpu_el2_e2h_is_set(vcpu);
> > > 
> > > Where 'el' is computed in handle_at_slow() as:
> > > 
> > > 	/*
> > > 	 * We only get here from guest EL2, so the translation regime
> > > 	 * AT applies to is solely defined by {E2H,TGE}.
> > > 	 */
> > > 	el = (vcpu_el2_e2h_is_set(vcpu) &&
> > > 	      vcpu_el2_tge_is_set(vcpu)) ? 2 : 1;
> > > 
> > > I think 'nvhe' will always be false ('el' is 2 only when E2H is
> > > set).
> > 
> > Yeah, there is a number of problems here. el should depend on both the
> > instruction (some are EL2-specific) and the HCR control bits. I'll
> > tackle that now.
> 
> Yeah, also noticed that how sctlr, tcr and ttbr are chosen in setup_s1_walk()
> doesn't look quite right for the nvhe case.

Are you sure? Assuming the 'el' value is correct (and I think I fixed
that on my local branch), they seem correct to me (we check for va55
early in the function to avoid an later issue).

Can you point out what exactly fails in that logic?

>
> > 
> > > I'm curious about what 'el' represents. The translation regime for the AT
> > > instruction?
> > 
> > Exactly that.
> 
> > Might I make a suggestion here? I was thinking about dropping the (el, wi->nvhe*)
> tuple to represent the translation regime and have a wi->regime (or similar) to
> unambiguously encode the regime. The value can be an enum with three values to
> represent the three possible regimes (REGIME_EL10, REGIME_EL2, REGIME_EL20).

I've been thinking of that, but I'm wondering whether that just
results in pretty awful code in the end, because we go from 2 cases
(el==1 or el==2) to 3. But most of the time, we don't care about the
E2H=0 case, because we can handle it just like E2H=1.
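
Something like a trivial helper would probably keep most of the code to
two cases (this assumes the enum from your suggestion, and both names
are purely illustrative):

	/* Most users only care about "EL2-ish vs EL1&0" */
	static bool regime_is_el2x(const struct s1_walk_info *wi)
	{
		return wi->regime != REGIME_EL10;
	}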

I'll give it a go and see what it looks like.

Thanks,

	M.
Alexandru Elisei July 31, 2024, 10:28 a.m. UTC | #20
Hi,

On Wed, Jul 31, 2024 at 11:18:06AM +0100, Marc Zyngier wrote:
> On Wed, 31 Jul 2024 10:53:14 +0100,
> Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> > 
> > Hi,
> > 
> > On Wed, Jul 31, 2024 at 09:55:28AM +0100, Marc Zyngier wrote:
> > > On Mon, 29 Jul 2024 16:26:00 +0100,
> > > Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> > > > 
> > > > Hi Marc,
> > > > 
> > > > On Mon, Jul 08, 2024 at 05:57:58PM +0100, Marc Zyngier wrote:
> > > > > In order to plug the brokenness of our current AT implementation,
> > > > > we need a SW walker that is going to... err.. walk the S1 tables
> > > > > and tell us what it finds.
> > > > > 
> > > > > Of course, it builds on top of our S2 walker, and share similar
> > > > > concepts. The beauty of it is that since it uses kvm_read_guest(),
> > > > > it is able to bring back pages that have been otherwise evicted.
> > > > > 
> > > > > This is then plugged in the two AT S1 emulation functions as
> > > > > a "slow path" fallback. I'm not sure it is that slow, but hey.
> > > > > 
> > > > > Signed-off-by: Marc Zyngier <maz@kernel.org>
> > > > > ---
> > > > >  arch/arm64/kvm/at.c | 538 ++++++++++++++++++++++++++++++++++++++++++--
> > > > >  1 file changed, 520 insertions(+), 18 deletions(-)
> > > > > 
> > > > > diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> > > > > index 71e3390b43b4c..8452273cbff6d 100644
> > > > > --- a/arch/arm64/kvm/at.c
> > > > > +++ b/arch/arm64/kvm/at.c
> > > > > @@ -4,9 +4,305 @@
> > > > >   * Author: Jintack Lim <jintack.lim@linaro.org>
> > > > >   */
> > > > >  
> > > > > +#include <linux/kvm_host.h>
> > > > > +
> > > > > +#include <asm/esr.h>
> > > > >  #include <asm/kvm_hyp.h>
> > > > >  #include <asm/kvm_mmu.h>
> > > > >  
> > > > > +struct s1_walk_info {
> > > > > +	u64	     baddr;
> > > > > +	unsigned int max_oa_bits;
> > > > > +	unsigned int pgshift;
> > > > > +	unsigned int txsz;
> > > > > +	int 	     sl;
> > > > > +	bool	     hpd;
> > > > > +	bool	     be;
> > > > > +	bool	     nvhe;
> > > > > +	bool	     s2;
> > > > > +};
> > > > > +
> > > > > +struct s1_walk_result {
> > > > > +	union {
> > > > > +		struct {
> > > > > +			u64	desc;
> > > > > +			u64	pa;
> > > > > +			s8	level;
> > > > > +			u8	APTable;
> > > > > +			bool	UXNTable;
> > > > > +			bool	PXNTable;
> > > > > +		};
> > > > > +		struct {
> > > > > +			u8	fst;
> > > > > +			bool	ptw;
> > > > > +			bool	s2;
> > > > > +		};
> > > > > +	};
> > > > > +	bool	failed;
> > > > > +};
> > > > > +
> > > > > +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2)
> > > > > +{
> > > > > +	wr->fst		= fst;
> > > > > +	wr->ptw		= ptw;
> > > > > +	wr->s2		= s2;
> > > > > +	wr->failed	= true;
> > > > > +}
> > > > > +
> > > > > +#define S1_MMU_DISABLED		(-127)
> > > > > +
> > > > > +static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
> > > > > +			 struct s1_walk_result *wr, const u64 va, const int el)
> > > > > +{
> > > > > +	u64 sctlr, tcr, tg, ps, ia_bits, ttbr;
> > > > > +	unsigned int stride, x;
> > > > > +	bool va55, tbi;
> > > > > +
> > > > > +	wi->nvhe = el == 2 && !vcpu_el2_e2h_is_set(vcpu);
> > > > 
> > > > Where 'el' is computed in handle_at_slow() as:
> > > > 
> > > > 	/*
> > > > 	 * We only get here from guest EL2, so the translation regime
> > > > 	 * AT applies to is solely defined by {E2H,TGE}.
> > > > 	 */
> > > > 	el = (vcpu_el2_e2h_is_set(vcpu) &&
> > > > 	      vcpu_el2_tge_is_set(vcpu)) ? 2 : 1;
> > > > 
> > > > I think 'nvhe' will always be false ('el' is 2 only when E2H is
> > > > set).
> > > 
> > > Yeah, there is a number of problems here. el should depend on both the
> > > instruction (some are EL2-specific) and the HCR control bits. I'll
> > > tackle that now.
> > 
> > Yeah, also noticed that how sctlr, tcr and ttbr are chosen in setup_s1_walk()
> > doesn't look quite right for the nvhe case.
> 
> Are you sure? Assuming the 'el' value is correct (and I think I fixed
> that on my local branch), they seem correct to me (we check for va55
> early in the function to avoid an later issue).
> 
> Can you point out what exactly fails in that logic?

I was trying to say that another consequence of el being 1 in the nvhe case was
that sctlr, tcr and ttbr were read from the EL1 variants of the registers,
instead of EL2. Sorry if that wasn't clear.

Thanks,
Alex

> 
> >
> > > 
> > > > I'm curious about what 'el' represents. The translation regime for the AT
> > > > instruction?
> > > 
> > > Exactly that.
> > 
> > Might I make a suggestion here? I was thinking about dropping the (el, wi->nvhe*)
> > tuple to represent the translation regime and have a wi->regime (or similar) to
> > unambiguously encode the regime. The value can be an enum with three values to
> > represent the three possible regimes (REGIME_EL10, REGIME_EL2, REGIME_EL20).
> 
> I've been thinking of that, but I'm wondering whether that just
> results in pretty awful code in the end, because we go from 2 cases
> (el==1 or el==2) to 3. But most of the time, we don't care about the
> E2H=0 case, because we can handle it just like E2H=1.
> 
> I'll give it a go and see what it looks like.
> 
> Thanks,
> 
> 	M.
> 
> -- 
> Without deviation from the norm, progress is not possible.
Alexandru Elisei July 31, 2024, 2:33 p.m. UTC | #21
Hi Marc,

On Mon, Jul 08, 2024 at 05:57:58PM +0100, Marc Zyngier wrote:
> In order to plug the brokenness of our current AT implementation,
> we need a SW walker that is going to... err.. walk the S1 tables
> and tell us what it finds.
> 
> Of course, it builds on top of our S2 walker, and share similar
> concepts. The beauty of it is that since it uses kvm_read_guest(),
> it is able to bring back pages that have been otherwise evicted.
> 
> This is then plugged in the two AT S1 emulation functions as
> a "slow path" fallback. I'm not sure it is that slow, but hey.
> 
> Signed-off-by: Marc Zyngier <maz@kernel.org>
> ---
>  arch/arm64/kvm/at.c | 538 ++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 520 insertions(+), 18 deletions(-)
> 
> diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> index 71e3390b43b4c..8452273cbff6d 100644
> --- a/arch/arm64/kvm/at.c
> +++ b/arch/arm64/kvm/at.c
> @@ -4,9 +4,305 @@
>   * Author: Jintack Lim <jintack.lim@linaro.org>
>   */
>  
> +#include <linux/kvm_host.h>
> +
> +#include <asm/esr.h>
>  #include <asm/kvm_hyp.h>
>  #include <asm/kvm_mmu.h>
>  
> +struct s1_walk_info {
> +	u64	     baddr;
> +	unsigned int max_oa_bits;
> +	unsigned int pgshift;
> +	unsigned int txsz;
> +	int 	     sl;
> +	bool	     hpd;
> +	bool	     be;
> +	bool	     nvhe;
> +	bool	     s2;
> +};
> +
> +struct s1_walk_result {
> +	union {
> +		struct {
> +			u64	desc;
> +			u64	pa;
> +			s8	level;
> +			u8	APTable;
> +			bool	UXNTable;
> +			bool	PXNTable;
> +		};
> +		struct {
> +			u8	fst;
> +			bool	ptw;
> +			bool	s2;
> +		};
> +	};
> +	bool	failed;
> +};
> +
> +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2)
> +{
> +	wr->fst		= fst;
> +	wr->ptw		= ptw;
> +	wr->s2		= s2;
> +	wr->failed	= true;
> +}
> +
> +#define S1_MMU_DISABLED		(-127)
> +
> +static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
> +			 struct s1_walk_result *wr, const u64 va, const int el)
> +{
> +	u64 sctlr, tcr, tg, ps, ia_bits, ttbr;
> +	unsigned int stride, x;
> +	bool va55, tbi;
> +
> +	wi->nvhe = el == 2 && !vcpu_el2_e2h_is_set(vcpu);
> +
> +	va55 = va & BIT(55);
> +
> +	if (wi->nvhe && va55)
> +		goto addrsz;
> +
> +	wi->s2 = el < 2 && (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_VM);
> +
> +	switch (el) {
> +	case 1:
> +		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL1);
> +		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL1);
> +		ttbr	= (va55 ?
> +			   vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
> +			   vcpu_read_sys_reg(vcpu, TTBR0_EL1));
> +		break;
> +	case 2:
> +		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL2);
> +		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL2);
> +		ttbr	= (va55 ?
> +			   vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
> +			   vcpu_read_sys_reg(vcpu, TTBR0_EL2));
> +		break;
> +	default:
> +		BUG();
> +	}
> +
> +	/* Let's put the MMU disabled case aside immediately */
> +	if (!(sctlr & SCTLR_ELx_M) ||
> +	    (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
> +		if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))

As far as I can tell, when TBI is set, the pseudocode ignores bits 63:56
when checking for an out-of-bounds VA, both in the MMU disabled case
(above) and in the MMU enabled case (below). That also matches the
description of the TBIx bits in the TCR_ELx registers.
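
To make it concrete, for the MMU disabled check I would have expected
something along these lines (just a sketch, and it assumes tbi has
already been computed at that point):

	u64 oa = tbi ? (va & GENMASK_ULL(55, 0)) : va;

	/* Only check against the PA size once the tag byte is ignored */
	if (oa >= BIT(kvm_get_pa_bits(vcpu->kvm)))
		goto addrsz;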

Thanks,
Alex

> +			goto addrsz;
> +
> +		wr->level = S1_MMU_DISABLED;
> +		wr->desc = va;
> +		return 0;
> +	}
> +
> +	wi->be = sctlr & SCTLR_ELx_EE;
> +
> +	wi->hpd  = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP);
> +	wi->hpd &= (wi->nvhe ?
> +		    FIELD_GET(TCR_EL2_HPD, tcr) :
> +		    (va55 ?
> +		     FIELD_GET(TCR_HPD1, tcr) :
> +		     FIELD_GET(TCR_HPD0, tcr)));
> +
> +	tbi = (wi->nvhe ?
> +	       FIELD_GET(TCR_EL2_TBI, tcr) :
> +	       (va55 ?
> +		FIELD_GET(TCR_TBI1, tcr) :
> +		FIELD_GET(TCR_TBI0, tcr)));
> +
> +	if (!tbi && sign_extend64(va, 55) != (s64)va)
> +		goto addrsz;
> +
> +	/* Someone was silly enough to encode TG0/TG1 differently */
> +	if (va55) {
> +		wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
> +		tg = FIELD_GET(TCR_TG1_MASK, tcr);
> +
> +		switch (tg << TCR_TG1_SHIFT) {
> +		case TCR_TG1_4K:
> +			wi->pgshift = 12;	 break;
> +		case TCR_TG1_16K:
> +			wi->pgshift = 14;	 break;
> +		case TCR_TG1_64K:
> +		default:	    /* IMPDEF: treat any other value as 64k */
> +			wi->pgshift = 16;	 break;
> +		}
> +	} else {
> +		wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
> +		tg = FIELD_GET(TCR_TG0_MASK, tcr);
> +
> +		switch (tg << TCR_TG0_SHIFT) {
> +		case TCR_TG0_4K:
> +			wi->pgshift = 12;	 break;
> +		case TCR_TG0_16K:
> +			wi->pgshift = 14;	 break;
> +		case TCR_TG0_64K:
> +		default:	    /* IMPDEF: treat any other value as 64k */
> +			wi->pgshift = 16;	 break;
> +		}
> +	}
> +
> +	ia_bits = 64 - wi->txsz;
> +
> +	/* AArch64.S1StartLevel() */
> +	stride = wi->pgshift - 3;
> +	wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);
> +
> +	/* Check for SL mandating LPA2 (which we don't support yet) */
> +	switch (BIT(wi->pgshift)) {
> +	case SZ_4K:
> +		if (wi->sl == -1 &&
> +		    !kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT))
> +			goto addrsz;
> +		break;
> +	case SZ_16K:
> +		if (wi->sl == 0 &&
> +		    !kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT))
> +			goto addrsz;
> +		break;
> +	}
> +
> +	ps = (wi->nvhe ?
> +	      FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));
> +
> +	wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps));
> +
> +	/* Compute minimal alignment */
> +	x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);
> +
> +	wi->baddr = ttbr & TTBRx_EL1_BADDR;
> +	wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);
> +
> +	return 0;
> +
> +addrsz:	/* Address Size Fault level 0 */
> +	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ, false, false);
> +
> +	return -EFAULT;
> +}
> +
> +static int get_ia_size(struct s1_walk_info *wi)
> +{
> +	return 64 - wi->txsz;
> +}
> +
> +static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
> +		   struct s1_walk_result *wr, u64 va)
> +{
> +	u64 va_top, va_bottom, baddr, desc;
> +	int level, stride, ret;
> +
> +	level = wi->sl;
> +	stride = wi->pgshift - 3;
> +	baddr = wi->baddr;
> +
> +	va_top = get_ia_size(wi) - 1;
> +
> +	while (1) {
> +		u64 index, ipa;
> +
> +		va_bottom = (3 - level) * stride + wi->pgshift;
> +		index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);
> +
> +		ipa = baddr | index;
> +
> +		if (wi->s2) {
> +			struct kvm_s2_trans s2_trans = {};
> +
> +			ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans);
> +			if (ret) {
> +				fail_s1_walk(wr,
> +					     (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level,
> +					     true, true);
> +				return ret;
> +			}
> +
> +			if (!kvm_s2_trans_readable(&s2_trans)) {
> +				fail_s1_walk(wr, ESR_ELx_FSC_PERM | level,
> +					     true, true);
> +
> +				return -EPERM;
> +			}
> +
> +			ipa = kvm_s2_trans_output(&s2_trans);
> +		}
> +
> +		ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc));
> +		if (ret) {
> +			fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level),
> +				     true, false);
> +			return ret;
> +		}
> +
> +		if (wi->be)
> +			desc = be64_to_cpu((__force __be64)desc);
> +		else
> +			desc = le64_to_cpu((__force __le64)desc);
> +
> +		if (!(desc & 1) || ((desc & 3) == 1 && level == 3)) {
> +			fail_s1_walk(wr, ESR_ELx_FSC_FAULT | level,
> +				     true, false);
> +			return -ENOENT;
> +		}
> +
> +		/* We found a leaf, handle that */
> +		if ((desc & 3) == 1 || level == 3)
> +			break;
> +
> +		if (!wi->hpd) {
> +			wr->APTable  |= FIELD_GET(PMD_TABLE_AP, desc);
> +			wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);
> +			wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc);
> +		}
> +
> +		baddr = desc & GENMASK_ULL(47, wi->pgshift);
> +
> +		/* Check for out-of-range OA */
> +		if (wi->max_oa_bits < 48 &&
> +		    (baddr & GENMASK_ULL(47, wi->max_oa_bits))) {
> +			fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ | level,
> +				     true, false);
> +			return -EINVAL;
> +		}
> +
> +		/* Prepare for next round */
> +		va_top = va_bottom - 1;
> +		level++;
> +	}
> +
> +	/* Block mapping, check the validity of the level */
> +	if (!(desc & BIT(1))) {
> +		bool valid_block = false;
> +
> +		switch (BIT(wi->pgshift)) {
> +		case SZ_4K:
> +			valid_block = level == 1 || level == 2;
> +			break;
> +		case SZ_16K:
> +		case SZ_64K:
> +			valid_block = level == 2;
> +			break;
> +		}
> +
> +		if (!valid_block) {
> +			fail_s1_walk(wr, ESR_ELx_FSC_FAULT | level,
> +				     true, false);
> +			return -EINVAL;
> +		}
> +	}
> +
> +	wr->failed = false;
> +	wr->level = level;
> +	wr->desc = desc;
> +	wr->pa = desc & GENMASK(47, va_bottom);
> +	if (va_bottom > 12)
> +		wr->pa |= va & GENMASK_ULL(va_bottom - 1, 12);
> +
> +	return 0;
> +}
> +
>  struct mmu_config {
>  	u64	ttbr0;
>  	u64	ttbr1;
> @@ -234,6 +530,177 @@ static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
>  	return par;
>  }
>  
> +static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr)
> +{
> +	u64 par;
> +
> +	if (wr->failed) {
> +		par = SYS_PAR_EL1_RES1;
> +		par |= SYS_PAR_EL1_F;
> +		par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst);
> +		par |= wr->ptw ? SYS_PAR_EL1_PTW : 0;
> +		par |= wr->s2 ? SYS_PAR_EL1_S : 0;
> +	} else if (wr->level == S1_MMU_DISABLED) {
> +		/* MMU off or HCR_EL2.DC == 1 */
> +		par = wr->pa & GENMASK_ULL(47, 12);
> +
> +		if (!(__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
> +			par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */
> +			par |= FIELD_PREP(SYS_PAR_EL1_SH, 0b10); /* OS */
> +		} else {
> +			par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
> +					  MEMATTR(WbRaWa, WbRaWa));
> +			par |= FIELD_PREP(SYS_PAR_EL1_SH, 0b00); /* NS */
> +		}
> +	} else {
> +		u64 mair, sctlr;
> +		int el;
> +		u8 sh;
> +
> +		el = (vcpu_el2_e2h_is_set(vcpu) &&
> +		      vcpu_el2_tge_is_set(vcpu)) ? 2 : 1;
> +
> +		mair = ((el == 2) ?
> +			vcpu_read_sys_reg(vcpu, MAIR_EL2) :
> +			vcpu_read_sys_reg(vcpu, MAIR_EL1));
> +
> +		mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
> +		mair &= 0xff;
> +
> +		sctlr = ((el == 2) ?
> +			vcpu_read_sys_reg(vcpu, SCTLR_EL2) :
> +			vcpu_read_sys_reg(vcpu, SCTLR_EL1));
> +
> +		/* Force NC for memory if SCTLR_ELx.C is clear */
> +		if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair))
> +			mair = MEMATTR(NC, NC);
> +
> +		par  = FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
> +		par |= wr->pa & GENMASK_ULL(47, 12);
> +
> +		sh = compute_sh(mair, wr->desc);
> +		par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
> +	}
> +
> +	return par;
> +}
> +
> +static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
> +{
> +	bool perm_fail, ur, uw, ux, pr, pw, pan;
> +	struct s1_walk_result wr = {};
> +	struct s1_walk_info wi = {};
> +	int ret, idx, el;
> +
> +	/*
> +	 * We only get here from guest EL2, so the translation regime
> +	 * AT applies to is solely defined by {E2H,TGE}.
> +	 */
> +	el = (vcpu_el2_e2h_is_set(vcpu) &&
> +	      vcpu_el2_tge_is_set(vcpu)) ? 2 : 1;
> +
> +	ret = setup_s1_walk(vcpu, &wi, &wr, vaddr, el);
> +	if (ret)
> +		goto compute_par;
> +
> +	if (wr.level == S1_MMU_DISABLED)
> +		goto compute_par;
> +
> +	idx = srcu_read_lock(&vcpu->kvm->srcu);
> +
> +	ret = walk_s1(vcpu, &wi, &wr, vaddr);
> +
> +	srcu_read_unlock(&vcpu->kvm->srcu, idx);
> +
> +	if (ret)
> +		goto compute_par;
> +
> +	/* FIXME: revisit when adding indirect permission support */
> +	if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3) &&
> +	    !wi.nvhe) {
> +		u64 sctlr;
> +
> +		if (el == 1)
> +			sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
> +		else
> +			sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
> +
> +		ux = (sctlr & SCTLR_EL1_EPAN) && !(wr.desc & PTE_UXN);
> +	} else {
> +		ux = false;
> +	}
> +
> +	pw = !(wr.desc & PTE_RDONLY);
> +
> +	if (wi.nvhe) {
> +		ur = uw = false;
> +		pr = true;
> +	} else {
> +		if (wr.desc & PTE_USER) {
> +			ur = pr = true;
> +			uw = pw;
> +		} else {
> +			ur = uw = false;
> +			pr = true;
> +		}
> +	}
> +
> +	/* Apply the Hierarchical Permission madness */
> +	if (wi.nvhe) {
> +		wr.APTable &= BIT(1);
> +		wr.PXNTable = wr.UXNTable;
> +	}
> +
> +	ur &= !(wr.APTable & BIT(0));
> +	uw &= !(wr.APTable != 0);
> +	ux &= !wr.UXNTable;
> +
> +	pw &= !(wr.APTable & BIT(1));
> +
> +	pan = *vcpu_cpsr(vcpu) & PSR_PAN_BIT;
> +
> +	perm_fail = false;
> +
> +	switch (op) {
> +	case OP_AT_S1E1RP:
> +		perm_fail |= pan && (ur || uw || ux);
> +		fallthrough;
> +	case OP_AT_S1E1R:
> +	case OP_AT_S1E2R:
> +		perm_fail |= !pr;
> +		break;
> +	case OP_AT_S1E1WP:
> +		perm_fail |= pan && (ur || uw || ux);
> +		fallthrough;
> +	case OP_AT_S1E1W:
> +	case OP_AT_S1E2W:
> +		perm_fail |= !pw;
> +		break;
> +	case OP_AT_S1E0R:
> +		perm_fail |= !ur;
> +		break;
> +	case OP_AT_S1E0W:
> +		perm_fail |= !uw;
> +		break;
> +	default:
> +		BUG();
> +	}
> +
> +	if (perm_fail) {
> +		struct s1_walk_result tmp;
> +
> +		tmp.failed = true;
> +		tmp.fst = ESR_ELx_FSC_PERM | wr.level;
> +		tmp.s2 = false;
> +		tmp.ptw = false;
> +
> +		wr = tmp;
> +	}
> +
> +compute_par:
> +	return compute_par_s1(vcpu, &wr);
> +}
> +
>  static bool check_at_pan(struct kvm_vcpu *vcpu, u64 vaddr, u64 *res)
>  {
>  	u64 par_e0;
> @@ -266,9 +733,11 @@ void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
>  	struct mmu_config config;
>  	struct kvm_s2_mmu *mmu;
>  	unsigned long flags;
> -	bool fail;
> +	bool fail, retry_slow;
>  	u64 par;
>  
> +	retry_slow = false;
> +
>  	write_lock(&vcpu->kvm->mmu_lock);
>  
>  	/*
> @@ -288,14 +757,15 @@ void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
>  		goto skip_mmu_switch;
>  
>  	/*
> -	 * FIXME: Obtaining the S2 MMU for a L2 is horribly racy, and
> -	 * we may not find it (recycled by another vcpu, for example).
> -	 * See the other FIXME comment below about the need for a SW
> -	 * PTW in this case.
> +	 * Obtaining the S2 MMU for an L2 is horribly racy, and we may not
> +	 * find it (recycled by another vcpu, for example). When this
> +	 * happens, use the SW (slow) path.
>  	 */
>  	mmu = lookup_s2_mmu(vcpu);
> -	if (WARN_ON(!mmu))
> +	if (!mmu) {
> +		retry_slow = true;
>  		goto out;
> +	}
>  
>  	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1),	SYS_TTBR0);
>  	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1),	SYS_TTBR1);
> @@ -331,18 +801,17 @@ void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
>  	}
>  
>  	if (!fail)
> -		par = read_sysreg(par_el1);
> +		par = read_sysreg_par();
>  	else
>  		par = SYS_PAR_EL1_F;
>  
> +	retry_slow = !fail;
> +
>  	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
>  
>  	/*
> -	 * Failed? let's leave the building now.
> -	 *
> -	 * FIXME: how about a failed translation because the shadow S2
> -	 * wasn't populated? We may need to perform a SW PTW,
> -	 * populating our shadow S2 and retry the instruction.
> +	 * Failed? let's leave the building now, unless we retry on
> +	 * the slow path.
>  	 */
>  	if (par & SYS_PAR_EL1_F)
>  		goto nopan;
> @@ -354,29 +823,58 @@ void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
>  	switch (op) {
>  	case OP_AT_S1E1RP:
>  	case OP_AT_S1E1WP:
> +		retry_slow = false;
>  		fail = check_at_pan(vcpu, vaddr, &par);
>  		break;
>  	default:
>  		goto nopan;
>  	}
>  
> +	if (fail) {
> +		vcpu_write_sys_reg(vcpu, SYS_PAR_EL1_F, PAR_EL1);
> +		goto nopan;
> +	}
> +
>  	/*
>  	 * If the EL0 translation has succeeded, we need to pretend
>  	 * the AT operation has failed, as the PAN setting forbids
>  	 * such a translation.
> -	 *
> -	 * FIXME: we hardcode a Level-3 permission fault. We really
> -	 * should return the real fault level.
>  	 */
> -	if (fail || !(par & SYS_PAR_EL1_F))
> -		vcpu_write_sys_reg(vcpu, (0xf << 1) | SYS_PAR_EL1_F, PAR_EL1);
> -
> +	if (par & SYS_PAR_EL1_F) {
> +		u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
> +
> +		/*
> +		 * If we get something other than a permission fault, we
> +		 * need to retry, as we're likely to have missed in the PTs.
> +		 */
> +		if ((fst & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_PERM)
> +			retry_slow = true;
> +	} else {
> +		/*
> +		 * The EL0 access succeeded, but we don't have the full
> +		 * syndrome information to synthesize the failure. Go slow.
> +		 */
> +		retry_slow = true;
> +	}
>  nopan:
>  	__mmu_config_restore(&config);
>  out:
>  	local_irq_restore(flags);
>  
>  	write_unlock(&vcpu->kvm->mmu_lock);
> +
> +	/*
> +	 * If retry_slow is true, then we either are missing shadow S2
> +	 * entries, have paged out guest S1, or something is inconsistent.
> +	 *
> +	 * Either way, we need to walk the PTs by hand so that we can either
> +	 * fault things back in, or record accurate fault information along
> +	 * the way.
> +	 */
> +	if (retry_slow) {
> +		par = handle_at_slow(vcpu, op, vaddr);
> +		vcpu_write_sys_reg(vcpu, par, PAR_EL1);
> +	}
>  }
>  
>  void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
> @@ -433,6 +931,10 @@ void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
>  
>  	write_unlock(&vcpu->kvm->mmu_lock);
>  
> +	/* We failed the translation, let's replay it in slow motion */
> +	if (!fail && (par & SYS_PAR_EL1_F))
> +		par = handle_at_slow(vcpu, op, vaddr);
> +
>  	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
>  }
>  
> -- 
> 2.39.2
> 
>
Marc Zyngier July 31, 2024, 3:43 p.m. UTC | #22
On Wed, 31 Jul 2024 15:33:25 +0100,
Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> 
> Hi Marc,
> 
> On Mon, Jul 08, 2024 at 05:57:58PM +0100, Marc Zyngier wrote:
> > In order to plug the brokenness of our current AT implementation,
> > we need a SW walker that is going to... err.. walk the S1 tables
> > and tell us what it finds.
> > 
> > Of course, it builds on top of our S2 walker, and share similar
> > concepts. The beauty of it is that since it uses kvm_read_guest(),
> > it is able to bring back pages that have been otherwise evicted.
> > 
> > This is then plugged in the two AT S1 emulation functions as
> > a "slow path" fallback. I'm not sure it is that slow, but hey.
> > 
> > Signed-off-by: Marc Zyngier <maz@kernel.org>
> > ---
> >  arch/arm64/kvm/at.c | 538 ++++++++++++++++++++++++++++++++++++++++++--
> >  1 file changed, 520 insertions(+), 18 deletions(-)
> > 
> > diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> > index 71e3390b43b4c..8452273cbff6d 100644
> > --- a/arch/arm64/kvm/at.c
> > +++ b/arch/arm64/kvm/at.c
> > @@ -4,9 +4,305 @@
> >   * Author: Jintack Lim <jintack.lim@linaro.org>
> >   */
> >  
> > +#include <linux/kvm_host.h>
> > +
> > +#include <asm/esr.h>
> >  #include <asm/kvm_hyp.h>
> >  #include <asm/kvm_mmu.h>
> >  
> > +struct s1_walk_info {
> > +	u64	     baddr;
> > +	unsigned int max_oa_bits;
> > +	unsigned int pgshift;
> > +	unsigned int txsz;
> > +	int 	     sl;
> > +	bool	     hpd;
> > +	bool	     be;
> > +	bool	     nvhe;
> > +	bool	     s2;
> > +};
> > +
> > +struct s1_walk_result {
> > +	union {
> > +		struct {
> > +			u64	desc;
> > +			u64	pa;
> > +			s8	level;
> > +			u8	APTable;
> > +			bool	UXNTable;
> > +			bool	PXNTable;
> > +		};
> > +		struct {
> > +			u8	fst;
> > +			bool	ptw;
> > +			bool	s2;
> > +		};
> > +	};
> > +	bool	failed;
> > +};
> > +
> > +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2)
> > +{
> > +	wr->fst		= fst;
> > +	wr->ptw		= ptw;
> > +	wr->s2		= s2;
> > +	wr->failed	= true;
> > +}
> > +
> > +#define S1_MMU_DISABLED		(-127)
> > +
> > +static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
> > +			 struct s1_walk_result *wr, const u64 va, const int el)
> > +{
> > +	u64 sctlr, tcr, tg, ps, ia_bits, ttbr;
> > +	unsigned int stride, x;
> > +	bool va55, tbi;
> > +
> > +	wi->nvhe = el == 2 && !vcpu_el2_e2h_is_set(vcpu);
> > +
> > +	va55 = va & BIT(55);
> > +
> > +	if (wi->nvhe && va55)
> > +		goto addrsz;
> > +
> > +	wi->s2 = el < 2 && (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_VM);
> > +
> > +	switch (el) {
> > +	case 1:
> > +		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL1);
> > +		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL1);
> > +		ttbr	= (va55 ?
> > +			   vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
> > +			   vcpu_read_sys_reg(vcpu, TTBR0_EL1));
> > +		break;
> > +	case 2:
> > +		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL2);
> > +		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL2);
> > +		ttbr	= (va55 ?
> > +			   vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
> > +			   vcpu_read_sys_reg(vcpu, TTBR0_EL2));
> > +		break;
> > +	default:
> > +		BUG();
> > +	}
> > +
> > +	/* Let's put the MMU disabled case aside immediately */
> > +	if (!(sctlr & SCTLR_ELx_M) ||
> > +	    (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
> > +		if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
> 
> As far as I can tell, if TBI, the pseudocode ignores bits 63:56 when checking
> for out-of-bounds VA for the MMU disabled case (above) and the MMU enabled case
> (below). That also matches the description of TBIx bits in the TCR_ELx
> registers.

Right. Then the check needs to be hoisted up and the VA sanitised
before we compare it to anything.
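
(For illustration, one shape this could take, with the tbi lookup hoisted
and the sanitised value kept in a local, since va is const in this version:

	tbi = (wi->nvhe ?
	       FIELD_GET(TCR_EL2_TBI, tcr) :
	       (va55 ?
		FIELD_GET(TCR_TBI1, tcr) :
		FIELD_GET(TCR_TBI0, tcr)));

	/* Reject non-canonical addresses only when TBI doesn't apply */
	if (!tbi && sign_extend64(va, 55) != (s64)va)
		goto addrsz;

	/* Sketch only: compare against the tag-stripped address from here on */
	u64 cva = (u64)sign_extend64(va, 55);

with both the MMU-disabled and MMU-enabled checks then using cva instead of
va.)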

Thanks for all your review comments, but I am going to ask you to stop
here. You are reviewing a pretty old code base, and although I'm sure
you look at what is in my tree, I'd really like to post a new version
for everyone to enjoy.

I'll stash that last change on top and post the result.

	M.
Alexandru Elisei July 31, 2024, 4:05 p.m. UTC | #23
Hi Marc,

On Wed, Jul 31, 2024 at 04:43:16PM +0100, Marc Zyngier wrote:
> On Wed, 31 Jul 2024 15:33:25 +0100,
> Alexandru Elisei <alexandru.elisei@arm.com> wrote:
> > 
> > Hi Marc,
> > 
> > On Mon, Jul 08, 2024 at 05:57:58PM +0100, Marc Zyngier wrote:
> > > In order to plug the brokenness of our current AT implementation,
> > > we need a SW walker that is going to... err.. walk the S1 tables
> > > and tell us what it finds.
> > > 
> > > Of course, it builds on top of our S2 walker, and share similar
> > > concepts. The beauty of it is that since it uses kvm_read_guest(),
> > > it is able to bring back pages that have been otherwise evicted.
> > > 
> > > This is then plugged in the two AT S1 emulation functions as
> > > a "slow path" fallback. I'm not sure it is that slow, but hey.
> > > 
> > > Signed-off-by: Marc Zyngier <maz@kernel.org>
> > > ---
> > >  arch/arm64/kvm/at.c | 538 ++++++++++++++++++++++++++++++++++++++++++--
> > >  1 file changed, 520 insertions(+), 18 deletions(-)
> > > 
> > > diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> > > index 71e3390b43b4c..8452273cbff6d 100644
> > > --- a/arch/arm64/kvm/at.c
> > > +++ b/arch/arm64/kvm/at.c
> > > @@ -4,9 +4,305 @@
> > >   * Author: Jintack Lim <jintack.lim@linaro.org>
> > >   */
> > >  
> > > +#include <linux/kvm_host.h>
> > > +
> > > +#include <asm/esr.h>
> > >  #include <asm/kvm_hyp.h>
> > >  #include <asm/kvm_mmu.h>
> > >  
> > > +struct s1_walk_info {
> > > +	u64	     baddr;
> > > +	unsigned int max_oa_bits;
> > > +	unsigned int pgshift;
> > > +	unsigned int txsz;
> > > +	int 	     sl;
> > > +	bool	     hpd;
> > > +	bool	     be;
> > > +	bool	     nvhe;
> > > +	bool	     s2;
> > > +};
> > > +
> > > +struct s1_walk_result {
> > > +	union {
> > > +		struct {
> > > +			u64	desc;
> > > +			u64	pa;
> > > +			s8	level;
> > > +			u8	APTable;
> > > +			bool	UXNTable;
> > > +			bool	PXNTable;
> > > +		};
> > > +		struct {
> > > +			u8	fst;
> > > +			bool	ptw;
> > > +			bool	s2;
> > > +		};
> > > +	};
> > > +	bool	failed;
> > > +};
> > > +
> > > +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2)
> > > +{
> > > +	wr->fst		= fst;
> > > +	wr->ptw		= ptw;
> > > +	wr->s2		= s2;
> > > +	wr->failed	= true;
> > > +}
> > > +
> > > +#define S1_MMU_DISABLED		(-127)
> > > +
> > > +static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
> > > +			 struct s1_walk_result *wr, const u64 va, const int el)
> > > +{
> > > +	u64 sctlr, tcr, tg, ps, ia_bits, ttbr;
> > > +	unsigned int stride, x;
> > > +	bool va55, tbi;
> > > +
> > > +	wi->nvhe = el == 2 && !vcpu_el2_e2h_is_set(vcpu);
> > > +
> > > +	va55 = va & BIT(55);
> > > +
> > > +	if (wi->nvhe && va55)
> > > +		goto addrsz;
> > > +
> > > +	wi->s2 = el < 2 && (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_VM);
> > > +
> > > +	switch (el) {
> > > +	case 1:
> > > +		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL1);
> > > +		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL1);
> > > +		ttbr	= (va55 ?
> > > +			   vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
> > > +			   vcpu_read_sys_reg(vcpu, TTBR0_EL1));
> > > +		break;
> > > +	case 2:
> > > +		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL2);
> > > +		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL2);
> > > +		ttbr	= (va55 ?
> > > +			   vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
> > > +			   vcpu_read_sys_reg(vcpu, TTBR0_EL2));
> > > +		break;
> > > +	default:
> > > +		BUG();
> > > +	}
> > > +
> > > +	/* Let's put the MMU disabled case aside immediately */
> > > +	if (!(sctlr & SCTLR_ELx_M) ||
> > > +	    (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
> > > +		if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
> > 
> > As far as I can tell, if TBI, the pseudocode ignores bits 63:56 when checking
> > for out-of-bounds VA for the MMU disabled case (above) and the MMU enabled case
> > (below). That also matches the description of TBIx bits in the TCR_ELx
> > registers.
> 
> Right. Then the check needs to be hoisted up and the VA sanitised
> before we compare it to anything.
> 
> Thanks for all your review comments, but I am going to ask you to stop
> here. You are reviewing a pretty old code base, and although I'm sure
> you look at what is in my tree, I'd really like to post a new version
> for everyone to enjoy.

Got it.

Thanks,
Alex
diff mbox series

Patch

diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index 71e3390b43b4c..8452273cbff6d 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -4,9 +4,305 @@ 
  * Author: Jintack Lim <jintack.lim@linaro.org>
  */
 
+#include <linux/kvm_host.h>
+
+#include <asm/esr.h>
 #include <asm/kvm_hyp.h>
 #include <asm/kvm_mmu.h>
 
+struct s1_walk_info {
+	u64	     baddr;
+	unsigned int max_oa_bits;
+	unsigned int pgshift;
+	unsigned int txsz;
+	int 	     sl;
+	bool	     hpd;
+	bool	     be;
+	bool	     nvhe;
+	bool	     s2;
+};
+
+struct s1_walk_result {
+	union {
+		struct {
+			u64	desc;
+			u64	pa;
+			s8	level;
+			u8	APTable;
+			bool	UXNTable;
+			bool	PXNTable;
+		};
+		struct {
+			u8	fst;
+			bool	ptw;
+			bool	s2;
+		};
+	};
+	bool	failed;
+};
+
+static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2)
+{
+	wr->fst		= fst;
+	wr->ptw		= ptw;
+	wr->s2		= s2;
+	wr->failed	= true;
+}
+
+#define S1_MMU_DISABLED		(-127)
+
+static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
+			 struct s1_walk_result *wr, const u64 va, const int el)
+{
+	u64 sctlr, tcr, tg, ps, ia_bits, ttbr;
+	unsigned int stride, x;
+	bool va55, tbi;
+
+	wi->nvhe = el == 2 && !vcpu_el2_e2h_is_set(vcpu);
+
+	va55 = va & BIT(55);
+
+	if (wi->nvhe && va55)
+		goto addrsz;
+
+	wi->s2 = el < 2 && (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_VM);
+
+	switch (el) {
+	case 1:
+		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL1);
+		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL1);
+		ttbr	= (va55 ?
+			   vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
+			   vcpu_read_sys_reg(vcpu, TTBR0_EL1));
+		break;
+	case 2:
+		sctlr	= vcpu_read_sys_reg(vcpu, SCTLR_EL2);
+		tcr	= vcpu_read_sys_reg(vcpu, TCR_EL2);
+		ttbr	= (va55 ?
+			   vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
+			   vcpu_read_sys_reg(vcpu, TTBR0_EL2));
+		break;
+	default:
+		BUG();
+	}
+
+	/* Let's put the MMU disabled case aside immediately */
+	if (!(sctlr & SCTLR_ELx_M) ||
+	    (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
+		if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
+			goto addrsz;
+
+		wr->level = S1_MMU_DISABLED;
+		wr->desc = va;
+		return 0;
+	}
+
+	wi->be = sctlr & SCTLR_ELx_EE;
+
+	wi->hpd  = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP);
+	wi->hpd &= (wi->nvhe ?
+		    FIELD_GET(TCR_EL2_HPD, tcr) :
+		    (va55 ?
+		     FIELD_GET(TCR_HPD1, tcr) :
+		     FIELD_GET(TCR_HPD0, tcr)));
+
+	tbi = (wi->nvhe ?
+	       FIELD_GET(TCR_EL2_TBI, tcr) :
+	       (va55 ?
+		FIELD_GET(TCR_TBI1, tcr) :
+		FIELD_GET(TCR_TBI0, tcr)));
+
+	if (!tbi && sign_extend64(va, 55) != (s64)va)
+		goto addrsz;
+
+	/* Someone was silly enough to encode TG0/TG1 differently */
+	if (va55) {
+		wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
+		tg = FIELD_GET(TCR_TG1_MASK, tcr);
+
+		switch (tg << TCR_TG1_SHIFT) {
+		case TCR_TG1_4K:
+			wi->pgshift = 12;	 break;
+		case TCR_TG1_16K:
+			wi->pgshift = 14;	 break;
+		case TCR_TG1_64K:
+		default:	    /* IMPDEF: treat any other value as 64k */
+			wi->pgshift = 16;	 break;
+		}
+	} else {
+		wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
+		tg = FIELD_GET(TCR_TG0_MASK, tcr);
+
+		switch (tg << TCR_TG0_SHIFT) {
+		case TCR_TG0_4K:
+			wi->pgshift = 12;	 break;
+		case TCR_TG0_16K:
+			wi->pgshift = 14;	 break;
+		case TCR_TG0_64K:
+		default:	    /* IMPDEF: treat any other value as 64k */
+			wi->pgshift = 16;	 break;
+		}
+	}
+
+	ia_bits = 64 - wi->txsz;
+
+	/* AArch64.S1StartLevel() */
+	stride = wi->pgshift - 3;
+	wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);
+
+	/* Check for SL mandating LPA2 (which we don't support yet) */
+	switch (BIT(wi->pgshift)) {
+	case SZ_4K:
+		if (wi->sl == -1 &&
+		    !kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT))
+			goto addrsz;
+		break;
+	case SZ_16K:
+		if (wi->sl == 0 &&
+		    !kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT))
+			goto addrsz;
+		break;
+	}
+
+	ps = (wi->nvhe ?
+	      FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));
+
+	wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps));
+
+	/* Compute minimal alignment */
+	x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);
+
+	wi->baddr = ttbr & TTBRx_EL1_BADDR;
+	wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);
+
+	return 0;
+
+addrsz:	/* Address Size Fault level 0 */
+	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ, false, false);
+
+	return -EFAULT;
+}
+
+static int get_ia_size(struct s1_walk_info *wi)
+{
+	return 64 - wi->txsz;
+}
+
+static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
+		   struct s1_walk_result *wr, u64 va)
+{
+	u64 va_top, va_bottom, baddr, desc;
+	int level, stride, ret;
+
+	level = wi->sl;
+	stride = wi->pgshift - 3;
+	baddr = wi->baddr;
+
+	va_top = get_ia_size(wi) - 1;
+
+	while (1) {
+		u64 index, ipa;
+
+		va_bottom = (3 - level) * stride + wi->pgshift;
+		index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);
+
+		ipa = baddr | index;
+
+		if (wi->s2) {
+			struct kvm_s2_trans s2_trans = {};
+
+			ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans);
+			if (ret) {
+				fail_s1_walk(wr,
+					     (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level,
+					     true, true);
+				return ret;
+			}
+
+			if (!kvm_s2_trans_readable(&s2_trans)) {
+				fail_s1_walk(wr, ESR_ELx_FSC_PERM | level,
+					     true, true);
+
+				return -EPERM;
+			}
+
+			ipa = kvm_s2_trans_output(&s2_trans);
+		}
+
+		ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc));
+		if (ret) {
+			fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level),
+				     true, false);
+			return ret;
+		}
+
+		if (wi->be)
+			desc = be64_to_cpu((__force __be64)desc);
+		else
+			desc = le64_to_cpu((__force __le64)desc);
+
+		if (!(desc & 1) || ((desc & 3) == 1 && level == 3)) {
+			fail_s1_walk(wr, ESR_ELx_FSC_FAULT | level,
+				     true, false);
+			return -ENOENT;
+		}
+
+		/* We found a leaf, handle that */
+		if ((desc & 3) == 1 || level == 3)
+			break;
+
+		if (!wi->hpd) {
+			wr->APTable  |= FIELD_GET(PMD_TABLE_AP, desc);
+			wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);
+			wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc);
+		}
+
+		baddr = desc & GENMASK_ULL(47, wi->pgshift);
+
+		/* Check for out-of-range OA */
+		if (wi->max_oa_bits < 48 &&
+		    (baddr & GENMASK_ULL(47, wi->max_oa_bits))) {
+			fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ | level,
+				     true, false);
+			return -EINVAL;
+		}
+
+		/* Prepare for next round */
+		va_top = va_bottom - 1;
+		level++;
+	}
+
+	/* Block mapping, check the validity of the level */
+	if (!(desc & BIT(1))) {
+		bool valid_block = false;
+
+		switch (BIT(wi->pgshift)) {
+		case SZ_4K:
+			valid_block = level == 1 || level == 2;
+			break;
+		case SZ_16K:
+		case SZ_64K:
+			valid_block = level == 2;
+			break;
+		}
+
+		if (!valid_block) {
+			fail_s1_walk(wr, ESR_ELx_FSC_FAULT | level,
+				     true, false);
+			return -EINVAL;
+		}
+	}
+
+	wr->failed = false;
+	wr->level = level;
+	wr->desc = desc;
+	wr->pa = desc & GENMASK(47, va_bottom);
+	if (va_bottom > 12)
+		wr->pa |= va & GENMASK_ULL(va_bottom - 1, 12);
+
+	return 0;
+}
+
 struct mmu_config {
 	u64	ttbr0;
 	u64	ttbr1;
@@ -234,6 +530,177 @@  static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par,
 	return par;
 }
 
+static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr)
+{
+	u64 par;
+
+	if (wr->failed) {
+		par = SYS_PAR_EL1_RES1;
+		par |= SYS_PAR_EL1_F;
+		par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst);
+		par |= wr->ptw ? SYS_PAR_EL1_PTW : 0;
+		par |= wr->s2 ? SYS_PAR_EL1_S : 0;
+	} else if (wr->level == S1_MMU_DISABLED) {
+		/* MMU off or HCR_EL2.DC == 1 */
+		par = wr->pa & GENMASK_ULL(47, 12);
+
+		if (!(__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) {
+			par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */
+			par |= FIELD_PREP(SYS_PAR_EL1_SH, 0b10); /* OS */
+		} else {
+			par |= FIELD_PREP(SYS_PAR_EL1_ATTR,
+					  MEMATTR(WbRaWa, WbRaWa));
+			par |= FIELD_PREP(SYS_PAR_EL1_SH, 0b00); /* NS */
+		}
+	} else {
+		u64 mair, sctlr;
+		int el;
+		u8 sh;
+
+		el = (vcpu_el2_e2h_is_set(vcpu) &&
+		      vcpu_el2_tge_is_set(vcpu)) ? 2 : 1;
+
+		mair = ((el == 2) ?
+			vcpu_read_sys_reg(vcpu, MAIR_EL2) :
+			vcpu_read_sys_reg(vcpu, MAIR_EL1));
+
+		mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8;
+		mair &= 0xff;
+
+		sctlr = ((el == 2) ?
+			vcpu_read_sys_reg(vcpu, SCTLR_EL2) :
+			vcpu_read_sys_reg(vcpu, SCTLR_EL1));
+
+		/* Force NC for memory if SCTLR_ELx.C is clear */
+		if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair))
+			mair = MEMATTR(NC, NC);
+
+		par  = FIELD_PREP(SYS_PAR_EL1_ATTR, mair);
+		par |= wr->pa & GENMASK_ULL(47, 12);
+
+		sh = compute_sh(mair, wr->desc);
+		par |= FIELD_PREP(SYS_PAR_EL1_SH, sh);
+	}
+
+	return par;
+}
+
+static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+{
+	bool perm_fail, ur, uw, ux, pr, pw, pan;
+	struct s1_walk_result wr = {};
+	struct s1_walk_info wi = {};
+	int ret, idx, el;
+
+	/*
+	 * We only get here from guest EL2, so the translation regime
+	 * AT applies to is solely defined by {E2H,TGE}.
+	 */
+	el = (vcpu_el2_e2h_is_set(vcpu) &&
+	      vcpu_el2_tge_is_set(vcpu)) ? 2 : 1;
+
+	ret = setup_s1_walk(vcpu, &wi, &wr, vaddr, el);
+	if (ret)
+		goto compute_par;
+
+	if (wr.level == S1_MMU_DISABLED)
+		goto compute_par;
+
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+	ret = walk_s1(vcpu, &wi, &wr, vaddr);
+
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+	if (ret)
+		goto compute_par;
+
+	/* FIXME: revisit when adding indirect permission support */
+	if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3) &&
+	    !wi.nvhe) {
+		u64 sctlr;
+
+		if (el == 1)
+			sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
+		else
+			sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
+
+		ux = (sctlr & SCTLR_EL1_EPAN) && !(wr.desc & PTE_UXN);
+	} else {
+		ux = false;
+	}
+
+	pw = !(wr.desc & PTE_RDONLY);
+
+	if (wi.nvhe) {
+		ur = uw = false;
+		pr = true;
+	} else {
+		if (wr.desc & PTE_USER) {
+			ur = pr = true;
+			uw = pw;
+		} else {
+			ur = uw = false;
+			pr = true;
+		}
+	}
+
+	/* Apply the Hierarchical Permission madness */
+	if (wi.nvhe) {
+		wr.APTable &= BIT(1);
+		wr.PXNTable = wr.UXNTable;
+	}
+
+	ur &= !(wr.APTable & BIT(0));
+	uw &= !(wr.APTable != 0);
+	ux &= !wr.UXNTable;
+
+	pw &= !(wr.APTable & BIT(1));
+
+	pan = *vcpu_cpsr(vcpu) & PSR_PAN_BIT;
+
+	perm_fail = false;
+
+	switch (op) {
+	case OP_AT_S1E1RP:
+		perm_fail |= pan && (ur || uw || ux);
+		fallthrough;
+	case OP_AT_S1E1R:
+	case OP_AT_S1E2R:
+		perm_fail |= !pr;
+		break;
+	case OP_AT_S1E1WP:
+		perm_fail |= pan && (ur || uw || ux);
+		fallthrough;
+	case OP_AT_S1E1W:
+	case OP_AT_S1E2W:
+		perm_fail |= !pw;
+		break;
+	case OP_AT_S1E0R:
+		perm_fail |= !ur;
+		break;
+	case OP_AT_S1E0W:
+		perm_fail |= !uw;
+		break;
+	default:
+		BUG();
+	}
+
+	if (perm_fail) {
+		struct s1_walk_result tmp;
+
+		tmp.failed = true;
+		tmp.fst = ESR_ELx_FSC_PERM | wr.level;
+		tmp.s2 = false;
+		tmp.ptw = false;
+
+		wr = tmp;
+	}
+
+compute_par:
+	return compute_par_s1(vcpu, &wr);
+}
+
 static bool check_at_pan(struct kvm_vcpu *vcpu, u64 vaddr, u64 *res)
 {
 	u64 par_e0;
@@ -266,9 +733,11 @@  void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
 	struct mmu_config config;
 	struct kvm_s2_mmu *mmu;
 	unsigned long flags;
-	bool fail;
+	bool fail, retry_slow;
 	u64 par;
 
+	retry_slow = false;
+
 	write_lock(&vcpu->kvm->mmu_lock);
 
 	/*
@@ -288,14 +757,15 @@  void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
 		goto skip_mmu_switch;
 
 	/*
-	 * FIXME: Obtaining the S2 MMU for a L2 is horribly racy, and
-	 * we may not find it (recycled by another vcpu, for example).
-	 * See the other FIXME comment below about the need for a SW
-	 * PTW in this case.
+	 * Obtaining the S2 MMU for an L2 is horribly racy, and we may not
+	 * find it (recycled by another vcpu, for example). When this
+	 * happens, use the SW (slow) path.
 	 */
 	mmu = lookup_s2_mmu(vcpu);
-	if (WARN_ON(!mmu))
+	if (!mmu) {
+		retry_slow = true;
 		goto out;
+	}
 
 	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1),	SYS_TTBR0);
 	write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1),	SYS_TTBR1);
@@ -331,18 +801,17 @@  void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
 	}
 
 	if (!fail)
-		par = read_sysreg(par_el1);
+		par = read_sysreg_par();
 	else
 		par = SYS_PAR_EL1_F;
 
+	retry_slow = !fail;
+
 	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
 
 	/*
-	 * Failed? let's leave the building now.
-	 *
-	 * FIXME: how about a failed translation because the shadow S2
-	 * wasn't populated? We may need to perform a SW PTW,
-	 * populating our shadow S2 and retry the instruction.
+	 * Failed? let's leave the building now, unless we retry on
+	 * the slow path.
 	 */
 	if (par & SYS_PAR_EL1_F)
 		goto nopan;
@@ -354,29 +823,58 @@  void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
 	switch (op) {
 	case OP_AT_S1E1RP:
 	case OP_AT_S1E1WP:
+		retry_slow = false;
 		fail = check_at_pan(vcpu, vaddr, &par);
 		break;
 	default:
 		goto nopan;
 	}
 
+	if (fail) {
+		vcpu_write_sys_reg(vcpu, SYS_PAR_EL1_F, PAR_EL1);
+		goto nopan;
+	}
+
 	/*
 	 * If the EL0 translation has succeeded, we need to pretend
 	 * the AT operation has failed, as the PAN setting forbids
 	 * such a translation.
-	 *
-	 * FIXME: we hardcode a Level-3 permission fault. We really
-	 * should return the real fault level.
 	 */
-	if (fail || !(par & SYS_PAR_EL1_F))
-		vcpu_write_sys_reg(vcpu, (0xf << 1) | SYS_PAR_EL1_F, PAR_EL1);
-
+	if (par & SYS_PAR_EL1_F) {
+		u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
+
+		/*
+		 * If we get something other than a permission fault, we
+		 * need to retry, as we're likely to have missed in the PTs.
+		 */
+		if ((fst & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_PERM)
+			retry_slow = true;
+	} else {
+		/*
+		 * The EL0 access succeeded, but we don't have the full
+		 * syndrome information to synthesize the failure. Go slow.
+		 */
+		retry_slow = true;
+	}
 nopan:
 	__mmu_config_restore(&config);
 out:
 	local_irq_restore(flags);
 
 	write_unlock(&vcpu->kvm->mmu_lock);
+
+	/*
+	 * If retry_slow is true, then we either are missing shadow S2
+	 * entries, have paged out guest S1, or something is inconsistent.
+	 *
+	 * Either way, we need to walk the PTs by hand so that we can either
+	 * fault things back in, or record accurate fault information along
+	 * the way.
+	 */
+	if (retry_slow) {
+		par = handle_at_slow(vcpu, op, vaddr);
+		vcpu_write_sys_reg(vcpu, par, PAR_EL1);
+	}
 }
 
 void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
@@ -433,6 +931,10 @@  void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
 
 	write_unlock(&vcpu->kvm->mmu_lock);
 
+	/* We failed the translation, let's replay it in slow motion */
+	if (!fail && (par & SYS_PAR_EL1_F))
+		par = handle_at_slow(vcpu, op, vaddr);
+
 	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
 }