diff mbox series

[08/18] KVM: arm64: nv: Handle HCR_EL2.NV system register traps

Message ID 20230209175820.1939006-9-maz@kernel.org (mailing list archive)
State New, archived
Headers show
Series KVM: arm64: Prefix patches for NV support | expand

Commit Message

Marc Zyngier Feb. 9, 2023, 5:58 p.m. UTC
From: Jintack Lim <jintack.lim@linaro.org>

ARM v8.3 introduces a new bit in the HCR_EL2, which is the NV bit. When
this bit is set, accessing EL2 registers in EL1 traps to EL2. In
addition, executing the following instructions in EL1 will trap to EL2:
tlbi, at, eret, and msr/mrs instructions to access SP_EL1. Most of the
instructions that trap to EL2 with the NV bit were undef at EL1 prior to
ARM v8.3. The only instruction that was not undef is eret.

This patch sets up a handler for EL2 registers and SP_EL1 register
accesses at EL1. The host hypervisor keeps those register values in
memory, and will emulate their behavior.

This patch doesn't set the NV bit yet. It will be set in a later patch
once nested virtualization support is completed.

Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: Jintack Lim <jintack.lim@linaro.org>
[maz: EL2_REG() macros]
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/sysreg.h | 38 ++++++++++++-
 arch/arm64/kvm/sys_regs.c       | 99 +++++++++++++++++++++++++++++++--
 2 files changed, 131 insertions(+), 6 deletions(-)

Comments

Joey Gouly Feb. 24, 2023, 5:39 p.m. UTC | #1
Hi Marc,

On Thu, Feb 09, 2023 at 05:58:10PM +0000, Marc Zyngier wrote:
> From: Jintack Lim <jintack.lim@linaro.org>
> 
> ARM v8.3 introduces a new bit in the HCR_EL2, which is the NV bit. When
> this bit is set, accessing EL2 registers in EL1 traps to EL2. In
> addition, executing the following instructions in EL1 will trap to EL2:
> tlbi, at, eret, and msr/mrs instructions to access SP_EL1. Most of the
> instructions that trap to EL2 with the NV bit were undef at EL1 prior to
> ARM v8.3. The only instruction that was not undef is eret.
> 
> This patch sets up a handler for EL2 registers and SP_EL1 register
> accesses at EL1. The host hypervisor keeps those register values in
> memory, and will emulate their behavior.
> 
> This patch doesn't set the NV bit yet. It will be set in a later patch
> once nested virtualization support is completed.
> 
> Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
> Signed-off-by: Jintack Lim <jintack.lim@linaro.org>
> [maz: EL2_REG() macros]
> Signed-off-by: Marc Zyngier <maz@kernel.org>
[..]
> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index c6cbfe6b854b..1e6ae3b2e6dd 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -24,6 +24,7 @@
>  #include <asm/kvm_emulate.h>
>  #include <asm/kvm_hyp.h>
>  #include <asm/kvm_mmu.h>
> +#include <asm/kvm_nested.h>
>  #include <asm/perf_event.h>
>  #include <asm/sysreg.h>
>  
> @@ -102,6 +103,18 @@ static u32 get_ccsidr(u32 csselr)
>  	return ccsidr;
>  }
>  
> +static bool access_rw(struct kvm_vcpu *vcpu,
> +		      struct sys_reg_params *p,
> +		      const struct sys_reg_desc *r)
> +{
> +	if (p->is_write)
> +		vcpu_write_sys_reg(vcpu, p->regval, r->reg);
> +	else
> +		p->regval = vcpu_read_sys_reg(vcpu, r->reg);
> +
> +	return true;
> +}
> +
>  /*
>   * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
>   */
> @@ -260,6 +273,14 @@ static bool trap_raz_wi(struct kvm_vcpu *vcpu,
>  		return read_zero(vcpu, p);
>  }
>  
> +static bool trap_undef(struct kvm_vcpu *vcpu,
> +		       struct sys_reg_params *p,
> +		       const struct sys_reg_desc *r)
> +{
> +	kvm_inject_undefined(vcpu);
> +	return false;
> +}
> +
>  /*
>   * ARMv8.1 mandates at least a trivial LORegion implementation, where all the
>   * RW registers are RES0 (which we can implement as RAZ/WI). On an ARMv8.0
> @@ -370,12 +391,9 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu,
>  			    struct sys_reg_params *p,
>  			    const struct sys_reg_desc *r)
>  {
> -	if (p->is_write) {
> -		vcpu_write_sys_reg(vcpu, p->regval, r->reg);
> +	access_rw(vcpu, p, r);
> +	if (p->is_write)
>  		vcpu_set_flag(vcpu, DEBUG_DIRTY);
> -	} else {
> -		p->regval = vcpu_read_sys_reg(vcpu, r->reg);
> -	}
>  
>  	trace_trap_reg(__func__, r->reg, p->is_write, p->regval);
>  
> @@ -1446,6 +1464,24 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu,
>  	.visibility = mte_visibility,		\
>  }
>  
> +static unsigned int el2_visibility(const struct kvm_vcpu *vcpu,
> +				   const struct sys_reg_desc *rd)
> +{
> +	if (vcpu_has_nv(vcpu))
> +		return 0;
> +
> +	return REG_HIDDEN;
> +}
> +
> +#define EL2_REG(name, acc, rst, v) {		\
> +	SYS_DESC(SYS_##name),			\
> +	.access = acc,				\
> +	.reset = rst,				\
> +	.reg = name,				\
> +	.visibility = el2_visibility,		\
> +	.val = v,				\
> +}
> +
>  /* sys_reg_desc initialiser for known cpufeature ID registers */
>  #define ID_SANITISED(name) {			\
>  	SYS_DESC(SYS_##name),			\
> @@ -1490,6 +1526,18 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu,
>  	.visibility = raz_visibility,		\
>  }
>  
> +static bool access_sp_el1(struct kvm_vcpu *vcpu,
> +			  struct sys_reg_params *p,
> +			  const struct sys_reg_desc *r)
> +{
> +	if (p->is_write)
> +		__vcpu_sys_reg(vcpu, SP_EL1) = p->regval;
> +	else
> +		p->regval = __vcpu_sys_reg(vcpu, SP_EL1);
> +
> +	return true;
> +}
> +
>  /*
>   * Architected system registers.
>   * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2
> @@ -1913,9 +1961,50 @@ static const struct sys_reg_desc sys_reg_descs[] = {
>  	{ PMU_SYS_REG(SYS_PMCCFILTR_EL0), .access = access_pmu_evtyper,
>  	  .reset = reset_val, .reg = PMCCFILTR_EL0, .val = 0 },
>  
> +	EL2_REG(VPIDR_EL2, access_rw, reset_unknown, 0),
> +	EL2_REG(VMPIDR_EL2, access_rw, reset_unknown, 0),
> +	EL2_REG(SCTLR_EL2, access_rw, reset_val, SCTLR_EL2_RES1),
> +	EL2_REG(ACTLR_EL2, access_rw, reset_val, 0),
> +	EL2_REG(HCR_EL2, access_rw, reset_val, 0),
> +	EL2_REG(MDCR_EL2, access_rw, reset_val, 0),
> +	EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_EL2_DEFAULT ),
> +	EL2_REG(HSTR_EL2, access_rw, reset_val, 0),
> +	EL2_REG(HACR_EL2, access_rw, reset_val, 0),
> +
> +	EL2_REG(TTBR0_EL2, access_rw, reset_val, 0),
> +	EL2_REG(TTBR1_EL2, access_rw, reset_val, 0),
> +	EL2_REG(TCR_EL2, access_rw, reset_val, TCR_EL2_RES1),
> +	EL2_REG(VTTBR_EL2, access_rw, reset_val, 0),
> +	EL2_REG(VTCR_EL2, access_rw, reset_val, 0),
> +
>  	{ SYS_DESC(SYS_DACR32_EL2), NULL, reset_unknown, DACR32_EL2 },
> +	EL2_REG(SPSR_EL2, access_rw, reset_val, 0),
> +	EL2_REG(ELR_EL2, access_rw, reset_val, 0),
> +	{ SYS_DESC(SYS_SP_EL1), access_sp_el1},
> +
>  	{ SYS_DESC(SYS_IFSR32_EL2), NULL, reset_unknown, IFSR32_EL2 },
> +	EL2_REG(AFSR0_EL2, access_rw, reset_val, 0),
> +	EL2_REG(AFSR1_EL2, access_rw, reset_val, 0),
> +	EL2_REG(ESR_EL2, access_rw, reset_val, 0),
>  	{ SYS_DESC(SYS_FPEXC32_EL2), NULL, reset_val, FPEXC32_EL2, 0x700 },
> +
> +	EL2_REG(FAR_EL2, access_rw, reset_val, 0),
> +	EL2_REG(HPFAR_EL2, access_rw, reset_val, 0),
> +
> +	EL2_REG(MAIR_EL2, access_rw, reset_val, 0),
> +	EL2_REG(AMAIR_EL2, access_rw, reset_val, 0),
> +
> +	EL2_REG(VBAR_EL2, access_rw, reset_val, 0),
> +	EL2_REG(RVBAR_EL2, access_rw, reset_val, 0),
> +	{ SYS_DESC(SYS_RMR_EL2), trap_undef },
> +
> +	EL2_REG(CONTEXTIDR_EL2, access_rw, reset_val, 0),
> +	EL2_REG(TPIDR_EL2, access_rw, reset_val, 0),
> +
> +	EL2_REG(CNTVOFF_EL2, access_rw, reset_val, 0),
> +	EL2_REG(CNTHCTL_EL2, access_rw, reset_val, 0),
> +
> +	EL2_REG(SP_EL2, NULL, reset_unknown, 0),
>  };
>  
>  static bool trap_dbgdidr(struct kvm_vcpu *vcpu,
> -- 
> 2.34.1
> 
> 

I'm having an issue with this commit where a VCPU is getting a CNTVOFF_EL2 set
to 0, so it sees the same time as the host system, and the other VCPU has the
correct offset.

This is due to the new line added in this commit:

	EL2_REG(CNTVOFF_EL2, access_rw, reset_val, 0),

Here is an excerpt of the dmesg:

[    0.000000] Booting Linux on physical CPU 0x0000000000 [0x000f0510]                                                                                                               
[    0.000000] Linux version 5.14.0-rc1-00011-gc2b59ec4a322-dirty SMP PREEMPT
[    0.000000] Machine model: linux,dummy-virt                                                                                                                                       
[    0.000000] efi: UEFI not found.                                                                                                                                                  
[    0.000000] earlycon: ns16550a0 at MMIO 0x0000000001000000 (options '')                                                                                                           
[    0.000000] printk: bootconsole [ns16550a0] enabled                                                                                                                               
[    0.000000] NUMA: No NUMA configuration found                                                                                                                                     
[    0.000000] NUMA: Faking a node at [mem 0x0000000080000000-0x0000000093ffffff]                                                                                                    
[    0.000000] NUMA: NODE_DATA [mem 0x93f60c00-0x93f62fff]                                                                                                                           
[    0.000000] Zone ranges:                                                                                                                                                          
[    0.000000]   DMA      [mem 0x0000000080000000-0x0000000093ffffff]                                                                                                                
[    0.000000]   DMA32    empty                                                                                                                                                      
[    0.000000]   Normal   empty                                                                                                                                                      
[    0.000000] Movable zone start for each node                                                                                                                                      
[    0.000000] Early memory node ranges                                                                                                                                              
[    0.000000]   node   0: [mem 0x0000000080000000-0x0000000093ffffff]                                                                                                               
[    0.000000] Initmem setup node 0 [mem 0x0000000080000000-0x0000000093ffffff]                                                                                                      
[    0.000000] On node 0, zone DMA: 16384 pages in unavailable ranges                                                                                                                
[    0.000000] cma: Reserved 32 MiB at 0x0000000091800000                                                                                                                            
[    0.000000] psci: probing for conduit method from DT.                                                                                                                             
[    0.000000] psci: PSCIv1.1 detected in firmware.                                                                                                                                  
[    0.000000] psci: Using standard PSCI v0.2 function IDs                                                                                                                           
[    0.000000] psci: Trusted OS migration not required                                                                                                                               
[    0.000000] psci: SMC Calling Convention v1.1                                                                                                                                     
[    0.000000] smccc: KVM: hypervisor services detected (0x00000000 0x00000000 0x00000000 0x00000003)                                                                                
[    0.000000] percpu: Embedded 33 pages/cpu s96088 r8192 d30888 u135168                                                                                                             
[    0.000000] Detected PIPT I-cache on CPU0                                                                                                                                         
[    0.000000] CPU features: detected: Branch Target Identification                                                                                                                  
[    0.000000] CPU features: detected: GIC system register CPU interface                                                                                                             
[    0.000000] CPU features: detected: Spectre-v4                                                                                                                                    
[    0.000000] CPU features: kernel page table isolation forced ON by KASLR                                                                                                          
[    0.000000] CPU features: detected: Kernel page table isolation (KPTI)             
[..]
[    0.516017] fsl-mc MSI: msic domain created
[    0.544449] EFI services will not be available.
[    0.575385] smp: Bringing up secondary CPUs ...
[    0.658517] smp: Brought up 1 node, 2 CPUs
[    0.849724] SMP: Total of 2 processors activated.
[    0.893825] CPU features: detected: 32-bit EL0 Support
[    0.935321] CPU features: detected: 32-bit EL1 Support
[    0.989719] CPU features: detected: ARMv8.4 Translation Table Level
[    1.049576] CPU features: detected: Data cache clean to the PoU not required for I/D coherence
[    1.108269] CPU features: detected: Common not Private translations
[    1.151863] CPU features: detected: CRC32 instructions
[    1.186088] CPU features: detected: RCpc load-acquire (LDAPR)
[    1.225008] CPU features: detected: LSE atomic instructions
[    1.262173] CPU features: detected: Privileged Access Never
[    1.299687] CPU features: detected: RAS Extension Support
[    1.337388] CPU features: detected: Random Number Generator
[    1.374957] CPU features: detected: Speculation barrier (SB)
[    1.412860] CPU features: detected: Stage-2 Force Write-Back
[    1.458967] CPU features: detected: TLB range maintenance instructions
[    1.503504] CPU features: detected: Speculative Store Bypassing Safe (SSBS)
[    6.740431] CPU: All CPU(s) started at EL1
[    6.779526] alternatives: patching kernel code
[ 1685.522538] devtmpfs: initialized
[ 1685.591049] KASLR enabled
[ 1685.614942] clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 7645041785100000 ns
[ 1685.686698] futex hash table entries: 512 (order: 3, 32768 bytes, linear)
[ 1685.748071] pinctrl core: initialized pinctrl subsystem
[ 1685.809348] DMI not present or invalid.
[ 1685.854306] NET: Registered PF_NETLINK/PF_ROUTE protocol family
[ 1685.924942] DMA: preallocated 128 KiB GFP_KERNEL pool for atomic allocations
[ 1685.981774] DMA: preallocated 128 KiB GFP_KERNEL|GFP_DMA pool for atomic allocations
[ 1686.042387] DMA: preallocated 128 KiB GFP_KERNEL|GFP_DMA32 pool for atomic allocations
[ 1686.099864] audit: initializing netlink subsys (disabled)
[ 1686.152647] audit: type=2000 audit(5.488:1): state=initialized audit_enabled=0 res=1
[    7.525335] thermal_sys: Registered thermal governor 'step_wise'
[    7.565113] thermal_sys: Registered thermal governor 'power_allocator'
[    7.609143] cpuidle: using governor menu
[    7.681526] NET: Registered PF_QIPCRTR protocol family
[    7.720877] hw-breakpoint: found 6 breakpoint and 4 watchpoint registers.
[    7.770844] ASID allocator initialised with 32768 entries
[    7.816662] Serial: AMBA PL011 UART driver
[ 1686.613910] HugeTLB registered 1.00 GiB page size, pre-allocated 0 pages
[ 1686.666303] HugeTLB registered 32.0 MiB page size, pre-allocated 0 pages
[ 1686.725358] HugeTLB registered 2.00 MiB page size, pre-allocated 0 pages
[ 1686.772979] HugeTLB registered 64.0 KiB page size, pre-allocated 0 pages
[..]

The flow of execution looks like this:
	KVM_CREATE_VCPU 0 (VMM) ->
		kvm_timer_vcpu_init ->
			update_vtimer_cntvoff (VCPU0.CNTVOFF_EL2=kvm_phys_timer_read)
	KVM_ARM_VCPU_INIT (VMM) ->
		kvm_arch_vcpu_ioctl_vcpu_init ->
			kvm_vcpu_set_target ->
				kvm_reset_vcpu ->
					kvm_reset_sys_regs (VCPU0.CNTVOFF_EL2=0)

	KVM_CREATE_VCPU 1 (VMM) ->
		kvm_timer_vcpu_init ->
			update_vtimer_cntvoff (VCPU0.CNTVOFF_EL2=VCPU1.CNTVOFF_EL2=kvm_phys_timer_read)
	KVM_ARM_VCPU_INIT (VMM) ->
		kvm_arch_vcpu_ioctl_vcpu_init ->
			kvm_vcpu_set_target ->
				kvm_reset_vcpu ->
					kvm_reset_sys_regs (VCPU1.CNTVOFF_EL2=0)

	At this point VCPU0 has CNTVOFF_EL2 == kvm_phys_timer_read, and VCPU1
	has CNTVOFF_EL2 == 0.

The VCPUs having different CNTVOFF_EL2 valuess is just a symptom of the fact that
CNTVOFF_EL2 is now reset in kvm_reset_sys_regs.

This is with linux kvmarm/next [1], kvmtool [2], qemu [3] (I also saw similar
behaviour on FVP).

qemu-system-aarch64 -M virt \                                                                                                                                        
      -machine virtualization=true \                                                                                                                                                 
      -machine virt,gic-version=3 \                                                                                                                                                 
      -machine mte=off \                                                                                                                                                             
      -cpu max,pauth=off -smp 2 -m 12144 \                                                                                                                         
      -drive if=none,format=qcow2,file=disk.img,id=blk0 \                                                                                                                  
      -device virtio-blk-pci,drive=blk0 \                                                                                                                                            
      -nographic \                                                                                                                                           
      -device virtio-net-pci,netdev=net0 \                                                                                                                            
      -netdev user,id=net0,hostfwd=tcp::8024-:22 \                                                                                                                        
      -kernel ../linux/arch/arm64/boot/Image \                                                                                                                                       
      -append "root=/dev/vda2" 

And then running kvmtool:
	lkvm run -k Image -p earlycon

Hopefully that is all that is needed to reproduce it.

The following patch gets it booting again, but I'm sure it's not the right fix.

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 53749d3a0996..1a1abda6af74 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1587,6 +1587,14 @@ static bool access_ccsidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
        return true;
 }
 
+static void nv_reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+       BUG_ON(!r->reg);
+       BUG_ON(r->reg >= NR_SYS_REGS);
+       if (vcpu_has_nv(vcpu))
+               __vcpu_sys_reg(vcpu, r->reg) = r->val;
+}
+
 static unsigned int mte_visibility(const struct kvm_vcpu *vcpu,
                                   const struct sys_reg_desc *rd)
 {
@@ -2190,7 +2198,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
        EL2_REG(CONTEXTIDR_EL2, access_rw, reset_val, 0),
        EL2_REG(TPIDR_EL2, access_rw, reset_val, 0),
 
-       EL2_REG(CNTVOFF_EL2, access_rw, reset_val, 0),
+       EL2_REG(CNTVOFF_EL2, access_rw, nv_reset_val, 0),
        EL2_REG(CNTHCTL_EL2, access_rw, reset_val, 0),
 
        EL12_REG(SCTLR, access_vm_reg, reset_val, 0x00C50078),

Thanks,
Joey

[1] https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git/commit/?h=next&id=96a4627dbbd48144a65af936b321701c70876026
[2] e17d182ad3f797f01947fc234d95c96c050c534b
[3] 99d6b11b5b44d7dd64f4cb1973184e40a4a174f8
Oliver Upton Feb. 24, 2023, 6:36 p.m. UTC | #2
Hi Joey,

On Fri, Feb 24, 2023 at 05:39:15PM +0000, Joey Gouly wrote:
> I'm having an issue with this commit where a VCPU is getting a CNTVOFF_EL2 set
> to 0, so it sees the same time as the host system, and the other VCPU has the
> correct offset.

Yikes!

> The flow of execution looks like this:
> 	KVM_CREATE_VCPU 0 (VMM) ->
> 		kvm_timer_vcpu_init ->
> 			update_vtimer_cntvoff (VCPU0.CNTVOFF_EL2=kvm_phys_timer_read)
> 	KVM_ARM_VCPU_INIT (VMM) ->
> 		kvm_arch_vcpu_ioctl_vcpu_init ->
> 			kvm_vcpu_set_target ->
> 				kvm_reset_vcpu ->
> 					kvm_reset_sys_regs (VCPU0.CNTVOFF_EL2=0)
> 
> 	KVM_CREATE_VCPU 1 (VMM) ->
> 		kvm_timer_vcpu_init ->
> 			update_vtimer_cntvoff (VCPU0.CNTVOFF_EL2=VCPU1.CNTVOFF_EL2=kvm_phys_timer_read)
> 	KVM_ARM_VCPU_INIT (VMM) ->
> 		kvm_arch_vcpu_ioctl_vcpu_init ->
> 			kvm_vcpu_set_target ->
> 				kvm_reset_vcpu ->
> 					kvm_reset_sys_regs (VCPU1.CNTVOFF_EL2=0)
> 
> 	At this point VCPU0 has CNTVOFF_EL2 == kvm_phys_timer_read, and VCPU1
> 	has CNTVOFF_EL2 == 0.
> 
> The VCPUs having different CNTVOFF_EL2 valuess is just a symptom of the fact that
> CNTVOFF_EL2 is now reset in kvm_reset_sys_regs.

Right, and the fundamental problem at hand is that we used CNTVOFF_EL2
to stash the _host's_ offset even though we are trying to change the
meaning of it to be part of the virtual EL2's context.

> The following patch gets it booting again, but I'm sure it's not the right fix.

I'd rather we just break the host away from using the shadow reg
altogether and separately track the host offset. As it so happens Marc
has a patch that does exactly that [*].

Marc, do you want to resend that patch in isolation addressing the
feedback and link to this bug report?

[*] https://lore.kernel.org/kvmarm/20230216142123.2638675-6-maz@kernel.org/
Marc Zyngier Feb. 24, 2023, 7:03 p.m. UTC | #3
On Fri, 24 Feb 2023 18:36:23 +0000,
Oliver Upton <oliver.upton@linux.dev> wrote:
> 
> Hi Joey,
> 
> On Fri, Feb 24, 2023 at 05:39:15PM +0000, Joey Gouly wrote:
> > I'm having an issue with this commit where a VCPU is getting a CNTVOFF_EL2 set
> > to 0, so it sees the same time as the host system, and the other VCPU has the
> > correct offset.
> 
> Yikes!
> 
> > The flow of execution looks like this:
> > 	KVM_CREATE_VCPU 0 (VMM) ->
> > 		kvm_timer_vcpu_init ->
> > 			update_vtimer_cntvoff (VCPU0.CNTVOFF_EL2=kvm_phys_timer_read)
> > 	KVM_ARM_VCPU_INIT (VMM) ->
> > 		kvm_arch_vcpu_ioctl_vcpu_init ->
> > 			kvm_vcpu_set_target ->
> > 				kvm_reset_vcpu ->
> > 					kvm_reset_sys_regs (VCPU0.CNTVOFF_EL2=0)
> > 
> > 	KVM_CREATE_VCPU 1 (VMM) ->
> > 		kvm_timer_vcpu_init ->
> > 			update_vtimer_cntvoff (VCPU0.CNTVOFF_EL2=VCPU1.CNTVOFF_EL2=kvm_phys_timer_read)
> > 	KVM_ARM_VCPU_INIT (VMM) ->
> > 		kvm_arch_vcpu_ioctl_vcpu_init ->
> > 			kvm_vcpu_set_target ->
> > 				kvm_reset_vcpu ->
> > 					kvm_reset_sys_regs (VCPU1.CNTVOFF_EL2=0)
> > 
> > 	At this point VCPU0 has CNTVOFF_EL2 == kvm_phys_timer_read, and VCPU1
> > 	has CNTVOFF_EL2 == 0.
> > 
> > The VCPUs having different CNTVOFF_EL2 valuess is just a symptom of the fact that
> > CNTVOFF_EL2 is now reset in kvm_reset_sys_regs.
> 
> Right, and the fundamental problem at hand is that we used CNTVOFF_EL2
> to stash the _host's_ offset even though we are trying to change the
> meaning of it to be part of the virtual EL2's context.

What is driving me nuts is that I have never managed to reproduce the
problem so far. I guess I have too much crap in my tree to spot it...

> 
> > The following patch gets it booting again, but I'm sure it's not the right fix.
> 
> I'd rather we just break the host away from using the shadow reg
> altogether and separately track the host offset. As it so happens Marc
> has a patch that does exactly that [*].

Yup, might as well fix that *and* kill the lock inversion issue in one
go...

> Marc, do you want to resend that patch in isolation addressing the
> feedback and link to this bug report?
> 
> [*] https://lore.kernel.org/kvmarm/20230216142123.2638675-6-maz@kernel.org/

Joey, could you please give that patch a go, just to be on the safe
side?

Thanks,

	M.
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 1312fb48f18b..d13f168abe4b 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -490,23 +490,51 @@ 
 
 #define SYS_PMCCFILTR_EL0		sys_reg(3, 3, 14, 15, 7)
 
+#define SYS_VPIDR_EL2			sys_reg(3, 4, 0, 0, 0)
+#define SYS_VMPIDR_EL2			sys_reg(3, 4, 0, 0, 5)
+
 #define SYS_SCTLR_EL2			sys_reg(3, 4, 1, 0, 0)
+#define SYS_ACTLR_EL2			sys_reg(3, 4, 1, 0, 1)
+#define SYS_HCR_EL2			sys_reg(3, 4, 1, 1, 0)
+#define SYS_MDCR_EL2			sys_reg(3, 4, 1, 1, 1)
+#define SYS_CPTR_EL2			sys_reg(3, 4, 1, 1, 2)
+#define SYS_HSTR_EL2			sys_reg(3, 4, 1, 1, 3)
 #define SYS_HFGRTR_EL2			sys_reg(3, 4, 1, 1, 4)
 #define SYS_HFGWTR_EL2			sys_reg(3, 4, 1, 1, 5)
 #define SYS_HFGITR_EL2			sys_reg(3, 4, 1, 1, 6)
+#define SYS_HACR_EL2			sys_reg(3, 4, 1, 1, 7)
+
+#define SYS_TTBR0_EL2			sys_reg(3, 4, 2, 0, 0)
+#define SYS_TTBR1_EL2			sys_reg(3, 4, 2, 0, 1)
+#define SYS_TCR_EL2			sys_reg(3, 4, 2, 0, 2)
+#define SYS_VTTBR_EL2			sys_reg(3, 4, 2, 1, 0)
+#define SYS_VTCR_EL2			sys_reg(3, 4, 2, 1, 2)
+
 #define SYS_TRFCR_EL2			sys_reg(3, 4, 1, 2, 1)
 #define SYS_HDFGRTR_EL2			sys_reg(3, 4, 3, 1, 4)
 #define SYS_HDFGWTR_EL2			sys_reg(3, 4, 3, 1, 5)
 #define SYS_HAFGRTR_EL2			sys_reg(3, 4, 3, 1, 6)
 #define SYS_SPSR_EL2			sys_reg(3, 4, 4, 0, 0)
 #define SYS_ELR_EL2			sys_reg(3, 4, 4, 0, 1)
+#define SYS_SP_EL1			sys_reg(3, 4, 4, 1, 0)
 #define SYS_IFSR32_EL2			sys_reg(3, 4, 5, 0, 1)
+#define SYS_AFSR0_EL2			sys_reg(3, 4, 5, 1, 0)
+#define SYS_AFSR1_EL2			sys_reg(3, 4, 5, 1, 1)
 #define SYS_ESR_EL2			sys_reg(3, 4, 5, 2, 0)
 #define SYS_VSESR_EL2			sys_reg(3, 4, 5, 2, 3)
 #define SYS_FPEXC32_EL2			sys_reg(3, 4, 5, 3, 0)
 #define SYS_TFSR_EL2			sys_reg(3, 4, 5, 6, 0)
 
-#define SYS_VDISR_EL2			sys_reg(3, 4, 12, 1,  1)
+#define SYS_FAR_EL2			sys_reg(3, 4, 6, 0, 0)
+#define SYS_HPFAR_EL2			sys_reg(3, 4, 6, 0, 4)
+
+#define SYS_MAIR_EL2			sys_reg(3, 4, 10, 2, 0)
+#define SYS_AMAIR_EL2			sys_reg(3, 4, 10, 3, 0)
+
+#define SYS_VBAR_EL2			sys_reg(3, 4, 12, 0, 0)
+#define SYS_RVBAR_EL2			sys_reg(3, 4, 12, 0, 1)
+#define SYS_RMR_EL2			sys_reg(3, 4, 12, 0, 2)
+#define SYS_VDISR_EL2			sys_reg(3, 4, 12, 1, 1)
 #define __SYS__AP0Rx_EL2(x)		sys_reg(3, 4, 12, 8, x)
 #define SYS_ICH_AP0R0_EL2		__SYS__AP0Rx_EL2(0)
 #define SYS_ICH_AP0R1_EL2		__SYS__AP0Rx_EL2(1)
@@ -548,6 +576,12 @@ 
 #define SYS_ICH_LR14_EL2		__SYS__LR8_EL2(6)
 #define SYS_ICH_LR15_EL2		__SYS__LR8_EL2(7)
 
+#define SYS_CONTEXTIDR_EL2		sys_reg(3, 4, 13, 0, 1)
+#define SYS_TPIDR_EL2			sys_reg(3, 4, 13, 0, 2)
+
+#define SYS_CNTVOFF_EL2			sys_reg(3, 4, 14, 0, 3)
+#define SYS_CNTHCTL_EL2			sys_reg(3, 4, 14, 1, 0)
+
 /* VHE encodings for architectural EL0/1 system registers */
 #define SYS_SCTLR_EL12			sys_reg(3, 5, 1, 0, 0)
 #define SYS_TTBR0_EL12			sys_reg(3, 5, 2, 0, 0)
@@ -570,6 +604,8 @@ 
 #define SYS_CNTV_CTL_EL02		sys_reg(3, 5, 14, 3, 1)
 #define SYS_CNTV_CVAL_EL02		sys_reg(3, 5, 14, 3, 2)
 
+#define SYS_SP_EL2			sys_reg(3, 6,  4, 1, 0)
+
 /* Common SCTLR_ELx flags. */
 #define SCTLR_ELx_ENTP2	(BIT(60))
 #define SCTLR_ELx_DSSBS	(BIT(44))
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index c6cbfe6b854b..1e6ae3b2e6dd 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -24,6 +24,7 @@ 
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_hyp.h>
 #include <asm/kvm_mmu.h>
+#include <asm/kvm_nested.h>
 #include <asm/perf_event.h>
 #include <asm/sysreg.h>
 
@@ -102,6 +103,18 @@  static u32 get_ccsidr(u32 csselr)
 	return ccsidr;
 }
 
+static bool access_rw(struct kvm_vcpu *vcpu,
+		      struct sys_reg_params *p,
+		      const struct sys_reg_desc *r)
+{
+	if (p->is_write)
+		vcpu_write_sys_reg(vcpu, p->regval, r->reg);
+	else
+		p->regval = vcpu_read_sys_reg(vcpu, r->reg);
+
+	return true;
+}
+
 /*
  * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
  */
@@ -260,6 +273,14 @@  static bool trap_raz_wi(struct kvm_vcpu *vcpu,
 		return read_zero(vcpu, p);
 }
 
+static bool trap_undef(struct kvm_vcpu *vcpu,
+		       struct sys_reg_params *p,
+		       const struct sys_reg_desc *r)
+{
+	kvm_inject_undefined(vcpu);
+	return false;
+}
+
 /*
  * ARMv8.1 mandates at least a trivial LORegion implementation, where all the
  * RW registers are RES0 (which we can implement as RAZ/WI). On an ARMv8.0
@@ -370,12 +391,9 @@  static bool trap_debug_regs(struct kvm_vcpu *vcpu,
 			    struct sys_reg_params *p,
 			    const struct sys_reg_desc *r)
 {
-	if (p->is_write) {
-		vcpu_write_sys_reg(vcpu, p->regval, r->reg);
+	access_rw(vcpu, p, r);
+	if (p->is_write)
 		vcpu_set_flag(vcpu, DEBUG_DIRTY);
-	} else {
-		p->regval = vcpu_read_sys_reg(vcpu, r->reg);
-	}
 
 	trace_trap_reg(__func__, r->reg, p->is_write, p->regval);
 
@@ -1446,6 +1464,24 @@  static unsigned int mte_visibility(const struct kvm_vcpu *vcpu,
 	.visibility = mte_visibility,		\
 }
 
+static unsigned int el2_visibility(const struct kvm_vcpu *vcpu,
+				   const struct sys_reg_desc *rd)
+{
+	if (vcpu_has_nv(vcpu))
+		return 0;
+
+	return REG_HIDDEN;
+}
+
+#define EL2_REG(name, acc, rst, v) {		\
+	SYS_DESC(SYS_##name),			\
+	.access = acc,				\
+	.reset = rst,				\
+	.reg = name,				\
+	.visibility = el2_visibility,		\
+	.val = v,				\
+}
+
 /* sys_reg_desc initialiser for known cpufeature ID registers */
 #define ID_SANITISED(name) {			\
 	SYS_DESC(SYS_##name),			\
@@ -1490,6 +1526,18 @@  static unsigned int mte_visibility(const struct kvm_vcpu *vcpu,
 	.visibility = raz_visibility,		\
 }
 
+static bool access_sp_el1(struct kvm_vcpu *vcpu,
+			  struct sys_reg_params *p,
+			  const struct sys_reg_desc *r)
+{
+	if (p->is_write)
+		__vcpu_sys_reg(vcpu, SP_EL1) = p->regval;
+	else
+		p->regval = __vcpu_sys_reg(vcpu, SP_EL1);
+
+	return true;
+}
+
 /*
  * Architected system registers.
  * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2
@@ -1913,9 +1961,50 @@  static const struct sys_reg_desc sys_reg_descs[] = {
 	{ PMU_SYS_REG(SYS_PMCCFILTR_EL0), .access = access_pmu_evtyper,
 	  .reset = reset_val, .reg = PMCCFILTR_EL0, .val = 0 },
 
+	EL2_REG(VPIDR_EL2, access_rw, reset_unknown, 0),
+	EL2_REG(VMPIDR_EL2, access_rw, reset_unknown, 0),
+	EL2_REG(SCTLR_EL2, access_rw, reset_val, SCTLR_EL2_RES1),
+	EL2_REG(ACTLR_EL2, access_rw, reset_val, 0),
+	EL2_REG(HCR_EL2, access_rw, reset_val, 0),
+	EL2_REG(MDCR_EL2, access_rw, reset_val, 0),
+	EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_EL2_DEFAULT ),
+	EL2_REG(HSTR_EL2, access_rw, reset_val, 0),
+	EL2_REG(HACR_EL2, access_rw, reset_val, 0),
+
+	EL2_REG(TTBR0_EL2, access_rw, reset_val, 0),
+	EL2_REG(TTBR1_EL2, access_rw, reset_val, 0),
+	EL2_REG(TCR_EL2, access_rw, reset_val, TCR_EL2_RES1),
+	EL2_REG(VTTBR_EL2, access_rw, reset_val, 0),
+	EL2_REG(VTCR_EL2, access_rw, reset_val, 0),
+
 	{ SYS_DESC(SYS_DACR32_EL2), NULL, reset_unknown, DACR32_EL2 },
+	EL2_REG(SPSR_EL2, access_rw, reset_val, 0),
+	EL2_REG(ELR_EL2, access_rw, reset_val, 0),
+	{ SYS_DESC(SYS_SP_EL1), access_sp_el1},
+
 	{ SYS_DESC(SYS_IFSR32_EL2), NULL, reset_unknown, IFSR32_EL2 },
+	EL2_REG(AFSR0_EL2, access_rw, reset_val, 0),
+	EL2_REG(AFSR1_EL2, access_rw, reset_val, 0),
+	EL2_REG(ESR_EL2, access_rw, reset_val, 0),
 	{ SYS_DESC(SYS_FPEXC32_EL2), NULL, reset_val, FPEXC32_EL2, 0x700 },
+
+	EL2_REG(FAR_EL2, access_rw, reset_val, 0),
+	EL2_REG(HPFAR_EL2, access_rw, reset_val, 0),
+
+	EL2_REG(MAIR_EL2, access_rw, reset_val, 0),
+	EL2_REG(AMAIR_EL2, access_rw, reset_val, 0),
+
+	EL2_REG(VBAR_EL2, access_rw, reset_val, 0),
+	EL2_REG(RVBAR_EL2, access_rw, reset_val, 0),
+	{ SYS_DESC(SYS_RMR_EL2), trap_undef },
+
+	EL2_REG(CONTEXTIDR_EL2, access_rw, reset_val, 0),
+	EL2_REG(TPIDR_EL2, access_rw, reset_val, 0),
+
+	EL2_REG(CNTVOFF_EL2, access_rw, reset_val, 0),
+	EL2_REG(CNTHCTL_EL2, access_rw, reset_val, 0),
+
+	EL2_REG(SP_EL2, NULL, reset_unknown, 0),
 };
 
 static bool trap_dbgdidr(struct kvm_vcpu *vcpu,