diff mbox series

[13/59] KVM: arm64: nv: Handle virtual EL2 registers in vcpu_read/write_sys_reg()

Message ID 20190621093843.220980-14-marc.zyngier@arm.com (mailing list archive)
State New, archived
Headers show
Series KVM: arm64: ARMv8.3 Nested Virtualization support | expand

Commit Message

Marc Zyngier June 21, 2019, 9:37 a.m. UTC
From: Andre Przywara <andre.przywara@arm.com>

KVM internally uses accessor functions when reading or writing the
guest's system registers. This takes care of accessing either the stored
copy or using the "live" EL1 system registers when the host uses VHE.

With the introduction of virtual EL2 we add a bunch of EL2 system
registers, which now must also be taken care of:
- If the guest is running in vEL2, and we access an EL1 sysreg, we must
  revert to the stored version of that, and not use the CPU's copy.
- If the guest is running in vEL1, and we access an EL2 sysreg, we must
  also use the stored version, since the CPU carries the EL1 copy.
- Some EL2 system registers are supposed to affect the current execution
  of the system, so we need to put them into their respective EL1
  counterparts. For this we need to define a mapping between the two.
  This is done using the newly introduced struct el2_sysreg_map.
- Some EL2 system registers have a different format than their EL1
  counterpart, so we need to translate them before writing them to the
  CPU. This is done using an (optional) translate function in the map.
- There are the three special registers SP_EL2, SPSR_EL2 and ELR_EL2,
  which need some separate handling.

All of these cases are now wrapped into the existing accessor functions,
so KVM users wouldn't need to care whether they access EL2 or EL1
registers and also which state the guest is in.

This handles what was formerly known as the "shadow state" dynamically,
without requiring a separate copy for each vCPU EL.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 arch/arm64/include/asm/kvm_emulate.h |   6 +
 arch/arm64/include/asm/kvm_host.h    |   5 +
 arch/arm64/kvm/sys_regs.c            | 163 +++++++++++++++++++++++++++
 3 files changed, 174 insertions(+)

Comments

Julien Thierry June 24, 2019, 12:42 p.m. UTC | #1
On 06/21/2019 10:37 AM, Marc Zyngier wrote:
> From: Andre Przywara <andre.przywara@arm.com>
> 
> KVM internally uses accessor functions when reading or writing the
> guest's system registers. This takes care of accessing either the stored
> copy or using the "live" EL1 system registers when the host uses VHE.
> 
> With the introduction of virtual EL2 we add a bunch of EL2 system
> registers, which now must also be taken care of:
> - If the guest is running in vEL2, and we access an EL1 sysreg, we must
>   revert to the stored version of that, and not use the CPU's copy.
> - If the guest is running in vEL1, and we access an EL2 sysreg, we must
>   also use the stored version, since the CPU carries the EL1 copy.
> - Some EL2 system registers are supposed to affect the current execution
>   of the system, so we need to put them into their respective EL1
>   counterparts. For this we need to define a mapping between the two.
>   This is done using the newly introduced struct el2_sysreg_map.
> - Some EL2 system registers have a different format than their EL1
>   counterpart, so we need to translate them before writing them to the
>   CPU. This is done using an (optional) translate function in the map.
> - There are the three special registers SP_EL2, SPSR_EL2 and ELR_EL2,
>   which need some separate handling.
> 
> All of these cases are now wrapped into the existing accessor functions,
> so KVM users wouldn't need to care whether they access EL2 or EL1
> registers and also which state the guest is in.
> 
> This handles what was formerly known as the "shadow state" dynamically,
> without requiring a separate copy for each vCPU EL.
> 
> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
> ---
>  arch/arm64/include/asm/kvm_emulate.h |   6 +
>  arch/arm64/include/asm/kvm_host.h    |   5 +
>  arch/arm64/kvm/sys_regs.c            | 163 +++++++++++++++++++++++++++
>  3 files changed, 174 insertions(+)
> 
> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
> index c43aac5fed69..f37006b6eec4 100644
> --- a/arch/arm64/include/asm/kvm_emulate.h
> +++ b/arch/arm64/include/asm/kvm_emulate.h
> @@ -70,6 +70,12 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu);
>  int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2);
>  int kvm_inject_nested_irq(struct kvm_vcpu *vcpu);
>  
> +u64 translate_tcr(u64 tcr);
> +u64 translate_cptr(u64 tcr);
> +u64 translate_sctlr(u64 tcr);
> +u64 translate_ttbr0(u64 tcr);
> +u64 translate_cnthctl(u64 tcr);
> +
>  static inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
>  {
>  	return !(vcpu->arch.hcr_el2 & HCR_RW);
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 2d4290d2513a..dae9c42a7219 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -217,6 +217,11 @@ enum vcpu_sysreg {
>  	NR_SYS_REGS	/* Nothing after this line! */
>  };
>  
> +static inline bool sysreg_is_el2(int reg)
> +{
> +	return reg >= FIRST_EL2_SYSREG && reg < NR_SYS_REGS;
> +}
> +
>  /* 32bit mapping */
>  #define c0_MPIDR	(MPIDR_EL1 * 2)	/* MultiProcessor ID Register */
>  #define c0_CSSELR	(CSSELR_EL1 * 2)/* Cache Size Selection Register */
> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index 693dd063c9c2..d024114da162 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -76,11 +76,142 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu,
>  	return false;
>  }
>  
> +static u64 tcr_el2_ips_to_tcr_el1_ps(u64 tcr_el2)
> +{
> +	return ((tcr_el2 & TCR_EL2_PS_MASK) >> TCR_EL2_PS_SHIFT)
> +		<< TCR_IPS_SHIFT;
> +}
> +
> +u64 translate_tcr(u64 tcr)
> +{
> +	return TCR_EPD1_MASK |				/* disable TTBR1_EL1 */
> +	       ((tcr & TCR_EL2_TBI) ? TCR_TBI0 : 0) |
> +	       tcr_el2_ips_to_tcr_el1_ps(tcr) |
> +	       (tcr & TCR_EL2_TG0_MASK) |
> +	       (tcr & TCR_EL2_ORGN0_MASK) |
> +	       (tcr & TCR_EL2_IRGN0_MASK) |
> +	       (tcr & TCR_EL2_T0SZ_MASK);
> +}
> +
> +u64 translate_cptr(u64 cptr_el2)
> +{
> +	u64 cpacr_el1 = 0;
> +
> +	if (!(cptr_el2 & CPTR_EL2_TFP))
> +		cpacr_el1 |= CPACR_EL1_FPEN;
> +	if (cptr_el2 & CPTR_EL2_TTA)
> +		cpacr_el1 |= CPACR_EL1_TTA;
> +	if (!(cptr_el2 & CPTR_EL2_TZ))
> +		cpacr_el1 |= CPACR_EL1_ZEN;
> +
> +	return cpacr_el1;
> +}
> +
> +u64 translate_sctlr(u64 sctlr)
> +{
> +	/* Bit 20 is RES1 in SCTLR_EL1, but RES0 in SCTLR_EL2 */
> +	return sctlr | BIT(20);
> +}
> +
> +u64 translate_ttbr0(u64 ttbr0)
> +{
> +	/* Force ASID to 0 (ASID 0 or RES0) */
> +	return ttbr0 & ~GENMASK_ULL(63, 48);
> +}
> +
> +u64 translate_cnthctl(u64 cnthctl)
> +{
> +	return ((cnthctl & 0x3) << 10) | (cnthctl & 0xfc);
> +}
> +
> +#define EL2_SYSREG(el2, el1, translate)	\
> +	[el2 - FIRST_EL2_SYSREG] = { el2, el1, translate }
> +#define PURE_EL2_SYSREG(el2) \
> +	[el2 - FIRST_EL2_SYSREG] = { el2,__INVALID_SYSREG__, NULL }
> +/*
> + * Associate vEL2 registers to their EL1 counterparts on the CPU.
> + * The translate function can be NULL, when the register layout is identical.
> + */
> +struct el2_sysreg_map {
> +	int sysreg;	/* EL2 register index into the array above */
> +	int mapping;	/* associated EL1 register */
> +	u64 (*translate)(u64 value);
> +} nested_sysreg_map[NR_SYS_REGS - FIRST_EL2_SYSREG] = {
> +	PURE_EL2_SYSREG( VPIDR_EL2 ),
> +	PURE_EL2_SYSREG( VMPIDR_EL2 ),
> +	PURE_EL2_SYSREG( ACTLR_EL2 ),
> +	PURE_EL2_SYSREG( HCR_EL2 ),
> +	PURE_EL2_SYSREG( MDCR_EL2 ),
> +	PURE_EL2_SYSREG( HSTR_EL2 ),
> +	PURE_EL2_SYSREG( HACR_EL2 ),
> +	PURE_EL2_SYSREG( VTTBR_EL2 ),
> +	PURE_EL2_SYSREG( VTCR_EL2 ),
> +	PURE_EL2_SYSREG( RVBAR_EL2 ),
> +	PURE_EL2_SYSREG( RMR_EL2 ),
> +	PURE_EL2_SYSREG( TPIDR_EL2 ),
> +	PURE_EL2_SYSREG( CNTVOFF_EL2 ),
> +	PURE_EL2_SYSREG( CNTHCTL_EL2 ),
> +	PURE_EL2_SYSREG( HPFAR_EL2 ),
> +	EL2_SYSREG(      SCTLR_EL2,  SCTLR_EL1,      translate_sctlr ),
> +	EL2_SYSREG(      CPTR_EL2,   CPACR_EL1,      translate_cptr  ),
> +	EL2_SYSREG(      TTBR0_EL2,  TTBR0_EL1,      translate_ttbr0 ),
> +	EL2_SYSREG(      TTBR1_EL2,  TTBR1_EL1,      NULL            ),
> +	EL2_SYSREG(      TCR_EL2,    TCR_EL1,        translate_tcr   ),
> +	EL2_SYSREG(      VBAR_EL2,   VBAR_EL1,       NULL            ),
> +	EL2_SYSREG(      AFSR0_EL2,  AFSR0_EL1,      NULL            ),
> +	EL2_SYSREG(      AFSR1_EL2,  AFSR1_EL1,      NULL            ),
> +	EL2_SYSREG(      ESR_EL2,    ESR_EL1,        NULL            ),
> +	EL2_SYSREG(      FAR_EL2,    FAR_EL1,        NULL            ),
> +	EL2_SYSREG(      MAIR_EL2,   MAIR_EL1,       NULL            ),
> +	EL2_SYSREG(      AMAIR_EL2,  AMAIR_EL1,      NULL            ),
> +};
> +
> +static
> +const struct el2_sysreg_map *find_el2_sysreg(const struct el2_sysreg_map *map,
> +					     int reg)
> +{
> +	const struct el2_sysreg_map *entry;
> +
> +	if (!sysreg_is_el2(reg))
> +		return NULL;
> +
> +	entry = &nested_sysreg_map[reg - FIRST_EL2_SYSREG];
> +	if (entry->sysreg == __INVALID_SYSREG__)
> +		return NULL;
> +
> +	return entry;
> +}
> +
>  u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
>  {
> +
>  	if (!vcpu->arch.sysregs_loaded_on_cpu)
>  		goto immediate_read;
>  
> +	if (unlikely(sysreg_is_el2(reg))) {
> +		const struct el2_sysreg_map *el2_reg;
> +
> +		if (!is_hyp_ctxt(vcpu))
> +			goto immediate_read;
> +
> +		el2_reg = find_el2_sysreg(nested_sysreg_map, reg);
> +		if (el2_reg) {
> +			/*
> +			 * If this register does not have an EL1 counterpart,
> +			 * then read the stored EL2 version.
> +			 */
> +			if (el2_reg->mapping == __INVALID_SYSREG__)

In this patch, find_el2_sysreg returns NULL for PURE_EL2 registers. So
for PURE_EL2, the access would go through the switch case. However this
branch suggest that for PURE_EL2 register we intend to do the read from
the memory backed version.

Which should it be?

> +				goto immediate_read;
> +
> +			/* Get the current version of the EL1 counterpart. */
> +			reg = el2_reg->mapping;
> +		}
> +	} else {
> +		/* EL1 register can't be on the CPU if the guest is in vEL2. */
> +		if (unlikely(is_hyp_ctxt(vcpu)))
> +			goto immediate_read;
> +	}
> +
>  	/*
>  	 * System registers listed in the switch are not saved on every
>  	 * exit from the guest but are only saved on vcpu_put.
> @@ -114,6 +245,8 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
>  	case DACR32_EL2:	return read_sysreg_s(SYS_DACR32_EL2);
>  	case IFSR32_EL2:	return read_sysreg_s(SYS_IFSR32_EL2);
>  	case DBGVCR32_EL2:	return read_sysreg_s(SYS_DBGVCR32_EL2);
> +	case SP_EL2:		return read_sysreg(sp_el1);
> +	case ELR_EL2:		return read_sysreg_el1(SYS_ELR);
>  	}
>  
>  immediate_read:
> @@ -125,6 +258,34 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
>  	if (!vcpu->arch.sysregs_loaded_on_cpu)
>  		goto immediate_write;
>  
> +	if (unlikely(sysreg_is_el2(reg))) {
> +		const struct el2_sysreg_map *el2_reg;
> +
> +		if (!is_hyp_ctxt(vcpu))
> +			goto immediate_write;
> +
> +		/* Store the EL2 version in the sysregs array. */
> +		__vcpu_sys_reg(vcpu, reg) = val;
> +
> +		el2_reg = find_el2_sysreg(nested_sysreg_map, reg);
> +		if (el2_reg) {
> +			/* Does this register have an EL1 counterpart? */
> +			if (el2_reg->mapping == __INVALID_SYSREG__)
> +				return;

As in the read case, this is never reached and we'll go through the
switch case.

Cheers,
Alexandru Elisei June 25, 2019, 2:02 p.m. UTC | #2
On 6/24/19 1:42 PM, Julien Thierry wrote:
>
> On 06/21/2019 10:37 AM, Marc Zyngier wrote:
>> From: Andre Przywara <andre.przywara@arm.com>
>>
>> KVM internally uses accessor functions when reading or writing the
>> guest's system registers. This takes care of accessing either the stored
>> copy or using the "live" EL1 system registers when the host uses VHE.
>>
>> With the introduction of virtual EL2 we add a bunch of EL2 system
>> registers, which now must also be taken care of:
>> - If the guest is running in vEL2, and we access an EL1 sysreg, we must
>>   revert to the stored version of that, and not use the CPU's copy.
>> - If the guest is running in vEL1, and we access an EL2 sysreg, we must
>>   also use the stored version, since the CPU carries the EL1 copy.
>> - Some EL2 system registers are supposed to affect the current execution
>>   of the system, so we need to put them into their respective EL1
>>   counterparts. For this we need to define a mapping between the two.
>>   This is done using the newly introduced struct el2_sysreg_map.
>> - Some EL2 system registers have a different format than their EL1
>>   counterpart, so we need to translate them before writing them to the
>>   CPU. This is done using an (optional) translate function in the map.
>> - There are the three special registers SP_EL2, SPSR_EL2 and ELR_EL2,
>>   which need some separate handling.
>>
>> All of these cases are now wrapped into the existing accessor functions,
>> so KVM users wouldn't need to care whether they access EL2 or EL1
>> registers and also which state the guest is in.
>>
>> This handles what was formerly known as the "shadow state" dynamically,
>> without requiring a separate copy for each vCPU EL.
>>
>> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
>> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
>> ---
>>  arch/arm64/include/asm/kvm_emulate.h |   6 +
>>  arch/arm64/include/asm/kvm_host.h    |   5 +
>>  arch/arm64/kvm/sys_regs.c            | 163 +++++++++++++++++++++++++++
>>  3 files changed, 174 insertions(+)
>>
>> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
>> index c43aac5fed69..f37006b6eec4 100644
>> --- a/arch/arm64/include/asm/kvm_emulate.h
>> +++ b/arch/arm64/include/asm/kvm_emulate.h
>> @@ -70,6 +70,12 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu);
>>  int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2);
>>  int kvm_inject_nested_irq(struct kvm_vcpu *vcpu);
>>  
>> +u64 translate_tcr(u64 tcr);
>> +u64 translate_cptr(u64 tcr);
>> +u64 translate_sctlr(u64 tcr);
>> +u64 translate_ttbr0(u64 tcr);
>> +u64 translate_cnthctl(u64 tcr);
>> +
>>  static inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
>>  {
>>  	return !(vcpu->arch.hcr_el2 & HCR_RW);
>> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
>> index 2d4290d2513a..dae9c42a7219 100644
>> --- a/arch/arm64/include/asm/kvm_host.h
>> +++ b/arch/arm64/include/asm/kvm_host.h
>> @@ -217,6 +217,11 @@ enum vcpu_sysreg {
>>  	NR_SYS_REGS	/* Nothing after this line! */
>>  };
>>  
>> +static inline bool sysreg_is_el2(int reg)
>> +{
>> +	return reg >= FIRST_EL2_SYSREG && reg < NR_SYS_REGS;
>> +}
>> +
>>  /* 32bit mapping */
>>  #define c0_MPIDR	(MPIDR_EL1 * 2)	/* MultiProcessor ID Register */
>>  #define c0_CSSELR	(CSSELR_EL1 * 2)/* Cache Size Selection Register */
>> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
>> index 693dd063c9c2..d024114da162 100644
>> --- a/arch/arm64/kvm/sys_regs.c
>> +++ b/arch/arm64/kvm/sys_regs.c
>> @@ -76,11 +76,142 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu,
>>  	return false;
>>  }
>>  
>> +static u64 tcr_el2_ips_to_tcr_el1_ps(u64 tcr_el2)
>> +{
>> +	return ((tcr_el2 & TCR_EL2_PS_MASK) >> TCR_EL2_PS_SHIFT)
>> +		<< TCR_IPS_SHIFT;
>> +}
>> +
>> +u64 translate_tcr(u64 tcr)
>> +{
>> +	return TCR_EPD1_MASK |				/* disable TTBR1_EL1 */
>> +	       ((tcr & TCR_EL2_TBI) ? TCR_TBI0 : 0) |
>> +	       tcr_el2_ips_to_tcr_el1_ps(tcr) |
>> +	       (tcr & TCR_EL2_TG0_MASK) |
>> +	       (tcr & TCR_EL2_ORGN0_MASK) |
>> +	       (tcr & TCR_EL2_IRGN0_MASK) |
>> +	       (tcr & TCR_EL2_T0SZ_MASK);
>> +}
>> +
>> +u64 translate_cptr(u64 cptr_el2)
>> +{
>> +	u64 cpacr_el1 = 0;
>> +
>> +	if (!(cptr_el2 & CPTR_EL2_TFP))
>> +		cpacr_el1 |= CPACR_EL1_FPEN;
>> +	if (cptr_el2 & CPTR_EL2_TTA)
>> +		cpacr_el1 |= CPACR_EL1_TTA;
>> +	if (!(cptr_el2 & CPTR_EL2_TZ))
>> +		cpacr_el1 |= CPACR_EL1_ZEN;
>> +
>> +	return cpacr_el1;
>> +}
>> +
>> +u64 translate_sctlr(u64 sctlr)
>> +{
>> +	/* Bit 20 is RES1 in SCTLR_EL1, but RES0 in SCTLR_EL2 */
>> +	return sctlr | BIT(20);
>> +}
>> +
>> +u64 translate_ttbr0(u64 ttbr0)
>> +{
>> +	/* Force ASID to 0 (ASID 0 or RES0) */
>> +	return ttbr0 & ~GENMASK_ULL(63, 48);
>> +}
>> +
>> +u64 translate_cnthctl(u64 cnthctl)
>> +{
>> +	return ((cnthctl & 0x3) << 10) | (cnthctl & 0xfc);
>> +}
>> +
>> +#define EL2_SYSREG(el2, el1, translate)	\
>> +	[el2 - FIRST_EL2_SYSREG] = { el2, el1, translate }
>> +#define PURE_EL2_SYSREG(el2) \
>> +	[el2 - FIRST_EL2_SYSREG] = { el2,__INVALID_SYSREG__, NULL }
>> +/*
>> + * Associate vEL2 registers to their EL1 counterparts on the CPU.
>> + * The translate function can be NULL, when the register layout is identical.
>> + */
>> +struct el2_sysreg_map {
>> +	int sysreg;	/* EL2 register index into the array above */
>> +	int mapping;	/* associated EL1 register */
>> +	u64 (*translate)(u64 value);
>> +} nested_sysreg_map[NR_SYS_REGS - FIRST_EL2_SYSREG] = {
>> +	PURE_EL2_SYSREG( VPIDR_EL2 ),
>> +	PURE_EL2_SYSREG( VMPIDR_EL2 ),
>> +	PURE_EL2_SYSREG( ACTLR_EL2 ),
>> +	PURE_EL2_SYSREG( HCR_EL2 ),
>> +	PURE_EL2_SYSREG( MDCR_EL2 ),
>> +	PURE_EL2_SYSREG( HSTR_EL2 ),
>> +	PURE_EL2_SYSREG( HACR_EL2 ),
>> +	PURE_EL2_SYSREG( VTTBR_EL2 ),
>> +	PURE_EL2_SYSREG( VTCR_EL2 ),
>> +	PURE_EL2_SYSREG( RVBAR_EL2 ),
>> +	PURE_EL2_SYSREG( RMR_EL2 ),
>> +	PURE_EL2_SYSREG( TPIDR_EL2 ),
>> +	PURE_EL2_SYSREG( CNTVOFF_EL2 ),
>> +	PURE_EL2_SYSREG( CNTHCTL_EL2 ),
>> +	PURE_EL2_SYSREG( HPFAR_EL2 ),
>> +	EL2_SYSREG(      SCTLR_EL2,  SCTLR_EL1,      translate_sctlr ),
>> +	EL2_SYSREG(      CPTR_EL2,   CPACR_EL1,      translate_cptr  ),
>> +	EL2_SYSREG(      TTBR0_EL2,  TTBR0_EL1,      translate_ttbr0 ),
>> +	EL2_SYSREG(      TTBR1_EL2,  TTBR1_EL1,      NULL            ),
>> +	EL2_SYSREG(      TCR_EL2,    TCR_EL1,        translate_tcr   ),
>> +	EL2_SYSREG(      VBAR_EL2,   VBAR_EL1,       NULL            ),
>> +	EL2_SYSREG(      AFSR0_EL2,  AFSR0_EL1,      NULL            ),
>> +	EL2_SYSREG(      AFSR1_EL2,  AFSR1_EL1,      NULL            ),
>> +	EL2_SYSREG(      ESR_EL2,    ESR_EL1,        NULL            ),
>> +	EL2_SYSREG(      FAR_EL2,    FAR_EL1,        NULL            ),
>> +	EL2_SYSREG(      MAIR_EL2,   MAIR_EL1,       NULL            ),
>> +	EL2_SYSREG(      AMAIR_EL2,  AMAIR_EL1,      NULL            ),
>> +};
>> +
>> +static
>> +const struct el2_sysreg_map *find_el2_sysreg(const struct el2_sysreg_map *map,
>> +					     int reg)
>> +{
>> +	const struct el2_sysreg_map *entry;
>> +
>> +	if (!sysreg_is_el2(reg))
>> +		return NULL;
>> +
>> +	entry = &nested_sysreg_map[reg - FIRST_EL2_SYSREG];
>> +	if (entry->sysreg == __INVALID_SYSREG__)
>> +		return NULL;
>> +
>> +	return entry;
>> +}
>> +
>>  u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
>>  {
>> +
>>  	if (!vcpu->arch.sysregs_loaded_on_cpu)
>>  		goto immediate_read;
>>  
>> +	if (unlikely(sysreg_is_el2(reg))) {
>> +		const struct el2_sysreg_map *el2_reg;
>> +
>> +		if (!is_hyp_ctxt(vcpu))
>> +			goto immediate_read;
>> +
>> +		el2_reg = find_el2_sysreg(nested_sysreg_map, reg);
>> +		if (el2_reg) {
>> +			/*
>> +			 * If this register does not have an EL1 counterpart,
>> +			 * then read the stored EL2 version.
>> +			 */
>> +			if (el2_reg->mapping == __INVALID_SYSREG__)
> In this patch, find_el2_sysreg returns NULL for PURE_EL2 registers. So
> for PURE_EL2, the access would go through the switch case. However this
> branch suggest that for PURE_EL2 register we intend to do the read from
> the memory backed version.
>
> Which should it be?
From my understanding of the code, find_el2_sysreg returns NULL when reg is not
an EL2 register or when the entry associated with reg in nested_sysreg_map is
zero (reg is not in the map).
>
>> +				goto immediate_read;
>> +
>> +			/* Get the current version of the EL1 counterpart. */
>> +			reg = el2_reg->mapping;
>> +		}
>> +	} else {
>> +		/* EL1 register can't be on the CPU if the guest is in vEL2. */
>> +		if (unlikely(is_hyp_ctxt(vcpu)))
>> +			goto immediate_read;
>> +	}
>> +
>>  	/*
>>  	 * System registers listed in the switch are not saved on every
>>  	 * exit from the guest but are only saved on vcpu_put.
>> @@ -114,6 +245,8 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
>>  	case DACR32_EL2:	return read_sysreg_s(SYS_DACR32_EL2);
>>  	case IFSR32_EL2:	return read_sysreg_s(SYS_IFSR32_EL2);
>>  	case DBGVCR32_EL2:	return read_sysreg_s(SYS_DBGVCR32_EL2);
>> +	case SP_EL2:		return read_sysreg(sp_el1);
>> +	case ELR_EL2:		return read_sysreg_el1(SYS_ELR);
>>  	}
>>  
>>  immediate_read:
>> @@ -125,6 +258,34 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
>>  	if (!vcpu->arch.sysregs_loaded_on_cpu)
>>  		goto immediate_write;
>>  
>> +	if (unlikely(sysreg_is_el2(reg))) {
>> +		const struct el2_sysreg_map *el2_reg;
>> +
>> +		if (!is_hyp_ctxt(vcpu))
>> +			goto immediate_write;
>> +
>> +		/* Store the EL2 version in the sysregs array. */
>> +		__vcpu_sys_reg(vcpu, reg) = val;
>> +
>> +		el2_reg = find_el2_sysreg(nested_sysreg_map, reg);
>> +		if (el2_reg) {
>> +			/* Does this register have an EL1 counterpart? */
>> +			if (el2_reg->mapping == __INVALID_SYSREG__)
>> +				return;
> As in the read case, this is never reached and we'll go through the
> switch case.
>
> Cheers,
>
Alexandru Elisei June 25, 2019, 3:18 p.m. UTC | #3
Hi Marc,

A question regarding this patch. This patch modifies vcpu_{read,write}_sys_reg
to handle virtual EL2 registers. However, the file kvm/emulate-nested.c added by
patch 10/59 "KVM: arm64: nv: Support virtual EL2 exceptions" already uses
vcpu_{read,write}_sys_reg to access EL2 registers. In my opinion, it doesn't
really matter which comes first because nested support is only enabled in the
last patch of the series, but I thought I should bring this up in case it is not
what you intended.

On 6/21/19 10:37 AM, Marc Zyngier wrote:
> From: Andre Przywara <andre.przywara@arm.com>
>
> KVM internally uses accessor functions when reading or writing the
> guest's system registers. This takes care of accessing either the stored
> copy or using the "live" EL1 system registers when the host uses VHE.
>
> With the introduction of virtual EL2 we add a bunch of EL2 system
> registers, which now must also be taken care of:
> - If the guest is running in vEL2, and we access an EL1 sysreg, we must
>   revert to the stored version of that, and not use the CPU's copy.
> - If the guest is running in vEL1, and we access an EL2 sysreg, we must
>   also use the stored version, since the CPU carries the EL1 copy.
> - Some EL2 system registers are supposed to affect the current execution
>   of the system, so we need to put them into their respective EL1
>   counterparts. For this we need to define a mapping between the two.
>   This is done using the newly introduced struct el2_sysreg_map.
> - Some EL2 system registers have a different format than their EL1
>   counterpart, so we need to translate them before writing them to the
>   CPU. This is done using an (optional) translate function in the map.
> - There are the three special registers SP_EL2, SPSR_EL2 and ELR_EL2,
>   which need some separate handling.
I see no change in this patch related to SPSR_EL2. Special handling of SPSR_EL2
is added in the next patch, patch 14/59 "KVM: arm64: nv: Handle SPSR_EL2 specially".
>
> All of these cases are now wrapped into the existing accessor functions,
> so KVM users wouldn't need to care whether they access EL2 or EL1
> registers and also which state the guest is in.
>
> This handles what was formerly known as the "shadow state" dynamically,
> without requiring a separate copy for each vCPU EL.
>
> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
> ---
>  arch/arm64/include/asm/kvm_emulate.h |   6 +
>  arch/arm64/include/asm/kvm_host.h    |   5 +
>  arch/arm64/kvm/sys_regs.c            | 163 +++++++++++++++++++++++++++
>  3 files changed, 174 insertions(+)
>
> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
> index c43aac5fed69..f37006b6eec4 100644
> --- a/arch/arm64/include/asm/kvm_emulate.h
> +++ b/arch/arm64/include/asm/kvm_emulate.h
> @@ -70,6 +70,12 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu);
>  int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2);
>  int kvm_inject_nested_irq(struct kvm_vcpu *vcpu);
>  
> +u64 translate_tcr(u64 tcr);
> +u64 translate_cptr(u64 tcr);
> +u64 translate_sctlr(u64 tcr);
> +u64 translate_ttbr0(u64 tcr);
> +u64 translate_cnthctl(u64 tcr);
> +
>  static inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
>  {
>  	return !(vcpu->arch.hcr_el2 & HCR_RW);
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 2d4290d2513a..dae9c42a7219 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -217,6 +217,11 @@ enum vcpu_sysreg {
>  	NR_SYS_REGS	/* Nothing after this line! */
>  };
>  
> +static inline bool sysreg_is_el2(int reg)
> +{
> +	return reg >= FIRST_EL2_SYSREG && reg < NR_SYS_REGS;
> +}
> +
>  /* 32bit mapping */
>  #define c0_MPIDR	(MPIDR_EL1 * 2)	/* MultiProcessor ID Register */
>  #define c0_CSSELR	(CSSELR_EL1 * 2)/* Cache Size Selection Register */
> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index 693dd063c9c2..d024114da162 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -76,11 +76,142 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu,
>  	return false;
>  }
>  
> +static u64 tcr_el2_ips_to_tcr_el1_ps(u64 tcr_el2)
The code seems to suggest that you are translating TCR_EL2.PS to TCR_EL1.IPS.
Perhaps the function should be named tcr_el2_ps_to_tcr_el1_ips?
> +{
> +	return ((tcr_el2 & TCR_EL2_PS_MASK) >> TCR_EL2_PS_SHIFT)
> +		<< TCR_IPS_SHIFT;
> +}
> +
> +u64 translate_tcr(u64 tcr)
> +{
> +	return TCR_EPD1_MASK |				/* disable TTBR1_EL1 */
> +	       ((tcr & TCR_EL2_TBI) ? TCR_TBI0 : 0) |
> +	       tcr_el2_ips_to_tcr_el1_ps(tcr) |
> +	       (tcr & TCR_EL2_TG0_MASK) |
> +	       (tcr & TCR_EL2_ORGN0_MASK) |
> +	       (tcr & TCR_EL2_IRGN0_MASK) |
> +	       (tcr & TCR_EL2_T0SZ_MASK);
> +}
> +
> +u64 translate_cptr(u64 cptr_el2)
The argument name is not consistent with the other translate_* functions. I
think it is reasonably obvious that you are translating an EL2 register.
> +{
> +	u64 cpacr_el1 = 0;
> +
> +	if (!(cptr_el2 & CPTR_EL2_TFP))
> +		cpacr_el1 |= CPACR_EL1_FPEN;
> +	if (cptr_el2 & CPTR_EL2_TTA)
> +		cpacr_el1 |= CPACR_EL1_TTA;
> +	if (!(cptr_el2 & CPTR_EL2_TZ))
> +		cpacr_el1 |= CPACR_EL1_ZEN;
> +
> +	return cpacr_el1;
> +}
> +
> +u64 translate_sctlr(u64 sctlr)
> +{
> +	/* Bit 20 is RES1 in SCTLR_EL1, but RES0 in SCTLR_EL2 */
> +	return sctlr | BIT(20);
> +}
> +
> +u64 translate_ttbr0(u64 ttbr0)
> +{
> +	/* Force ASID to 0 (ASID 0 or RES0) */
Are you forcing ASID to 0 because you are only expecting a non-vhe guest
hypervisor to access ttbr0_el2, in which case the architecture says that the
ASID field is RES0? Is it so unlikely that a vhe guest hypervisor will access
ttbr0_el2 directly that it's not worth adding a check for that?
> +	return ttbr0 & ~GENMASK_ULL(63, 48);
> +}
> +
> +u64 translate_cnthctl(u64 cnthctl)
> +{
> +	return ((cnthctl & 0x3) << 10) | (cnthctl & 0xfc);

Patch 16/59 "KVM: arm64: nv: Save/Restore vEL2 sysregs" suggests that you are
translating CNTHCTL to write it to CNTKCTL_EL1. Looking at ARM DDI 0487D.b,
CNTKCTL_EL1 has bits 63:10 RES0. I think the correct value should be ((cnthctl &
0x3) << 8) | (cnthctl & 0xfc).

> +}
> +
> +#define EL2_SYSREG(el2, el1, translate)	\
> +	[el2 - FIRST_EL2_SYSREG] = { el2, el1, translate }
> +#define PURE_EL2_SYSREG(el2) \
> +	[el2 - FIRST_EL2_SYSREG] = { el2,__INVALID_SYSREG__, NULL }
> +/*
> + * Associate vEL2 registers to their EL1 counterparts on the CPU.
> + * The translate function can be NULL, when the register layout is identical.
> + */
> +struct el2_sysreg_map {
> +	int sysreg;	/* EL2 register index into the array above */
> +	int mapping;	/* associated EL1 register */
> +	u64 (*translate)(u64 value);
> +} nested_sysreg_map[NR_SYS_REGS - FIRST_EL2_SYSREG] = {
> +	PURE_EL2_SYSREG( VPIDR_EL2 ),
> +	PURE_EL2_SYSREG( VMPIDR_EL2 ),
> +	PURE_EL2_SYSREG( ACTLR_EL2 ),
> +	PURE_EL2_SYSREG( HCR_EL2 ),
> +	PURE_EL2_SYSREG( MDCR_EL2 ),
> +	PURE_EL2_SYSREG( HSTR_EL2 ),
> +	PURE_EL2_SYSREG( HACR_EL2 ),
> +	PURE_EL2_SYSREG( VTTBR_EL2 ),
> +	PURE_EL2_SYSREG( VTCR_EL2 ),
> +	PURE_EL2_SYSREG( RVBAR_EL2 ),
> +	PURE_EL2_SYSREG( RMR_EL2 ),
> +	PURE_EL2_SYSREG( TPIDR_EL2 ),
> +	PURE_EL2_SYSREG( CNTVOFF_EL2 ),
> +	PURE_EL2_SYSREG( CNTHCTL_EL2 ),
> +	PURE_EL2_SYSREG( HPFAR_EL2 ),
> +	EL2_SYSREG(      SCTLR_EL2,  SCTLR_EL1,      translate_sctlr ),
> +	EL2_SYSREG(      CPTR_EL2,   CPACR_EL1,      translate_cptr  ),
> +	EL2_SYSREG(      TTBR0_EL2,  TTBR0_EL1,      translate_ttbr0 ),
> +	EL2_SYSREG(      TTBR1_EL2,  TTBR1_EL1,      NULL            ),
> +	EL2_SYSREG(      TCR_EL2,    TCR_EL1,        translate_tcr   ),
> +	EL2_SYSREG(      VBAR_EL2,   VBAR_EL1,       NULL            ),
> +	EL2_SYSREG(      AFSR0_EL2,  AFSR0_EL1,      NULL            ),
> +	EL2_SYSREG(      AFSR1_EL2,  AFSR1_EL1,      NULL            ),
> +	EL2_SYSREG(      ESR_EL2,    ESR_EL1,        NULL            ),
> +	EL2_SYSREG(      FAR_EL2,    FAR_EL1,        NULL            ),
> +	EL2_SYSREG(      MAIR_EL2,   MAIR_EL1,       NULL            ),
> +	EL2_SYSREG(      AMAIR_EL2,  AMAIR_EL1,      NULL            ),
> +};
Figuring out which registers are in this map and which aren't and are supposed
to be treated differently is really cumbersome because they are split into two
types of el2 registers and their order is different from the order in enum
vcpu_sysreg (in kvm_host.h). Perhaps adding a comment about what registers will
be treated differently would make the code a bit easier to follow?
> +
> +static
> +const struct el2_sysreg_map *find_el2_sysreg(const struct el2_sysreg_map *map,
> +					     int reg)
> +{
> +	const struct el2_sysreg_map *entry;
> +
> +	if (!sysreg_is_el2(reg))
> +		return NULL;
> +
> +	entry = &nested_sysreg_map[reg - FIRST_EL2_SYSREG];
> +	if (entry->sysreg == __INVALID_SYSREG__)
> +		return NULL;
> +
> +	return entry;
> +}
> +
>  u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
>  {
> +
>  	if (!vcpu->arch.sysregs_loaded_on_cpu)
>  		goto immediate_read;
>  
> +	if (unlikely(sysreg_is_el2(reg))) {
> +		const struct el2_sysreg_map *el2_reg;
> +
> +		if (!is_hyp_ctxt(vcpu))
> +			goto immediate_read;
I'm confused by this. is_hyp_ctxt returns false when the guest is not in vEL2
AND HCR_EL.E2H or HCR_EL2.TGE are not set. In this case, the NV bit will not be
set and the hardware will raise an undefined instruction exception when
accessing an EL2 register from EL1. What am I missing?
> +
> +		el2_reg = find_el2_sysreg(nested_sysreg_map, reg);
> +		if (el2_reg) {
> +			/*
> +			 * If this register does not have an EL1 counterpart,
> +			 * then read the stored EL2 version.
> +			 */
> +			if (el2_reg->mapping == __INVALID_SYSREG__)
> +				goto immediate_read;
> +
> +			/* Get the current version of the EL1 counterpart. */
> +			reg = el2_reg->mapping;
> +		}
> +	} else {
> +		/* EL1 register can't be on the CPU if the guest is in vEL2. */
> +		if (unlikely(is_hyp_ctxt(vcpu)))
> +			goto immediate_read;
> +	}
> +
>  	/*
>  	 * System registers listed in the switch are not saved on every
>  	 * exit from the guest but are only saved on vcpu_put.
> @@ -114,6 +245,8 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
>  	case DACR32_EL2:	return read_sysreg_s(SYS_DACR32_EL2);
>  	case IFSR32_EL2:	return read_sysreg_s(SYS_IFSR32_EL2);
>  	case DBGVCR32_EL2:	return read_sysreg_s(SYS_DBGVCR32_EL2);
> +	case SP_EL2:		return read_sysreg(sp_el1);
> +	case ELR_EL2:		return read_sysreg_el1(SYS_ELR);
>  	}
>  
>  immediate_read:
> @@ -125,6 +258,34 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
>  	if (!vcpu->arch.sysregs_loaded_on_cpu)
>  		goto immediate_write;
>  
> +	if (unlikely(sysreg_is_el2(reg))) {
> +		const struct el2_sysreg_map *el2_reg;
> +
> +		if (!is_hyp_ctxt(vcpu))
> +			goto immediate_write;
> +
> +		/* Store the EL2 version in the sysregs array. */
> +		__vcpu_sys_reg(vcpu, reg) = val;
> +
> +		el2_reg = find_el2_sysreg(nested_sysreg_map, reg);
> +		if (el2_reg) {
> +			/* Does this register have an EL1 counterpart? */
> +			if (el2_reg->mapping == __INVALID_SYSREG__)
> +				return;
> +
> +			if (!vcpu_el2_e2h_is_set(vcpu) &&
> +			    el2_reg->translate)
> +				val = el2_reg->translate(val);
> +
> +			/* Redirect this to the EL1 version of the register. */
> +			reg = el2_reg->mapping;
> +		}
> +	} else {
> +		/* EL1 register can't be on the CPU if the guest is in vEL2. */
> +		if (unlikely(is_hyp_ctxt(vcpu)))
> +			goto immediate_write;
> +	}
> +
>  	/*
>  	 * System registers listed in the switch are not restored on every
>  	 * entry to the guest but are only restored on vcpu_load.
> @@ -157,6 +318,8 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
>  	case DACR32_EL2:	write_sysreg_s(val, SYS_DACR32_EL2);	return;
>  	case IFSR32_EL2:	write_sysreg_s(val, SYS_IFSR32_EL2);	return;
>  	case DBGVCR32_EL2:	write_sysreg_s(val, SYS_DBGVCR32_EL2);	return;
> +	case SP_EL2:		write_sysreg(val, sp_el1);		return;
> +	case ELR_EL2:		write_sysreg_el1(val, SYS_ELR);		return;
>  	}
>  
>  immediate_write:
Alexandru Elisei June 26, 2019, 3:04 p.m. UTC | #4
On 6/21/19 10:37 AM, Marc Zyngier wrote:
> From: Andre Przywara <andre.przywara@arm.com>
>
> KVM internally uses accessor functions when reading or writing the
> guest's system registers. This takes care of accessing either the stored
> copy or using the "live" EL1 system registers when the host uses VHE.
>
> With the introduction of virtual EL2 we add a bunch of EL2 system
> registers, which now must also be taken care of:
> - If the guest is running in vEL2, and we access an EL1 sysreg, we must
>   revert to the stored version of that, and not use the CPU's copy.
> - If the guest is running in vEL1, and we access an EL2 sysreg, we must
>   also use the stored version, since the CPU carries the EL1 copy.
> - Some EL2 system registers are supposed to affect the current execution
>   of the system, so we need to put them into their respective EL1
>   counterparts. For this we need to define a mapping between the two.
>   This is done using the newly introduced struct el2_sysreg_map.
> - Some EL2 system registers have a different format than their EL1
>   counterpart, so we need to translate them before writing them to the
>   CPU. This is done using an (optional) translate function in the map.
> - There are the three special registers SP_EL2, SPSR_EL2 and ELR_EL2,
>   which need some separate handling.
>
> All of these cases are now wrapped into the existing accessor functions,
> so KVM users wouldn't need to care whether they access EL2 or EL1
> registers and also which state the guest is in.
>
> This handles what was formerly known as the "shadow state" dynamically,
> without requiring a separate copy for each vCPU EL.
>
> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
> ---
>  arch/arm64/include/asm/kvm_emulate.h |   6 +
>  arch/arm64/include/asm/kvm_host.h    |   5 +
>  arch/arm64/kvm/sys_regs.c            | 163 +++++++++++++++++++++++++++
>  3 files changed, 174 insertions(+)
>
> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
> index c43aac5fed69..f37006b6eec4 100644
> --- a/arch/arm64/include/asm/kvm_emulate.h
> +++ b/arch/arm64/include/asm/kvm_emulate.h
> @@ -70,6 +70,12 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu);
>  int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2);
>  int kvm_inject_nested_irq(struct kvm_vcpu *vcpu);
>  
> +u64 translate_tcr(u64 tcr);
> +u64 translate_cptr(u64 tcr);
> +u64 translate_sctlr(u64 tcr);
> +u64 translate_ttbr0(u64 tcr);
> +u64 translate_cnthctl(u64 tcr);
> +
>  static inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
>  {
>  	return !(vcpu->arch.hcr_el2 & HCR_RW);
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 2d4290d2513a..dae9c42a7219 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -217,6 +217,11 @@ enum vcpu_sysreg {
>  	NR_SYS_REGS	/* Nothing after this line! */
>  };
>  
> +static inline bool sysreg_is_el2(int reg)
> +{
> +	return reg >= FIRST_EL2_SYSREG && reg < NR_SYS_REGS;
> +}
> +
>  /* 32bit mapping */
>  #define c0_MPIDR	(MPIDR_EL1 * 2)	/* MultiProcessor ID Register */
>  #define c0_CSSELR	(CSSELR_EL1 * 2)/* Cache Size Selection Register */
> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index 693dd063c9c2..d024114da162 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -76,11 +76,142 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu,
>  	return false;
>  }
>  
> +static u64 tcr_el2_ips_to_tcr_el1_ps(u64 tcr_el2)
> +{
> +	return ((tcr_el2 & TCR_EL2_PS_MASK) >> TCR_EL2_PS_SHIFT)
> +		<< TCR_IPS_SHIFT;
> +}
> +
> +u64 translate_tcr(u64 tcr)
> +{
> +	return TCR_EPD1_MASK |				/* disable TTBR1_EL1 */
> +	       ((tcr & TCR_EL2_TBI) ? TCR_TBI0 : 0) |
> +	       tcr_el2_ips_to_tcr_el1_ps(tcr) |
> +	       (tcr & TCR_EL2_TG0_MASK) |
> +	       (tcr & TCR_EL2_ORGN0_MASK) |
> +	       (tcr & TCR_EL2_IRGN0_MASK) |
> +	       (tcr & TCR_EL2_T0SZ_MASK);
> +}
> +
> +u64 translate_cptr(u64 cptr_el2)
> +{
> +	u64 cpacr_el1 = 0;
> +
> +	if (!(cptr_el2 & CPTR_EL2_TFP))
> +		cpacr_el1 |= CPACR_EL1_FPEN;
> +	if (cptr_el2 & CPTR_EL2_TTA)
> +		cpacr_el1 |= CPACR_EL1_TTA;
> +	if (!(cptr_el2 & CPTR_EL2_TZ))
> +		cpacr_el1 |= CPACR_EL1_ZEN;
> +
> +	return cpacr_el1;
> +}
> +
> +u64 translate_sctlr(u64 sctlr)
> +{
> +	/* Bit 20 is RES1 in SCTLR_EL1, but RES0 in SCTLR_EL2 */
> +	return sctlr | BIT(20);
> +}
> +
> +u64 translate_ttbr0(u64 ttbr0)
> +{
> +	/* Force ASID to 0 (ASID 0 or RES0) */
> +	return ttbr0 & ~GENMASK_ULL(63, 48);
> +}
> +
> +u64 translate_cnthctl(u64 cnthctl)
> +{
> +	return ((cnthctl & 0x3) << 10) | (cnthctl & 0xfc);
> +}
> +
> +#define EL2_SYSREG(el2, el1, translate)	\
> +	[el2 - FIRST_EL2_SYSREG] = { el2, el1, translate }
> +#define PURE_EL2_SYSREG(el2) \
> +	[el2 - FIRST_EL2_SYSREG] = { el2,__INVALID_SYSREG__, NULL }
> +/*
> + * Associate vEL2 registers to their EL1 counterparts on the CPU.
> + * The translate function can be NULL, when the register layout is identical.
> + */
> +struct el2_sysreg_map {
> +	int sysreg;	/* EL2 register index into the array above */
> +	int mapping;	/* associated EL1 register */
> +	u64 (*translate)(u64 value);
> +} nested_sysreg_map[NR_SYS_REGS - FIRST_EL2_SYSREG] = {
> +	PURE_EL2_SYSREG( VPIDR_EL2 ),
> +	PURE_EL2_SYSREG( VMPIDR_EL2 ),
> +	PURE_EL2_SYSREG( ACTLR_EL2 ),
> +	PURE_EL2_SYSREG( HCR_EL2 ),
> +	PURE_EL2_SYSREG( MDCR_EL2 ),
> +	PURE_EL2_SYSREG( HSTR_EL2 ),
> +	PURE_EL2_SYSREG( HACR_EL2 ),
> +	PURE_EL2_SYSREG( VTTBR_EL2 ),
> +	PURE_EL2_SYSREG( VTCR_EL2 ),
> +	PURE_EL2_SYSREG( RVBAR_EL2 ),
> +	PURE_EL2_SYSREG( RMR_EL2 ),
> +	PURE_EL2_SYSREG( TPIDR_EL2 ),
> +	PURE_EL2_SYSREG( CNTVOFF_EL2 ),
> +	PURE_EL2_SYSREG( CNTHCTL_EL2 ),
> +	PURE_EL2_SYSREG( HPFAR_EL2 ),
> +	EL2_SYSREG(      SCTLR_EL2,  SCTLR_EL1,      translate_sctlr ),
> +	EL2_SYSREG(      CPTR_EL2,   CPACR_EL1,      translate_cptr  ),
> +	EL2_SYSREG(      TTBR0_EL2,  TTBR0_EL1,      translate_ttbr0 ),
> +	EL2_SYSREG(      TTBR1_EL2,  TTBR1_EL1,      NULL            ),
> +	EL2_SYSREG(      TCR_EL2,    TCR_EL1,        translate_tcr   ),
> +	EL2_SYSREG(      VBAR_EL2,   VBAR_EL1,       NULL            ),
> +	EL2_SYSREG(      AFSR0_EL2,  AFSR0_EL1,      NULL            ),
> +	EL2_SYSREG(      AFSR1_EL2,  AFSR1_EL1,      NULL            ),
> +	EL2_SYSREG(      ESR_EL2,    ESR_EL1,        NULL            ),
> +	EL2_SYSREG(      FAR_EL2,    FAR_EL1,        NULL            ),
> +	EL2_SYSREG(      MAIR_EL2,   MAIR_EL1,       NULL            ),
> +	EL2_SYSREG(      AMAIR_EL2,  AMAIR_EL1,      NULL            ),
> +};
> +
> +static
> +const struct el2_sysreg_map *find_el2_sysreg(const struct el2_sysreg_map *map,
> +					     int reg)
> +{
> +	const struct el2_sysreg_map *entry;
> +
> +	if (!sysreg_is_el2(reg))
> +		return NULL;
> +
> +	entry = &nested_sysreg_map[reg - FIRST_EL2_SYSREG];
> +	if (entry->sysreg == __INVALID_SYSREG__)
> +		return NULL;
> +
> +	return entry;
> +}
> +
>  u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
>  {
> +
>  	if (!vcpu->arch.sysregs_loaded_on_cpu)
>  		goto immediate_read;
>  
> +	if (unlikely(sysreg_is_el2(reg))) {
> +		const struct el2_sysreg_map *el2_reg;
> +
> +		if (!is_hyp_ctxt(vcpu))
> +			goto immediate_read;
> +
> +		el2_reg = find_el2_sysreg(nested_sysreg_map, reg);
> +		if (el2_reg) {
> +			/*
> +			 * If this register does not have an EL1 counterpart,
> +			 * then read the stored EL2 version.
> +			 */
> +			if (el2_reg->mapping == __INVALID_SYSREG__)
> +				goto immediate_read;
> +
> +			/* Get the current version of the EL1 counterpart. */
> +			reg = el2_reg->mapping;
> +		}
> +	} else {
> +		/* EL1 register can't be on the CPU if the guest is in vEL2. */
> +		if (unlikely(is_hyp_ctxt(vcpu)))
> +			goto immediate_read;
> +	}
> +
>  	/*
>  	 * System registers listed in the switch are not saved on every
>  	 * exit from the guest but are only saved on vcpu_put.
> @@ -114,6 +245,8 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
>  	case DACR32_EL2:	return read_sysreg_s(SYS_DACR32_EL2);
>  	case IFSR32_EL2:	return read_sysreg_s(SYS_IFSR32_EL2);
>  	case DBGVCR32_EL2:	return read_sysreg_s(SYS_DBGVCR32_EL2);
> +	case SP_EL2:		return read_sysreg(sp_el1);
From ARM DDI 0487D.b, section Behavior when HCR_EL2.NV == 1: "Reads or writes to
any allocated and implemented System register or Special-purpose register named
*_EL2, *_EL02, or *_EL12 in the MRS or MSR instruction, other than SP_EL2, are
trapped to EL2 rather than being UNDEFINED" (page D5-2480). My interpretation of
the text is that attempted reads of SP_EL2 from virtual EL2 cause an undefined
instruction exception.
> +	case ELR_EL2:		return read_sysreg_el1(SYS_ELR);
>  	}
>  
>  immediate_read:
> @@ -125,6 +258,34 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
>  	if (!vcpu->arch.sysregs_loaded_on_cpu)
>  		goto immediate_write;
>  
> +	if (unlikely(sysreg_is_el2(reg))) {
> +		const struct el2_sysreg_map *el2_reg;
> +
> +		if (!is_hyp_ctxt(vcpu))
> +			goto immediate_write;
> +
> +		/* Store the EL2 version in the sysregs array. */
> +		__vcpu_sys_reg(vcpu, reg) = val;
> +
> +		el2_reg = find_el2_sysreg(nested_sysreg_map, reg);
> +		if (el2_reg) {
> +			/* Does this register have an EL1 counterpart? */
> +			if (el2_reg->mapping == __INVALID_SYSREG__)
> +				return;
> +
> +			if (!vcpu_el2_e2h_is_set(vcpu) &&
> +			    el2_reg->translate)
> +				val = el2_reg->translate(val);
> +
> +			/* Redirect this to the EL1 version of the register. */
> +			reg = el2_reg->mapping;
> +		}
> +	} else {
> +		/* EL1 register can't be on the CPU if the guest is in vEL2. */
> +		if (unlikely(is_hyp_ctxt(vcpu)))
> +			goto immediate_write;
> +	}
> +
>  	/*
>  	 * System registers listed in the switch are not restored on every
>  	 * entry to the guest but are only restored on vcpu_load.
> @@ -157,6 +318,8 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
>  	case DACR32_EL2:	write_sysreg_s(val, SYS_DACR32_EL2);	return;
>  	case IFSR32_EL2:	write_sysreg_s(val, SYS_IFSR32_EL2);	return;
>  	case DBGVCR32_EL2:	write_sysreg_s(val, SYS_DBGVCR32_EL2);	return;
> +	case SP_EL2:		write_sysreg(val, sp_el1);		return;
> +	case ELR_EL2:		write_sysreg_el1(val, SYS_ELR);		return;
>  	}
>  
>  immediate_write:
Alexandru Elisei July 1, 2019, 9:58 a.m. UTC | #5
On 6/25/19 4:18 PM, Alexandru Elisei wrote:
> Hi Marc,
>
> A question regarding this patch. This patch modifies vcpu_{read,write}_sys_reg
> to handle virtual EL2 registers. However, the file kvm/emulate-nested.c added by
> patch 10/59 "KVM: arm64: nv: Support virtual EL2 exceptions" already uses
> vcpu_{read,write}_sys_reg to access EL2 registers. In my opinion, it doesn't
> really matter which comes first because nested support is only enabled in the
> last patch of the series, but I thought I should bring this up in case it is not
> what you intended.
>
> On 6/21/19 10:37 AM, Marc Zyngier wrote:
>> From: Andre Przywara <andre.przywara@arm.com>
>>
>> KVM internally uses accessor functions when reading or writing the
>> guest's system registers. This takes care of accessing either the stored
>> copy or using the "live" EL1 system registers when the host uses VHE.
>>
>> With the introduction of virtual EL2 we add a bunch of EL2 system
>> registers, which now must also be taken care of:
>> - If the guest is running in vEL2, and we access an EL1 sysreg, we must
>>   revert to the stored version of that, and not use the CPU's copy.
>> - If the guest is running in vEL1, and we access an EL2 sysreg, we must
>>   also use the stored version, since the CPU carries the EL1 copy.
>> - Some EL2 system registers are supposed to affect the current execution
>>   of the system, so we need to put them into their respective EL1
>>   counterparts. For this we need to define a mapping between the two.
>>   This is done using the newly introduced struct el2_sysreg_map.
>> - Some EL2 system registers have a different format than their EL1
>>   counterpart, so we need to translate them before writing them to the
>>   CPU. This is done using an (optional) translate function in the map.
>> - There are the three special registers SP_EL2, SPSR_EL2 and ELR_EL2,
>>   which need some separate handling.
> I see no change in this patch related to SPSR_EL2. Special handling of SPSR_EL2
> is added in the next patch, patch 14/59 "KVM: arm64: nv: Handle SPSR_EL2 specially".
>> All of these cases are now wrapped into the existing accessor functions,
>> so KVM users wouldn't need to care whether they access EL2 or EL1
>> registers and also which state the guest is in.
>>
>> This handles what was formerly known as the "shadow state" dynamically,
>> without requiring a separate copy for each vCPU EL.
>>
>> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
>> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
>> ---
>>  arch/arm64/include/asm/kvm_emulate.h |   6 +
>>  arch/arm64/include/asm/kvm_host.h    |   5 +
>>  arch/arm64/kvm/sys_regs.c            | 163 +++++++++++++++++++++++++++
>>  3 files changed, 174 insertions(+)
>>
>> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
>> index c43aac5fed69..f37006b6eec4 100644
>> --- a/arch/arm64/include/asm/kvm_emulate.h
>> +++ b/arch/arm64/include/asm/kvm_emulate.h
>> @@ -70,6 +70,12 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu);
>>  int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2);
>>  int kvm_inject_nested_irq(struct kvm_vcpu *vcpu);
>>  
>> +u64 translate_tcr(u64 tcr);
>> +u64 translate_cptr(u64 tcr);
>> +u64 translate_sctlr(u64 tcr);
>> +u64 translate_ttbr0(u64 tcr);
>> +u64 translate_cnthctl(u64 tcr);
>> +
>>  static inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
>>  {
>>  	return !(vcpu->arch.hcr_el2 & HCR_RW);
>> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
>> index 2d4290d2513a..dae9c42a7219 100644
>> --- a/arch/arm64/include/asm/kvm_host.h
>> +++ b/arch/arm64/include/asm/kvm_host.h
>> @@ -217,6 +217,11 @@ enum vcpu_sysreg {
>>  	NR_SYS_REGS	/* Nothing after this line! */
>>  };
>>  
>> +static inline bool sysreg_is_el2(int reg)
>> +{
>> +	return reg >= FIRST_EL2_SYSREG && reg < NR_SYS_REGS;
>> +}
>> +
>>  /* 32bit mapping */
>>  #define c0_MPIDR	(MPIDR_EL1 * 2)	/* MultiProcessor ID Register */
>>  #define c0_CSSELR	(CSSELR_EL1 * 2)/* Cache Size Selection Register */
>> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
>> index 693dd063c9c2..d024114da162 100644
>> --- a/arch/arm64/kvm/sys_regs.c
>> +++ b/arch/arm64/kvm/sys_regs.c
>> @@ -76,11 +76,142 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu,
>>  	return false;
>>  }
>>  
>> +static u64 tcr_el2_ips_to_tcr_el1_ps(u64 tcr_el2)
> The code seems to suggest that you are translating TCR_EL2.PS to TCR_EL1.IPS.
> Perhaps the function should be named tcr_el2_ps_to_tcr_el1_ips?
>> +{
>> +	return ((tcr_el2 & TCR_EL2_PS_MASK) >> TCR_EL2_PS_SHIFT)
>> +		<< TCR_IPS_SHIFT;
>> +}
>> +
>> +u64 translate_tcr(u64 tcr)
>> +{
>> +	return TCR_EPD1_MASK |				/* disable TTBR1_EL1 */
>> +	       ((tcr & TCR_EL2_TBI) ? TCR_TBI0 : 0) |
>> +	       tcr_el2_ips_to_tcr_el1_ps(tcr) |
>> +	       (tcr & TCR_EL2_TG0_MASK) |
>> +	       (tcr & TCR_EL2_ORGN0_MASK) |
>> +	       (tcr & TCR_EL2_IRGN0_MASK) |
>> +	       (tcr & TCR_EL2_T0SZ_MASK);
>> +}
>> +
>> +u64 translate_cptr(u64 cptr_el2)
> The argument name is not consistent with the other translate_* functions. I
> think it is reasonably obvious that you are translating an EL2 register.
>> +{
>> +	u64 cpacr_el1 = 0;
>> +
>> +	if (!(cptr_el2 & CPTR_EL2_TFP))
>> +		cpacr_el1 |= CPACR_EL1_FPEN;
>> +	if (cptr_el2 & CPTR_EL2_TTA)
>> +		cpacr_el1 |= CPACR_EL1_TTA;
>> +	if (!(cptr_el2 & CPTR_EL2_TZ))
>> +		cpacr_el1 |= CPACR_EL1_ZEN;
>> +
>> +	return cpacr_el1;
>> +}
>> +
>> +u64 translate_sctlr(u64 sctlr)
>> +{
>> +	/* Bit 20 is RES1 in SCTLR_EL1, but RES0 in SCTLR_EL2 */
>> +	return sctlr | BIT(20);
>> +}
>> +
>> +u64 translate_ttbr0(u64 ttbr0)
>> +{
>> +	/* Force ASID to 0 (ASID 0 or RES0) */
> Are you forcing ASID to 0 because you are only expecting a non-vhe guest
> hypervisor to access ttbr0_el2, in which case the architecture says that the
> ASID field is RES0? Is it so unlikely that a vhe guest hypervisor will access
> ttbr0_el2 directly that it's not worth adding a check for that?

My mistake, obviously the translate functions are used only when VHE is
disabled, because when E2H is set, they have the same format as their EL1
counterparts.

Sorry for the noise,

Alex
Alexandru Elisei July 1, 2019, 12:10 p.m. UTC | #6
On 6/21/19 10:37 AM, Marc Zyngier wrote:
> From: Andre Przywara <andre.przywara@arm.com>
>
> KVM internally uses accessor functions when reading or writing the
> guest's system registers. This takes care of accessing either the stored
> copy or using the "live" EL1 system registers when the host uses VHE.
>
> With the introduction of virtual EL2 we add a bunch of EL2 system
> registers, which now must also be taken care of:
> - If the guest is running in vEL2, and we access an EL1 sysreg, we must
>   revert to the stored version of that, and not use the CPU's copy.
> - If the guest is running in vEL1, and we access an EL2 sysreg, we must
>   also use the stored version, since the CPU carries the EL1 copy.
> - Some EL2 system registers are supposed to affect the current execution
>   of the system, so we need to put them into their respective EL1
>   counterparts. For this we need to define a mapping between the two.
>   This is done using the newly introduced struct el2_sysreg_map.
> - Some EL2 system registers have a different format than their EL1
>   counterpart, so we need to translate them before writing them to the
>   CPU. This is done using an (optional) translate function in the map.
> - There are the three special registers SP_EL2, SPSR_EL2 and ELR_EL2,
>   which need some separate handling.
>
> All of these cases are now wrapped into the existing accessor functions,
> so KVM users wouldn't need to care whether they access EL2 or EL1
> registers and also which state the guest is in.
>
> This handles what was formerly known as the "shadow state" dynamically,
> without requiring a separate copy for each vCPU EL.
>
> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
> ---
>  arch/arm64/include/asm/kvm_emulate.h |   6 +
>  arch/arm64/include/asm/kvm_host.h    |   5 +
>  arch/arm64/kvm/sys_regs.c            | 163 +++++++++++++++++++++++++++
>  3 files changed, 174 insertions(+)
>
> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
> index c43aac5fed69..f37006b6eec4 100644
> --- a/arch/arm64/include/asm/kvm_emulate.h
> +++ b/arch/arm64/include/asm/kvm_emulate.h
> @@ -70,6 +70,12 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu);
>  int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2);
>  int kvm_inject_nested_irq(struct kvm_vcpu *vcpu);
>  
> +u64 translate_tcr(u64 tcr);
> +u64 translate_cptr(u64 tcr);
> +u64 translate_sctlr(u64 tcr);
> +u64 translate_ttbr0(u64 tcr);
> +u64 translate_cnthctl(u64 tcr);
> +
>  static inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
>  {
>  	return !(vcpu->arch.hcr_el2 & HCR_RW);
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 2d4290d2513a..dae9c42a7219 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -217,6 +217,11 @@ enum vcpu_sysreg {
>  	NR_SYS_REGS	/* Nothing after this line! */
>  };
>  
> +static inline bool sysreg_is_el2(int reg)
> +{
> +	return reg >= FIRST_EL2_SYSREG && reg < NR_SYS_REGS;
> +}
> +
>  /* 32bit mapping */
>  #define c0_MPIDR	(MPIDR_EL1 * 2)	/* MultiProcessor ID Register */
>  #define c0_CSSELR	(CSSELR_EL1 * 2)/* Cache Size Selection Register */
> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index 693dd063c9c2..d024114da162 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -76,11 +76,142 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu,
>  	return false;
>  }
>  
> +static u64 tcr_el2_ips_to_tcr_el1_ps(u64 tcr_el2)
> +{
> +	return ((tcr_el2 & TCR_EL2_PS_MASK) >> TCR_EL2_PS_SHIFT)
> +		<< TCR_IPS_SHIFT;
> +}
> +
> +u64 translate_tcr(u64 tcr)
> +{
> +	return TCR_EPD1_MASK |				/* disable TTBR1_EL1 */
> +	       ((tcr & TCR_EL2_TBI) ? TCR_TBI0 : 0) |
> +	       tcr_el2_ips_to_tcr_el1_ps(tcr) |
> +	       (tcr & TCR_EL2_TG0_MASK) |
> +	       (tcr & TCR_EL2_ORGN0_MASK) |
> +	       (tcr & TCR_EL2_IRGN0_MASK) |
> +	       (tcr & TCR_EL2_T0SZ_MASK);
> +}
> +
> +u64 translate_cptr(u64 cptr_el2)
> +{
> +	u64 cpacr_el1 = 0;
> +
> +	if (!(cptr_el2 & CPTR_EL2_TFP))
> +		cpacr_el1 |= CPACR_EL1_FPEN;
> +	if (cptr_el2 & CPTR_EL2_TTA)
> +		cpacr_el1 |= CPACR_EL1_TTA;
> +	if (!(cptr_el2 & CPTR_EL2_TZ))
> +		cpacr_el1 |= CPACR_EL1_ZEN;
> +
> +	return cpacr_el1;
> +}
> +
> +u64 translate_sctlr(u64 sctlr)
> +{
> +	/* Bit 20 is RES1 in SCTLR_EL1, but RES0 in SCTLR_EL2 */
> +	return sctlr | BIT(20);
> +}
> +
> +u64 translate_ttbr0(u64 ttbr0)
> +{
> +	/* Force ASID to 0 (ASID 0 or RES0) */
> +	return ttbr0 & ~GENMASK_ULL(63, 48);
> +}
> +
> +u64 translate_cnthctl(u64 cnthctl)
> +{
> +	return ((cnthctl & 0x3) << 10) | (cnthctl & 0xfc);
> +}
> +
> +#define EL2_SYSREG(el2, el1, translate)	\
> +	[el2 - FIRST_EL2_SYSREG] = { el2, el1, translate }
> +#define PURE_EL2_SYSREG(el2) \
> +	[el2 - FIRST_EL2_SYSREG] = { el2,__INVALID_SYSREG__, NULL }
> +/*
> + * Associate vEL2 registers to their EL1 counterparts on the CPU.
> + * The translate function can be NULL, when the register layout is identical.
> + */
> +struct el2_sysreg_map {
> +	int sysreg;	/* EL2 register index into the array above */
> +	int mapping;	/* associated EL1 register */
> +	u64 (*translate)(u64 value);
> +} nested_sysreg_map[NR_SYS_REGS - FIRST_EL2_SYSREG] = {
> +	PURE_EL2_SYSREG( VPIDR_EL2 ),
> +	PURE_EL2_SYSREG( VMPIDR_EL2 ),
> +	PURE_EL2_SYSREG( ACTLR_EL2 ),
> +	PURE_EL2_SYSREG( HCR_EL2 ),
> +	PURE_EL2_SYSREG( MDCR_EL2 ),
> +	PURE_EL2_SYSREG( HSTR_EL2 ),
> +	PURE_EL2_SYSREG( HACR_EL2 ),
> +	PURE_EL2_SYSREG( VTTBR_EL2 ),
> +	PURE_EL2_SYSREG( VTCR_EL2 ),
> +	PURE_EL2_SYSREG( RVBAR_EL2 ),
> +	PURE_EL2_SYSREG( RMR_EL2 ),
> +	PURE_EL2_SYSREG( TPIDR_EL2 ),
> +	PURE_EL2_SYSREG( CNTVOFF_EL2 ),
> +	PURE_EL2_SYSREG( CNTHCTL_EL2 ),
I don't think having CNTHCTL_EL2 as a "pure" EL2 register is the right approach.
More details below.
> +	PURE_EL2_SYSREG( HPFAR_EL2 ),
> +	EL2_SYSREG(      SCTLR_EL2,  SCTLR_EL1,      translate_sctlr ),
> +	EL2_SYSREG(      CPTR_EL2,   CPACR_EL1,      translate_cptr  ),
> +	EL2_SYSREG(      TTBR0_EL2,  TTBR0_EL1,      translate_ttbr0 ),
> +	EL2_SYSREG(      TTBR1_EL2,  TTBR1_EL1,      NULL            ),
> +	EL2_SYSREG(      TCR_EL2,    TCR_EL1,        translate_tcr   ),
> +	EL2_SYSREG(      VBAR_EL2,   VBAR_EL1,       NULL            ),
> +	EL2_SYSREG(      AFSR0_EL2,  AFSR0_EL1,      NULL            ),
> +	EL2_SYSREG(      AFSR1_EL2,  AFSR1_EL1,      NULL            ),
> +	EL2_SYSREG(      ESR_EL2,    ESR_EL1,        NULL            ),
> +	EL2_SYSREG(      FAR_EL2,    FAR_EL1,        NULL            ),
> +	EL2_SYSREG(      MAIR_EL2,   MAIR_EL1,       NULL            ),
> +	EL2_SYSREG(      AMAIR_EL2,  AMAIR_EL1,      NULL            ),
> +};
> +
> +static
> +const struct el2_sysreg_map *find_el2_sysreg(const struct el2_sysreg_map *map,
> +					     int reg)
> +{
> +	const struct el2_sysreg_map *entry;
> +
> +	if (!sysreg_is_el2(reg))
> +		return NULL;
> +
> +	entry = &nested_sysreg_map[reg - FIRST_EL2_SYSREG];
> +	if (entry->sysreg == __INVALID_SYSREG__)
> +		return NULL;
> +
> +	return entry;
> +}
> +
>  u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
>  {
> +
>  	if (!vcpu->arch.sysregs_loaded_on_cpu)
>  		goto immediate_read;
>  
> +	if (unlikely(sysreg_is_el2(reg))) {
> +		const struct el2_sysreg_map *el2_reg;
> +
> +		if (!is_hyp_ctxt(vcpu))
> +			goto immediate_read;
> +
> +		el2_reg = find_el2_sysreg(nested_sysreg_map, reg);
> +		if (el2_reg) {
> +			/*
> +			 * If this register does not have an EL1 counterpart,
> +			 * then read the stored EL2 version.
> +			 */
> +			if (el2_reg->mapping == __INVALID_SYSREG__)
> +				goto immediate_read;

With CNTHCTL_EL2 as a "pure" EL2 register, reads (and writes, in
vcpu_write_sys_reg) will go to memory. However, when vhe is enabled, CNTHCTL_EL2
has the same format as CNTKCTL_EL1 and reads/writes to CNTKCTL_EL1 should be
reflected in the value of CNTHCTL_EL2 according to the pseudocode for accessing
CNTKCTL_EL1 (ARM DDI 0487D.b, page D12-3496). This doesn't happen for vhe guest
hypervisors because EL2 is declared as a "pure" EL2 register.

I have tested that with this hack for reads (function chosen at random):

diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 04e554cae3a2..3a6260745680 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -653,8 +653,22 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu)
  */
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
+       u64 cntkctl, cnthctl;
        int ret;
 
+       /* Check that CNTKCTL_EL1 writes are redirected to CNTHCTL_EL2 */
+       if (has_vhe()) {
+               /* Check that CNTKCTL_EL1 reads are redirected to CNTHCTL_EL2 */
+               cntkctl = read_sysreg(cntkctl_el1);
+               cnthctl = cntkctl ^ 1;
+               write_sysreg(cnthctl, cnthctl_el2);
+               cntkctl = read_sysreg(cntkctl_el1);
+               BUG_ON(cntkctl != cnthctl);
+               /* Restore original value */
+               cnthctl ^= 1;
+               write_sysreg(cnthctl, cnthctl_el2);
+       }
+
        if (unlikely(!kvm_vcpu_initialized(vcpu)))
                return -ENOEXEC;

and this hack for writes:

diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 04e554cae3a2..1cfe47b6fa99 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -653,8 +653,21 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu)
  */
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
+       u64 cntkctl, cnthctl;
        int ret;
 
+       /* Check that CNTKCTL_EL1 writes are redirected to CNTHCTL_EL2 */
+       if (has_vhe()) {
+               cntkctl = read_sysreg(cntkctl_el1);
+               cntkctl ^= 1;
+               write_sysreg(cntkctl, cntkctl_el1);
+               cnthctl = read_sysreg(cnthctl_el2);
+               BUG_ON(cntkctl != cnthctl);
+               /* Restore original value */
+               cntkctl ^= 1;
+               write_sysreg(cntkctl, cntkctl_el1);
+       }
+
        if (unlikely(!kvm_vcpu_initialized(vcpu)))
                return -ENOEXEC;

The BUG_ON is not triggered on baremetal, but is triggered when running as a L1
guest hypervisor.

Another issue with CNTHCTL_EL2 being a "pure" EL2 register is that with non-vhe
guests, writes to CNTHCTL_EL2 aren't translated and written to CNTKCTL_EL1.

This patch seems to fix the issues with vhe and non-vhe guest hypervisors
(tested with booting a L2 guest):

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 1235a88ec575..bd21f0f45a86 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -153,7 +153,6 @@ struct el2_sysreg_map {
        PURE_EL2_SYSREG( RVBAR_EL2 ),
        PURE_EL2_SYSREG( RMR_EL2 ),
        PURE_EL2_SYSREG( TPIDR_EL2 ),
-       PURE_EL2_SYSREG( CNTHCTL_EL2 ),
        PURE_EL2_SYSREG( HPFAR_EL2 ),
        EL2_SYSREG(      SCTLR_EL2,  SCTLR_EL1,      translate_sctlr ),
        EL2_SYSREG(      CPTR_EL2,   CPACR_EL1,      translate_cptr  ),
@@ -167,6 +166,7 @@ struct el2_sysreg_map {
        EL2_SYSREG(      FAR_EL2,    FAR_EL1,        NULL            ),
        EL2_SYSREG(      MAIR_EL2,   MAIR_EL1,       NULL            ),
        EL2_SYSREG(      AMAIR_EL2,  AMAIR_EL1,      NULL            ),
+       EL2_SYSREG(      CNTHCTL_EL2,CNTKCTL_EL1,    translate_cnthctl),
 };
 
 static

> +
> +			/* Get the current version of the EL1 counterpart. */
> +			reg = el2_reg->mapping;
> +		}
> +	} else {
> +		/* EL1 register can't be on the CPU if the guest is in vEL2. */
> +		if (unlikely(is_hyp_ctxt(vcpu)))
> +			goto immediate_read;
> +	}
> +
>  	/*
>  	 * System registers listed in the switch are not saved on every
>  	 * exit from the guest but are only saved on vcpu_put.
> @@ -114,6 +245,8 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
>  	case DACR32_EL2:	return read_sysreg_s(SYS_DACR32_EL2);
>  	case IFSR32_EL2:	return read_sysreg_s(SYS_IFSR32_EL2);
>  	case DBGVCR32_EL2:	return read_sysreg_s(SYS_DBGVCR32_EL2);
> +	case SP_EL2:		return read_sysreg(sp_el1);
> +	case ELR_EL2:		return read_sysreg_el1(SYS_ELR);
>  	}
>  
>  immediate_read:
> @@ -125,6 +258,34 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
>  	if (!vcpu->arch.sysregs_loaded_on_cpu)
>  		goto immediate_write;
>  
> +	if (unlikely(sysreg_is_el2(reg))) {
> +		const struct el2_sysreg_map *el2_reg;
> +
> +		if (!is_hyp_ctxt(vcpu))
> +			goto immediate_write;
> +
> +		/* Store the EL2 version in the sysregs array. */
> +		__vcpu_sys_reg(vcpu, reg) = val;
> +
> +		el2_reg = find_el2_sysreg(nested_sysreg_map, reg);
> +		if (el2_reg) {
> +			/* Does this register have an EL1 counterpart? */
> +			if (el2_reg->mapping == __INVALID_SYSREG__)
> +				return;
> +
> +			if (!vcpu_el2_e2h_is_set(vcpu) &&
> +			    el2_reg->translate)
> +				val = el2_reg->translate(val);
> +
> +			/* Redirect this to the EL1 version of the register. */
> +			reg = el2_reg->mapping;
> +		}
> +	} else {
> +		/* EL1 register can't be on the CPU if the guest is in vEL2. */
> +		if (unlikely(is_hyp_ctxt(vcpu)))
> +			goto immediate_write;
> +	}
> +
>  	/*
>  	 * System registers listed in the switch are not restored on every
>  	 * entry to the guest but are only restored on vcpu_load.
> @@ -157,6 +318,8 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
>  	case DACR32_EL2:	write_sysreg_s(val, SYS_DACR32_EL2);	return;
>  	case IFSR32_EL2:	write_sysreg_s(val, SYS_IFSR32_EL2);	return;
>  	case DBGVCR32_EL2:	write_sysreg_s(val, SYS_DBGVCR32_EL2);	return;
> +	case SP_EL2:		write_sysreg(val, sp_el1);		return;
> +	case ELR_EL2:		write_sysreg_el1(val, SYS_ELR);		return;
>  	}
>  
>  immediate_write:
Marc Zyngier July 3, 2019, 12:15 p.m. UTC | #7
On 24/06/2019 13:42, Julien Thierry wrote:
> 
> 
> On 06/21/2019 10:37 AM, Marc Zyngier wrote:
>> From: Andre Przywara <andre.przywara@arm.com>
>>
>> KVM internally uses accessor functions when reading or writing the
>> guest's system registers. This takes care of accessing either the stored
>> copy or using the "live" EL1 system registers when the host uses VHE.
>>
>> With the introduction of virtual EL2 we add a bunch of EL2 system
>> registers, which now must also be taken care of:
>> - If the guest is running in vEL2, and we access an EL1 sysreg, we must
>>   revert to the stored version of that, and not use the CPU's copy.
>> - If the guest is running in vEL1, and we access an EL2 sysreg, we must
>>   also use the stored version, since the CPU carries the EL1 copy.
>> - Some EL2 system registers are supposed to affect the current execution
>>   of the system, so we need to put them into their respective EL1
>>   counterparts. For this we need to define a mapping between the two.
>>   This is done using the newly introduced struct el2_sysreg_map.
>> - Some EL2 system registers have a different format than their EL1
>>   counterpart, so we need to translate them before writing them to the
>>   CPU. This is done using an (optional) translate function in the map.
>> - There are the three special registers SP_EL2, SPSR_EL2 and ELR_EL2,
>>   which need some separate handling.
>>
>> All of these cases are now wrapped into the existing accessor functions,
>> so KVM users wouldn't need to care whether they access EL2 or EL1
>> registers and also which state the guest is in.
>>
>> This handles what was formerly known as the "shadow state" dynamically,
>> without requiring a separate copy for each vCPU EL.
>>
>> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
>> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
>> ---
>>  arch/arm64/include/asm/kvm_emulate.h |   6 +
>>  arch/arm64/include/asm/kvm_host.h    |   5 +
>>  arch/arm64/kvm/sys_regs.c            | 163 +++++++++++++++++++++++++++
>>  3 files changed, 174 insertions(+)
>>
>> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
>> index c43aac5fed69..f37006b6eec4 100644
>> --- a/arch/arm64/include/asm/kvm_emulate.h
>> +++ b/arch/arm64/include/asm/kvm_emulate.h
>> @@ -70,6 +70,12 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu);
>>  int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2);
>>  int kvm_inject_nested_irq(struct kvm_vcpu *vcpu);
>>  
>> +u64 translate_tcr(u64 tcr);
>> +u64 translate_cptr(u64 tcr);
>> +u64 translate_sctlr(u64 tcr);
>> +u64 translate_ttbr0(u64 tcr);
>> +u64 translate_cnthctl(u64 tcr);
>> +
>>  static inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
>>  {
>>  	return !(vcpu->arch.hcr_el2 & HCR_RW);
>> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
>> index 2d4290d2513a..dae9c42a7219 100644
>> --- a/arch/arm64/include/asm/kvm_host.h
>> +++ b/arch/arm64/include/asm/kvm_host.h
>> @@ -217,6 +217,11 @@ enum vcpu_sysreg {
>>  	NR_SYS_REGS	/* Nothing after this line! */
>>  };
>>  
>> +static inline bool sysreg_is_el2(int reg)
>> +{
>> +	return reg >= FIRST_EL2_SYSREG && reg < NR_SYS_REGS;
>> +}
>> +
>>  /* 32bit mapping */
>>  #define c0_MPIDR	(MPIDR_EL1 * 2)	/* MultiProcessor ID Register */
>>  #define c0_CSSELR	(CSSELR_EL1 * 2)/* Cache Size Selection Register */
>> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
>> index 693dd063c9c2..d024114da162 100644
>> --- a/arch/arm64/kvm/sys_regs.c
>> +++ b/arch/arm64/kvm/sys_regs.c
>> @@ -76,11 +76,142 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu,
>>  	return false;
>>  }
>>  
>> +static u64 tcr_el2_ips_to_tcr_el1_ps(u64 tcr_el2)
>> +{
>> +	return ((tcr_el2 & TCR_EL2_PS_MASK) >> TCR_EL2_PS_SHIFT)
>> +		<< TCR_IPS_SHIFT;
>> +}
>> +
>> +u64 translate_tcr(u64 tcr)
>> +{
>> +	return TCR_EPD1_MASK |				/* disable TTBR1_EL1 */
>> +	       ((tcr & TCR_EL2_TBI) ? TCR_TBI0 : 0) |
>> +	       tcr_el2_ips_to_tcr_el1_ps(tcr) |
>> +	       (tcr & TCR_EL2_TG0_MASK) |
>> +	       (tcr & TCR_EL2_ORGN0_MASK) |
>> +	       (tcr & TCR_EL2_IRGN0_MASK) |
>> +	       (tcr & TCR_EL2_T0SZ_MASK);
>> +}
>> +
>> +u64 translate_cptr(u64 cptr_el2)
>> +{
>> +	u64 cpacr_el1 = 0;
>> +
>> +	if (!(cptr_el2 & CPTR_EL2_TFP))
>> +		cpacr_el1 |= CPACR_EL1_FPEN;
>> +	if (cptr_el2 & CPTR_EL2_TTA)
>> +		cpacr_el1 |= CPACR_EL1_TTA;
>> +	if (!(cptr_el2 & CPTR_EL2_TZ))
>> +		cpacr_el1 |= CPACR_EL1_ZEN;
>> +
>> +	return cpacr_el1;
>> +}
>> +
>> +u64 translate_sctlr(u64 sctlr)
>> +{
>> +	/* Bit 20 is RES1 in SCTLR_EL1, but RES0 in SCTLR_EL2 */
>> +	return sctlr | BIT(20);
>> +}
>> +
>> +u64 translate_ttbr0(u64 ttbr0)
>> +{
>> +	/* Force ASID to 0 (ASID 0 or RES0) */
>> +	return ttbr0 & ~GENMASK_ULL(63, 48);
>> +}
>> +
>> +u64 translate_cnthctl(u64 cnthctl)
>> +{
>> +	return ((cnthctl & 0x3) << 10) | (cnthctl & 0xfc);
>> +}
>> +
>> +#define EL2_SYSREG(el2, el1, translate)	\
>> +	[el2 - FIRST_EL2_SYSREG] = { el2, el1, translate }
>> +#define PURE_EL2_SYSREG(el2) \
>> +	[el2 - FIRST_EL2_SYSREG] = { el2,__INVALID_SYSREG__, NULL }
>> +/*
>> + * Associate vEL2 registers to their EL1 counterparts on the CPU.
>> + * The translate function can be NULL, when the register layout is identical.
>> + */
>> +struct el2_sysreg_map {
>> +	int sysreg;	/* EL2 register index into the array above */
>> +	int mapping;	/* associated EL1 register */
>> +	u64 (*translate)(u64 value);
>> +} nested_sysreg_map[NR_SYS_REGS - FIRST_EL2_SYSREG] = {
>> +	PURE_EL2_SYSREG( VPIDR_EL2 ),
>> +	PURE_EL2_SYSREG( VMPIDR_EL2 ),
>> +	PURE_EL2_SYSREG( ACTLR_EL2 ),
>> +	PURE_EL2_SYSREG( HCR_EL2 ),
>> +	PURE_EL2_SYSREG( MDCR_EL2 ),
>> +	PURE_EL2_SYSREG( HSTR_EL2 ),
>> +	PURE_EL2_SYSREG( HACR_EL2 ),
>> +	PURE_EL2_SYSREG( VTTBR_EL2 ),
>> +	PURE_EL2_SYSREG( VTCR_EL2 ),
>> +	PURE_EL2_SYSREG( RVBAR_EL2 ),
>> +	PURE_EL2_SYSREG( RMR_EL2 ),
>> +	PURE_EL2_SYSREG( TPIDR_EL2 ),
>> +	PURE_EL2_SYSREG( CNTVOFF_EL2 ),
>> +	PURE_EL2_SYSREG( CNTHCTL_EL2 ),
>> +	PURE_EL2_SYSREG( HPFAR_EL2 ),
>> +	EL2_SYSREG(      SCTLR_EL2,  SCTLR_EL1,      translate_sctlr ),
>> +	EL2_SYSREG(      CPTR_EL2,   CPACR_EL1,      translate_cptr  ),
>> +	EL2_SYSREG(      TTBR0_EL2,  TTBR0_EL1,      translate_ttbr0 ),
>> +	EL2_SYSREG(      TTBR1_EL2,  TTBR1_EL1,      NULL            ),
>> +	EL2_SYSREG(      TCR_EL2,    TCR_EL1,        translate_tcr   ),
>> +	EL2_SYSREG(      VBAR_EL2,   VBAR_EL1,       NULL            ),
>> +	EL2_SYSREG(      AFSR0_EL2,  AFSR0_EL1,      NULL            ),
>> +	EL2_SYSREG(      AFSR1_EL2,  AFSR1_EL1,      NULL            ),
>> +	EL2_SYSREG(      ESR_EL2,    ESR_EL1,        NULL            ),
>> +	EL2_SYSREG(      FAR_EL2,    FAR_EL1,        NULL            ),
>> +	EL2_SYSREG(      MAIR_EL2,   MAIR_EL1,       NULL            ),
>> +	EL2_SYSREG(      AMAIR_EL2,  AMAIR_EL1,      NULL            ),
>> +};
>> +
>> +static
>> +const struct el2_sysreg_map *find_el2_sysreg(const struct el2_sysreg_map *map,
>> +					     int reg)
>> +{
>> +	const struct el2_sysreg_map *entry;
>> +
>> +	if (!sysreg_is_el2(reg))
>> +		return NULL;
>> +
>> +	entry = &nested_sysreg_map[reg - FIRST_EL2_SYSREG];
>> +	if (entry->sysreg == __INVALID_SYSREG__)
>> +		return NULL;
>> +
>> +	return entry;
>> +}
>> +
>>  u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
>>  {
>> +
>>  	if (!vcpu->arch.sysregs_loaded_on_cpu)
>>  		goto immediate_read;
>>  
>> +	if (unlikely(sysreg_is_el2(reg))) {
>> +		const struct el2_sysreg_map *el2_reg;
>> +
>> +		if (!is_hyp_ctxt(vcpu))
>> +			goto immediate_read;
>> +
>> +		el2_reg = find_el2_sysreg(nested_sysreg_map, reg);
>> +		if (el2_reg) {
>> +			/*
>> +			 * If this register does not have an EL1 counterpart,
>> +			 * then read the stored EL2 version.
>> +			 */
>> +			if (el2_reg->mapping == __INVALID_SYSREG__)
> 
> In this patch, find_el2_sysreg returns NULL for PURE_EL2 registers. So

That's not how I read this code. You get NULL if the the EL2 sysreg is
set to __INVALID_SYSREG__, of which there is no occurrence (yeah, dead
code).

> for PURE_EL2, the access would go through the switch case. However this
> branch suggest that for PURE_EL2 register we intend to do the read from
> the memory backed version.
> 
> Which should it be?

My understanding of this code is that we're actually hitting memory
here. Am I missing anything? Note that I'm actively refactoring this
code as part of of the ARMv8.4-NV effort, hopefully making it a bit simpler:

https://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git/commit/?h=kvm-arm64/nv-wip-5.2-rc6&id=ea93236776772ce08e0eab51d9b77a9197121fde

> 
>> +				goto immediate_read;
>> +
>> +			/* Get the current version of the EL1 counterpart. */
>> +			reg = el2_reg->mapping;
>> +		}
>> +	} else {
>> +		/* EL1 register can't be on the CPU if the guest is in vEL2. */
>> +		if (unlikely(is_hyp_ctxt(vcpu)))
>> +			goto immediate_read;
>> +	}
>> +
>>  	/*
>>  	 * System registers listed in the switch are not saved on every
>>  	 * exit from the guest but are only saved on vcpu_put.
>> @@ -114,6 +245,8 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
>>  	case DACR32_EL2:	return read_sysreg_s(SYS_DACR32_EL2);
>>  	case IFSR32_EL2:	return read_sysreg_s(SYS_IFSR32_EL2);
>>  	case DBGVCR32_EL2:	return read_sysreg_s(SYS_DBGVCR32_EL2);
>> +	case SP_EL2:		return read_sysreg(sp_el1);
>> +	case ELR_EL2:		return read_sysreg_el1(SYS_ELR);
>>  	}
>>  
>>  immediate_read:
>> @@ -125,6 +258,34 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
>>  	if (!vcpu->arch.sysregs_loaded_on_cpu)
>>  		goto immediate_write;
>>  
>> +	if (unlikely(sysreg_is_el2(reg))) {
>> +		const struct el2_sysreg_map *el2_reg;
>> +
>> +		if (!is_hyp_ctxt(vcpu))
>> +			goto immediate_write;
>> +
>> +		/* Store the EL2 version in the sysregs array. */
>> +		__vcpu_sys_reg(vcpu, reg) = val;
>> +
>> +		el2_reg = find_el2_sysreg(nested_sysreg_map, reg);
>> +		if (el2_reg) {
>> +			/* Does this register have an EL1 counterpart? */
>> +			if (el2_reg->mapping == __INVALID_SYSREG__)
>> +				return;
> 
> As in the read case, this is never reached and we'll go through the
> switch case.

Same thing. That's the mapping that is evaluated, not the sysreg itself.

Thanks,

	M.
Julien Thierry July 3, 2019, 3:21 p.m. UTC | #8
On 03/07/2019 13:15, Marc Zyngier wrote:
> On 24/06/2019 13:42, Julien Thierry wrote:
>>
>>
>> On 06/21/2019 10:37 AM, Marc Zyngier wrote:
>>> From: Andre Przywara <andre.przywara@arm.com>
>>>
>>> KVM internally uses accessor functions when reading or writing the
>>> guest's system registers. This takes care of accessing either the stored
>>> copy or using the "live" EL1 system registers when the host uses VHE.
>>>
>>> With the introduction of virtual EL2 we add a bunch of EL2 system
>>> registers, which now must also be taken care of:
>>> - If the guest is running in vEL2, and we access an EL1 sysreg, we must
>>>   revert to the stored version of that, and not use the CPU's copy.
>>> - If the guest is running in vEL1, and we access an EL2 sysreg, we must
>>>   also use the stored version, since the CPU carries the EL1 copy.
>>> - Some EL2 system registers are supposed to affect the current execution
>>>   of the system, so we need to put them into their respective EL1
>>>   counterparts. For this we need to define a mapping between the two.
>>>   This is done using the newly introduced struct el2_sysreg_map.
>>> - Some EL2 system registers have a different format than their EL1
>>>   counterpart, so we need to translate them before writing them to the
>>>   CPU. This is done using an (optional) translate function in the map.
>>> - There are the three special registers SP_EL2, SPSR_EL2 and ELR_EL2,
>>>   which need some separate handling.
>>>
>>> All of these cases are now wrapped into the existing accessor functions,
>>> so KVM users wouldn't need to care whether they access EL2 or EL1
>>> registers and also which state the guest is in.
>>>
>>> This handles what was formerly known as the "shadow state" dynamically,
>>> without requiring a separate copy for each vCPU EL.
>>>
>>> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
>>> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
>>> ---
>>>  arch/arm64/include/asm/kvm_emulate.h |   6 +
>>>  arch/arm64/include/asm/kvm_host.h    |   5 +
>>>  arch/arm64/kvm/sys_regs.c            | 163 +++++++++++++++++++++++++++
>>>  3 files changed, 174 insertions(+)
>>>
>>> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
>>> index c43aac5fed69..f37006b6eec4 100644
>>> --- a/arch/arm64/include/asm/kvm_emulate.h
>>> +++ b/arch/arm64/include/asm/kvm_emulate.h
>>> @@ -70,6 +70,12 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu);
>>>  int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2);
>>>  int kvm_inject_nested_irq(struct kvm_vcpu *vcpu);
>>>  
>>> +u64 translate_tcr(u64 tcr);
>>> +u64 translate_cptr(u64 tcr);
>>> +u64 translate_sctlr(u64 tcr);
>>> +u64 translate_ttbr0(u64 tcr);
>>> +u64 translate_cnthctl(u64 tcr);
>>> +
>>>  static inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
>>>  {
>>>  	return !(vcpu->arch.hcr_el2 & HCR_RW);
>>> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
>>> index 2d4290d2513a..dae9c42a7219 100644
>>> --- a/arch/arm64/include/asm/kvm_host.h
>>> +++ b/arch/arm64/include/asm/kvm_host.h
>>> @@ -217,6 +217,11 @@ enum vcpu_sysreg {
>>>  	NR_SYS_REGS	/* Nothing after this line! */
>>>  };
>>>  
>>> +static inline bool sysreg_is_el2(int reg)
>>> +{
>>> +	return reg >= FIRST_EL2_SYSREG && reg < NR_SYS_REGS;
>>> +}
>>> +
>>>  /* 32bit mapping */
>>>  #define c0_MPIDR	(MPIDR_EL1 * 2)	/* MultiProcessor ID Register */
>>>  #define c0_CSSELR	(CSSELR_EL1 * 2)/* Cache Size Selection Register */
>>> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
>>> index 693dd063c9c2..d024114da162 100644
>>> --- a/arch/arm64/kvm/sys_regs.c
>>> +++ b/arch/arm64/kvm/sys_regs.c
>>> @@ -76,11 +76,142 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu,
>>>  	return false;
>>>  }
>>>  
>>> +static u64 tcr_el2_ips_to_tcr_el1_ps(u64 tcr_el2)
>>> +{
>>> +	return ((tcr_el2 & TCR_EL2_PS_MASK) >> TCR_EL2_PS_SHIFT)
>>> +		<< TCR_IPS_SHIFT;
>>> +}
>>> +
>>> +u64 translate_tcr(u64 tcr)
>>> +{
>>> +	return TCR_EPD1_MASK |				/* disable TTBR1_EL1 */
>>> +	       ((tcr & TCR_EL2_TBI) ? TCR_TBI0 : 0) |
>>> +	       tcr_el2_ips_to_tcr_el1_ps(tcr) |
>>> +	       (tcr & TCR_EL2_TG0_MASK) |
>>> +	       (tcr & TCR_EL2_ORGN0_MASK) |
>>> +	       (tcr & TCR_EL2_IRGN0_MASK) |
>>> +	       (tcr & TCR_EL2_T0SZ_MASK);
>>> +}
>>> +
>>> +u64 translate_cptr(u64 cptr_el2)
>>> +{
>>> +	u64 cpacr_el1 = 0;
>>> +
>>> +	if (!(cptr_el2 & CPTR_EL2_TFP))
>>> +		cpacr_el1 |= CPACR_EL1_FPEN;
>>> +	if (cptr_el2 & CPTR_EL2_TTA)
>>> +		cpacr_el1 |= CPACR_EL1_TTA;
>>> +	if (!(cptr_el2 & CPTR_EL2_TZ))
>>> +		cpacr_el1 |= CPACR_EL1_ZEN;
>>> +
>>> +	return cpacr_el1;
>>> +}
>>> +
>>> +u64 translate_sctlr(u64 sctlr)
>>> +{
>>> +	/* Bit 20 is RES1 in SCTLR_EL1, but RES0 in SCTLR_EL2 */
>>> +	return sctlr | BIT(20);
>>> +}
>>> +
>>> +u64 translate_ttbr0(u64 ttbr0)
>>> +{
>>> +	/* Force ASID to 0 (ASID 0 or RES0) */
>>> +	return ttbr0 & ~GENMASK_ULL(63, 48);
>>> +}
>>> +
>>> +u64 translate_cnthctl(u64 cnthctl)
>>> +{
>>> +	return ((cnthctl & 0x3) << 10) | (cnthctl & 0xfc);
>>> +}
>>> +
>>> +#define EL2_SYSREG(el2, el1, translate)	\
>>> +	[el2 - FIRST_EL2_SYSREG] = { el2, el1, translate }
>>> +#define PURE_EL2_SYSREG(el2) \
>>> +	[el2 - FIRST_EL2_SYSREG] = { el2,__INVALID_SYSREG__, NULL }
>>> +/*
>>> + * Associate vEL2 registers to their EL1 counterparts on the CPU.
>>> + * The translate function can be NULL, when the register layout is identical.
>>> + */
>>> +struct el2_sysreg_map {
>>> +	int sysreg;	/* EL2 register index into the array above */
>>> +	int mapping;	/* associated EL1 register */
>>> +	u64 (*translate)(u64 value);
>>> +} nested_sysreg_map[NR_SYS_REGS - FIRST_EL2_SYSREG] = {
>>> +	PURE_EL2_SYSREG( VPIDR_EL2 ),
>>> +	PURE_EL2_SYSREG( VMPIDR_EL2 ),
>>> +	PURE_EL2_SYSREG( ACTLR_EL2 ),
>>> +	PURE_EL2_SYSREG( HCR_EL2 ),
>>> +	PURE_EL2_SYSREG( MDCR_EL2 ),
>>> +	PURE_EL2_SYSREG( HSTR_EL2 ),
>>> +	PURE_EL2_SYSREG( HACR_EL2 ),
>>> +	PURE_EL2_SYSREG( VTTBR_EL2 ),
>>> +	PURE_EL2_SYSREG( VTCR_EL2 ),
>>> +	PURE_EL2_SYSREG( RVBAR_EL2 ),
>>> +	PURE_EL2_SYSREG( RMR_EL2 ),
>>> +	PURE_EL2_SYSREG( TPIDR_EL2 ),
>>> +	PURE_EL2_SYSREG( CNTVOFF_EL2 ),
>>> +	PURE_EL2_SYSREG( CNTHCTL_EL2 ),
>>> +	PURE_EL2_SYSREG( HPFAR_EL2 ),
>>> +	EL2_SYSREG(      SCTLR_EL2,  SCTLR_EL1,      translate_sctlr ),
>>> +	EL2_SYSREG(      CPTR_EL2,   CPACR_EL1,      translate_cptr  ),
>>> +	EL2_SYSREG(      TTBR0_EL2,  TTBR0_EL1,      translate_ttbr0 ),
>>> +	EL2_SYSREG(      TTBR1_EL2,  TTBR1_EL1,      NULL            ),
>>> +	EL2_SYSREG(      TCR_EL2,    TCR_EL1,        translate_tcr   ),
>>> +	EL2_SYSREG(      VBAR_EL2,   VBAR_EL1,       NULL            ),
>>> +	EL2_SYSREG(      AFSR0_EL2,  AFSR0_EL1,      NULL            ),
>>> +	EL2_SYSREG(      AFSR1_EL2,  AFSR1_EL1,      NULL            ),
>>> +	EL2_SYSREG(      ESR_EL2,    ESR_EL1,        NULL            ),
>>> +	EL2_SYSREG(      FAR_EL2,    FAR_EL1,        NULL            ),
>>> +	EL2_SYSREG(      MAIR_EL2,   MAIR_EL1,       NULL            ),
>>> +	EL2_SYSREG(      AMAIR_EL2,  AMAIR_EL1,      NULL            ),
>>> +};
>>> +
>>> +static
>>> +const struct el2_sysreg_map *find_el2_sysreg(const struct el2_sysreg_map *map,
>>> +					     int reg)
>>> +{
>>> +	const struct el2_sysreg_map *entry;
>>> +
>>> +	if (!sysreg_is_el2(reg))
>>> +		return NULL;
>>> +
>>> +	entry = &nested_sysreg_map[reg - FIRST_EL2_SYSREG];
>>> +	if (entry->sysreg == __INVALID_SYSREG__)
>>> +		return NULL;
>>> +
>>> +	return entry;
>>> +}
>>> +
>>>  u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
>>>  {
>>> +
>>>  	if (!vcpu->arch.sysregs_loaded_on_cpu)
>>>  		goto immediate_read;
>>>  
>>> +	if (unlikely(sysreg_is_el2(reg))) {
>>> +		const struct el2_sysreg_map *el2_reg;
>>> +
>>> +		if (!is_hyp_ctxt(vcpu))
>>> +			goto immediate_read;
>>> +
>>> +		el2_reg = find_el2_sysreg(nested_sysreg_map, reg);
>>> +		if (el2_reg) {
>>> +			/*
>>> +			 * If this register does not have an EL1 counterpart,
>>> +			 * then read the stored EL2 version.
>>> +			 */
>>> +			if (el2_reg->mapping == __INVALID_SYSREG__)
>>
>> In this patch, find_el2_sysreg returns NULL for PURE_EL2 registers. So
> 
> That's not how I read this code. You get NULL if the the EL2 sysreg is
> set to __INVALID_SYSREG__, of which there is no occurrence (yeah, dead
> code).
> 

Ah yes, as you guessed, I got confused between ->sysreg and ->mapping.
Something must have gotten in my eyes when I was doing the review!

You can ignore my comments on patch then!

Thanks,
Marc Zyngier July 3, 2019, 3:59 p.m. UTC | #9
On 25/06/2019 16:18, Alexandru Elisei wrote:
> Hi Marc,
> 
> A question regarding this patch. This patch modifies vcpu_{read,write}_sys_reg
> to handle virtual EL2 registers. However, the file kvm/emulate-nested.c added by
> patch 10/59 "KVM: arm64: nv: Support virtual EL2 exceptions" already uses
> vcpu_{read,write}_sys_reg to access EL2 registers. In my opinion, it doesn't
> really matter which comes first because nested support is only enabled in the
> last patch of the series, but I thought I should bring this up in case it is not
> what you intended.

It doesn't really matter at that stage. The only thing I'm trying to
achieve in the middle of the series is not to break the build, and not
to cause non-NV to fail.

> 
> On 6/21/19 10:37 AM, Marc Zyngier wrote:
>> From: Andre Przywara <andre.przywara@arm.com>
>>
>> KVM internally uses accessor functions when reading or writing the
>> guest's system registers. This takes care of accessing either the stored
>> copy or using the "live" EL1 system registers when the host uses VHE.
>>
>> With the introduction of virtual EL2 we add a bunch of EL2 system
>> registers, which now must also be taken care of:
>> - If the guest is running in vEL2, and we access an EL1 sysreg, we must
>>   revert to the stored version of that, and not use the CPU's copy.
>> - If the guest is running in vEL1, and we access an EL2 sysreg, we must
>>   also use the stored version, since the CPU carries the EL1 copy.
>> - Some EL2 system registers are supposed to affect the current execution
>>   of the system, so we need to put them into their respective EL1
>>   counterparts. For this we need to define a mapping between the two.
>>   This is done using the newly introduced struct el2_sysreg_map.
>> - Some EL2 system registers have a different format than their EL1
>>   counterpart, so we need to translate them before writing them to the
>>   CPU. This is done using an (optional) translate function in the map.
>> - There are the three special registers SP_EL2, SPSR_EL2 and ELR_EL2,
>>   which need some separate handling.
> I see no change in this patch related to SPSR_EL2. Special handling of SPSR_EL2
> is added in the next patch, patch 14/59 "KVM: arm64: nv: Handle SPSR_EL2 specially".

Indeed, this needs rewriting (we ended-up splitting the SPSR stuff out
as it was messy and not completely correct). I may take the rest of the
special stuff out as well.

>>
>> All of these cases are now wrapped into the existing accessor functions,
>> so KVM users wouldn't need to care whether they access EL2 or EL1
>> registers and also which state the guest is in.
>>
>> This handles what was formerly known as the "shadow state" dynamically,
>> without requiring a separate copy for each vCPU EL.
>>
>> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
>> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
>> ---
>>  arch/arm64/include/asm/kvm_emulate.h |   6 +
>>  arch/arm64/include/asm/kvm_host.h    |   5 +
>>  arch/arm64/kvm/sys_regs.c            | 163 +++++++++++++++++++++++++++
>>  3 files changed, 174 insertions(+)
>>
>> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
>> index c43aac5fed69..f37006b6eec4 100644
>> --- a/arch/arm64/include/asm/kvm_emulate.h
>> +++ b/arch/arm64/include/asm/kvm_emulate.h
>> @@ -70,6 +70,12 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu);
>>  int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2);
>>  int kvm_inject_nested_irq(struct kvm_vcpu *vcpu);
>>  
>> +u64 translate_tcr(u64 tcr);
>> +u64 translate_cptr(u64 tcr);
>> +u64 translate_sctlr(u64 tcr);
>> +u64 translate_ttbr0(u64 tcr);
>> +u64 translate_cnthctl(u64 tcr);
>> +
>>  static inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
>>  {
>>  	return !(vcpu->arch.hcr_el2 & HCR_RW);
>> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
>> index 2d4290d2513a..dae9c42a7219 100644
>> --- a/arch/arm64/include/asm/kvm_host.h
>> +++ b/arch/arm64/include/asm/kvm_host.h
>> @@ -217,6 +217,11 @@ enum vcpu_sysreg {
>>  	NR_SYS_REGS	/* Nothing after this line! */
>>  };
>>  
>> +static inline bool sysreg_is_el2(int reg)
>> +{
>> +	return reg >= FIRST_EL2_SYSREG && reg < NR_SYS_REGS;
>> +}
>> +
>>  /* 32bit mapping */
>>  #define c0_MPIDR	(MPIDR_EL1 * 2)	/* MultiProcessor ID Register */
>>  #define c0_CSSELR	(CSSELR_EL1 * 2)/* Cache Size Selection Register */
>> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
>> index 693dd063c9c2..d024114da162 100644
>> --- a/arch/arm64/kvm/sys_regs.c
>> +++ b/arch/arm64/kvm/sys_regs.c
>> @@ -76,11 +76,142 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu,
>>  	return false;
>>  }
>>  
>> +static u64 tcr_el2_ips_to_tcr_el1_ps(u64 tcr_el2)
> The code seems to suggest that you are translating TCR_EL2.PS to TCR_EL1.IPS.
> Perhaps the function should be named tcr_el2_ps_to_tcr_el1_ips?

yup.

>> +{
>> +	return ((tcr_el2 & TCR_EL2_PS_MASK) >> TCR_EL2_PS_SHIFT)
>> +		<< TCR_IPS_SHIFT;
>> +}
>> +
>> +u64 translate_tcr(u64 tcr)
>> +{
>> +	return TCR_EPD1_MASK |				/* disable TTBR1_EL1 */
>> +	       ((tcr & TCR_EL2_TBI) ? TCR_TBI0 : 0) |
>> +	       tcr_el2_ips_to_tcr_el1_ps(tcr) |
>> +	       (tcr & TCR_EL2_TG0_MASK) |
>> +	       (tcr & TCR_EL2_ORGN0_MASK) |
>> +	       (tcr & TCR_EL2_IRGN0_MASK) |
>> +	       (tcr & TCR_EL2_T0SZ_MASK);
>> +}
>> +
>> +u64 translate_cptr(u64 cptr_el2)
> The argument name is not consistent with the other translate_* functions. I
> think it is reasonably obvious that you are translating an EL2 register.

That's pretty much immaterial, and the variable could be called zorglub.
Consistency is good, but I don't think we need to worry about that level
of detail.

>> +{
>> +	u64 cpacr_el1 = 0;
>> +
>> +	if (!(cptr_el2 & CPTR_EL2_TFP))
>> +		cpacr_el1 |= CPACR_EL1_FPEN;
>> +	if (cptr_el2 & CPTR_EL2_TTA)
>> +		cpacr_el1 |= CPACR_EL1_TTA;
>> +	if (!(cptr_el2 & CPTR_EL2_TZ))
>> +		cpacr_el1 |= CPACR_EL1_ZEN;
>> +
>> +	return cpacr_el1;
>> +}
>> +
>> +u64 translate_sctlr(u64 sctlr)
>> +{
>> +	/* Bit 20 is RES1 in SCTLR_EL1, but RES0 in SCTLR_EL2 */
>> +	return sctlr | BIT(20);
>> +}
>> +
>> +u64 translate_ttbr0(u64 ttbr0)
>> +{
>> +	/* Force ASID to 0 (ASID 0 or RES0) */
> Are you forcing ASID to 0 because you are only expecting a non-vhe guest
> hypervisor to access ttbr0_el2, in which case the architecture says that the
> ASID field is RES0? Is it so unlikely that a vhe guest hypervisor will access
> ttbr0_el2 directly that it's not worth adding a check for that?

Like all the translate_* function, this is only called when running a
non-VHE guest so that the EL2 register is translated to the EL1 format.
A VHE guest usually has its sysregs in the EL1 format, and certainly
does for TTBR0_EL2.

>> +	return ttbr0 & ~GENMASK_ULL(63, 48);
>> +}
>> +
>> +u64 translate_cnthctl(u64 cnthctl)
>> +{
>> +	return ((cnthctl & 0x3) << 10) | (cnthctl & 0xfc);
> 
> Patch 16/59 "KVM: arm64: nv: Save/Restore vEL2 sysregs" suggests that you are
> translating CNTHCTL to write it to CNTKCTL_EL1. Looking at ARM DDI 0487D.b,
> CNTKCTL_EL1 has bits 63:10 RES0. I think the correct value should be ((cnthctl &
> 0x3) << 8) | (cnthctl & 0xfc).

Rookie mistake! When HCR_EL2.E2h==1 (which is always the case for NV),
CNTKCTL_EL1 accesses CNTHCTL_EL2. What you have here is the translation
of non-VHE CNTHCTL_EL2 to its VHE equivalent.

> 
>> +}
>> +
>> +#define EL2_SYSREG(el2, el1, translate)	\
>> +	[el2 - FIRST_EL2_SYSREG] = { el2, el1, translate }
>> +#define PURE_EL2_SYSREG(el2) \
>> +	[el2 - FIRST_EL2_SYSREG] = { el2,__INVALID_SYSREG__, NULL }
>> +/*
>> + * Associate vEL2 registers to their EL1 counterparts on the CPU.
>> + * The translate function can be NULL, when the register layout is identical.
>> + */
>> +struct el2_sysreg_map {
>> +	int sysreg;	/* EL2 register index into the array above */
>> +	int mapping;	/* associated EL1 register */
>> +	u64 (*translate)(u64 value);
>> +} nested_sysreg_map[NR_SYS_REGS - FIRST_EL2_SYSREG] = {
>> +	PURE_EL2_SYSREG( VPIDR_EL2 ),
>> +	PURE_EL2_SYSREG( VMPIDR_EL2 ),
>> +	PURE_EL2_SYSREG( ACTLR_EL2 ),
>> +	PURE_EL2_SYSREG( HCR_EL2 ),
>> +	PURE_EL2_SYSREG( MDCR_EL2 ),
>> +	PURE_EL2_SYSREG( HSTR_EL2 ),
>> +	PURE_EL2_SYSREG( HACR_EL2 ),
>> +	PURE_EL2_SYSREG( VTTBR_EL2 ),
>> +	PURE_EL2_SYSREG( VTCR_EL2 ),
>> +	PURE_EL2_SYSREG( RVBAR_EL2 ),
>> +	PURE_EL2_SYSREG( RMR_EL2 ),
>> +	PURE_EL2_SYSREG( TPIDR_EL2 ),
>> +	PURE_EL2_SYSREG( CNTVOFF_EL2 ),
>> +	PURE_EL2_SYSREG( CNTHCTL_EL2 ),
>> +	PURE_EL2_SYSREG( HPFAR_EL2 ),
>> +	EL2_SYSREG(      SCTLR_EL2,  SCTLR_EL1,      translate_sctlr ),
>> +	EL2_SYSREG(      CPTR_EL2,   CPACR_EL1,      translate_cptr  ),
>> +	EL2_SYSREG(      TTBR0_EL2,  TTBR0_EL1,      translate_ttbr0 ),
>> +	EL2_SYSREG(      TTBR1_EL2,  TTBR1_EL1,      NULL            ),
>> +	EL2_SYSREG(      TCR_EL2,    TCR_EL1,        translate_tcr   ),
>> +	EL2_SYSREG(      VBAR_EL2,   VBAR_EL1,       NULL            ),
>> +	EL2_SYSREG(      AFSR0_EL2,  AFSR0_EL1,      NULL            ),
>> +	EL2_SYSREG(      AFSR1_EL2,  AFSR1_EL1,      NULL            ),
>> +	EL2_SYSREG(      ESR_EL2,    ESR_EL1,        NULL            ),
>> +	EL2_SYSREG(      FAR_EL2,    FAR_EL1,        NULL            ),
>> +	EL2_SYSREG(      MAIR_EL2,   MAIR_EL1,       NULL            ),
>> +	EL2_SYSREG(      AMAIR_EL2,  AMAIR_EL1,      NULL            ),
>> +};
> Figuring out which registers are in this map and which aren't and are supposed
> to be treated differently is really cumbersome because they are split into two
> types of el2 registers and their order is different from the order in enum
> vcpu_sysreg (in kvm_host.h). Perhaps adding a comment about what registers will
> be treated differently would make the code a bit easier to follow?

I'm not sure what this buys us. We have 3 categories of EL2 sysregs:
- Purely emulated
- Directly mapped onto an EL1 sysreg
- Translated from EL2 to EL1

I think the wrappers represent that pretty well, although we could split
EL2_SYSREG into DIRECT_EL2_SYSREG and TRANSLATE_EL2_SYSREG. As for the
order, does it really matter? We also have the trap table order, which
is also different from the enum. Do you propose we reorder everything?

>> +
>> +static
>> +const struct el2_sysreg_map *find_el2_sysreg(const struct el2_sysreg_map *map,
>> +					     int reg)
>> +{
>> +	const struct el2_sysreg_map *entry;
>> +
>> +	if (!sysreg_is_el2(reg))
>> +		return NULL;
>> +
>> +	entry = &nested_sysreg_map[reg - FIRST_EL2_SYSREG];
>> +	if (entry->sysreg == __INVALID_SYSREG__)
>> +		return NULL;
>> +
>> +	return entry;
>> +}
>> +
>>  u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
>>  {
>> +
>>  	if (!vcpu->arch.sysregs_loaded_on_cpu)
>>  		goto immediate_read;
>>  
>> +	if (unlikely(sysreg_is_el2(reg))) {
>> +		const struct el2_sysreg_map *el2_reg;
>> +
>> +		if (!is_hyp_ctxt(vcpu))
>> +			goto immediate_read;
> I'm confused by this. is_hyp_ctxt returns false when the guest is not in vEL2
> AND HCR_EL.E2H or HCR_EL2.TGE are not set. In this case, the NV bit will not be
> set and the hardware will raise an undefined instruction exception when
> accessing an EL2 register from EL1. What am I missing?

You don't necessarily access an EL2 register just because you run at
EL2. You also access it because you emulate an EL1 instruction whose
behaviour is conditioned by an EL2 register.

Thanks,

	M.
Alexandru Elisei July 3, 2019, 4:32 p.m. UTC | #10
On 7/3/19 4:59 PM, Marc Zyngier wrote:
> On 25/06/2019 16:18, Alexandru Elisei wrote:
>> Hi Marc,
>>
>> A question regarding this patch. This patch modifies vcpu_{read,write}_sys_reg
>> to handle virtual EL2 registers. However, the file kvm/emulate-nested.c added by
>> patch 10/59 "KVM: arm64: nv: Support virtual EL2 exceptions" already uses
>> vcpu_{read,write}_sys_reg to access EL2 registers. In my opinion, it doesn't
>> really matter which comes first because nested support is only enabled in the
>> last patch of the series, but I thought I should bring this up in case it is not
>> what you intended.
> It doesn't really matter at that stage. The only thing I'm trying to
> achieve in the middle of the series is not to break the build, and not
> to cause non-NV to fail.
>
>> On 6/21/19 10:37 AM, Marc Zyngier wrote:
>>> From: Andre Przywara <andre.przywara@arm.com>
>>>
>>> KVM internally uses accessor functions when reading or writing the
>>> guest's system registers. This takes care of accessing either the stored
>>> copy or using the "live" EL1 system registers when the host uses VHE.
>>>
>>> With the introduction of virtual EL2 we add a bunch of EL2 system
>>> registers, which now must also be taken care of:
>>> - If the guest is running in vEL2, and we access an EL1 sysreg, we must
>>>   revert to the stored version of that, and not use the CPU's copy.
>>> - If the guest is running in vEL1, and we access an EL2 sysreg, we must
>>>   also use the stored version, since the CPU carries the EL1 copy.
>>> - Some EL2 system registers are supposed to affect the current execution
>>>   of the system, so we need to put them into their respective EL1
>>>   counterparts. For this we need to define a mapping between the two.
>>>   This is done using the newly introduced struct el2_sysreg_map.
>>> - Some EL2 system registers have a different format than their EL1
>>>   counterpart, so we need to translate them before writing them to the
>>>   CPU. This is done using an (optional) translate function in the map.
>>> - There are the three special registers SP_EL2, SPSR_EL2 and ELR_EL2,
>>>   which need some separate handling.
>> I see no change in this patch related to SPSR_EL2. Special handling of SPSR_EL2
>> is added in the next patch, patch 14/59 "KVM: arm64: nv: Handle SPSR_EL2 specially".
> Indeed, this needs rewriting (we ended-up splitting the SPSR stuff out
> as it was messy and not completely correct). I may take the rest of the
> special stuff out as well.
>
>>> All of these cases are now wrapped into the existing accessor functions,
>>> so KVM users wouldn't need to care whether they access EL2 or EL1
>>> registers and also which state the guest is in.
>>>
>>> This handles what was formerly known as the "shadow state" dynamically,
>>> without requiring a separate copy for each vCPU EL.
>>>
>>> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
>>> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
>>> ---
>>>  arch/arm64/include/asm/kvm_emulate.h |   6 +
>>>  arch/arm64/include/asm/kvm_host.h    |   5 +
>>>  arch/arm64/kvm/sys_regs.c            | 163 +++++++++++++++++++++++++++
>>>  3 files changed, 174 insertions(+)
>>>
>>> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
>>> index c43aac5fed69..f37006b6eec4 100644
>>> --- a/arch/arm64/include/asm/kvm_emulate.h
>>> +++ b/arch/arm64/include/asm/kvm_emulate.h
>>> @@ -70,6 +70,12 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu);
>>>  int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2);
>>>  int kvm_inject_nested_irq(struct kvm_vcpu *vcpu);
>>>  
>>> +u64 translate_tcr(u64 tcr);
>>> +u64 translate_cptr(u64 tcr);
>>> +u64 translate_sctlr(u64 tcr);
>>> +u64 translate_ttbr0(u64 tcr);
>>> +u64 translate_cnthctl(u64 tcr);
>>> +
>>>  static inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
>>>  {
>>>  	return !(vcpu->arch.hcr_el2 & HCR_RW);
>>> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
>>> index 2d4290d2513a..dae9c42a7219 100644
>>> --- a/arch/arm64/include/asm/kvm_host.h
>>> +++ b/arch/arm64/include/asm/kvm_host.h
>>> @@ -217,6 +217,11 @@ enum vcpu_sysreg {
>>>  	NR_SYS_REGS	/* Nothing after this line! */
>>>  };
>>>  
>>> +static inline bool sysreg_is_el2(int reg)
>>> +{
>>> +	return reg >= FIRST_EL2_SYSREG && reg < NR_SYS_REGS;
>>> +}
>>> +
>>>  /* 32bit mapping */
>>>  #define c0_MPIDR	(MPIDR_EL1 * 2)	/* MultiProcessor ID Register */
>>>  #define c0_CSSELR	(CSSELR_EL1 * 2)/* Cache Size Selection Register */
>>> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
>>> index 693dd063c9c2..d024114da162 100644
>>> --- a/arch/arm64/kvm/sys_regs.c
>>> +++ b/arch/arm64/kvm/sys_regs.c
>>> @@ -76,11 +76,142 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu,
>>>  	return false;
>>>  }
>>>  
>>> +static u64 tcr_el2_ips_to_tcr_el1_ps(u64 tcr_el2)
>> The code seems to suggest that you are translating TCR_EL2.PS to TCR_EL1.IPS.
>> Perhaps the function should be named tcr_el2_ps_to_tcr_el1_ips?
> yup.
>
>>> +{
>>> +	return ((tcr_el2 & TCR_EL2_PS_MASK) >> TCR_EL2_PS_SHIFT)
>>> +		<< TCR_IPS_SHIFT;
>>> +}
>>> +
>>> +u64 translate_tcr(u64 tcr)
>>> +{
>>> +	return TCR_EPD1_MASK |				/* disable TTBR1_EL1 */
>>> +	       ((tcr & TCR_EL2_TBI) ? TCR_TBI0 : 0) |
>>> +	       tcr_el2_ips_to_tcr_el1_ps(tcr) |
>>> +	       (tcr & TCR_EL2_TG0_MASK) |
>>> +	       (tcr & TCR_EL2_ORGN0_MASK) |
>>> +	       (tcr & TCR_EL2_IRGN0_MASK) |
>>> +	       (tcr & TCR_EL2_T0SZ_MASK);
>>> +}
>>> +
>>> +u64 translate_cptr(u64 cptr_el2)
>> The argument name is not consistent with the other translate_* functions. I
>> think it is reasonably obvious that you are translating an EL2 register.
> That's pretty much immaterial, and the variable could be called zorglub.
> Consistency is good, but I don't think we need to worry about that level
> of detail.

Sure.

>
>>> +{
>>> +	u64 cpacr_el1 = 0;
>>> +
>>> +	if (!(cptr_el2 & CPTR_EL2_TFP))
>>> +		cpacr_el1 |= CPACR_EL1_FPEN;
>>> +	if (cptr_el2 & CPTR_EL2_TTA)
>>> +		cpacr_el1 |= CPACR_EL1_TTA;
>>> +	if (!(cptr_el2 & CPTR_EL2_TZ))
>>> +		cpacr_el1 |= CPACR_EL1_ZEN;
>>> +
>>> +	return cpacr_el1;
>>> +}
>>> +
>>> +u64 translate_sctlr(u64 sctlr)
>>> +{
>>> +	/* Bit 20 is RES1 in SCTLR_EL1, but RES0 in SCTLR_EL2 */
>>> +	return sctlr | BIT(20);
>>> +}
>>> +
>>> +u64 translate_ttbr0(u64 ttbr0)
>>> +{
>>> +	/* Force ASID to 0 (ASID 0 or RES0) */
>> Are you forcing ASID to 0 because you are only expecting a non-vhe guest
>> hypervisor to access ttbr0_el2, in which case the architecture says that the
>> ASID field is RES0? Is it so unlikely that a vhe guest hypervisor will access
>> ttbr0_el2 directly that it's not worth adding a check for that?
> Like all the translate_* function, this is only called when running a
> non-VHE guest so that the EL2 register is translated to the EL1 format.
> A VHE guest usually has its sysregs in the EL1 format, and certainly
> does for TTBR0_EL2.

Yeah, figured that out after I sent this patch.

>
>>> +	return ttbr0 & ~GENMASK_ULL(63, 48);
>>> +}
>>> +
>>> +u64 translate_cnthctl(u64 cnthctl)
>>> +{
>>> +	return ((cnthctl & 0x3) << 10) | (cnthctl & 0xfc);
>> Patch 16/59 "KVM: arm64: nv: Save/Restore vEL2 sysregs" suggests that you are
>> translating CNTHCTL to write it to CNTKCTL_EL1. Looking at ARM DDI 0487D.b,
>> CNTKCTL_EL1 has bits 63:10 RES0. I think the correct value should be ((cnthctl &
>> 0x3) << 8) | (cnthctl & 0xfc).
> Rookie mistake! When HCR_EL2.E2h==1 (which is always the case for NV),
> CNTKCTL_EL1 accesses CNTHCTL_EL2. What you have here is the translation
> of non-VHE CNTHCTL_EL2 to its VHE equivalent.

Indeed! Thank you for pointing it out.

>
>>> +}
>>> +
>>> +#define EL2_SYSREG(el2, el1, translate)	\
>>> +	[el2 - FIRST_EL2_SYSREG] = { el2, el1, translate }
>>> +#define PURE_EL2_SYSREG(el2) \
>>> +	[el2 - FIRST_EL2_SYSREG] = { el2,__INVALID_SYSREG__, NULL }
>>> +/*
>>> + * Associate vEL2 registers to their EL1 counterparts on the CPU.
>>> + * The translate function can be NULL, when the register layout is identical.
>>> + */
>>> +struct el2_sysreg_map {
>>> +	int sysreg;	/* EL2 register index into the array above */
>>> +	int mapping;	/* associated EL1 register */
>>> +	u64 (*translate)(u64 value);
>>> +} nested_sysreg_map[NR_SYS_REGS - FIRST_EL2_SYSREG] = {
>>> +	PURE_EL2_SYSREG( VPIDR_EL2 ),
>>> +	PURE_EL2_SYSREG( VMPIDR_EL2 ),
>>> +	PURE_EL2_SYSREG( ACTLR_EL2 ),
>>> +	PURE_EL2_SYSREG( HCR_EL2 ),
>>> +	PURE_EL2_SYSREG( MDCR_EL2 ),
>>> +	PURE_EL2_SYSREG( HSTR_EL2 ),
>>> +	PURE_EL2_SYSREG( HACR_EL2 ),
>>> +	PURE_EL2_SYSREG( VTTBR_EL2 ),
>>> +	PURE_EL2_SYSREG( VTCR_EL2 ),
>>> +	PURE_EL2_SYSREG( RVBAR_EL2 ),
>>> +	PURE_EL2_SYSREG( RMR_EL2 ),
>>> +	PURE_EL2_SYSREG( TPIDR_EL2 ),
>>> +	PURE_EL2_SYSREG( CNTVOFF_EL2 ),
>>> +	PURE_EL2_SYSREG( CNTHCTL_EL2 ),
>>> +	PURE_EL2_SYSREG( HPFAR_EL2 ),
>>> +	EL2_SYSREG(      SCTLR_EL2,  SCTLR_EL1,      translate_sctlr ),
>>> +	EL2_SYSREG(      CPTR_EL2,   CPACR_EL1,      translate_cptr  ),
>>> +	EL2_SYSREG(      TTBR0_EL2,  TTBR0_EL1,      translate_ttbr0 ),
>>> +	EL2_SYSREG(      TTBR1_EL2,  TTBR1_EL1,      NULL            ),
>>> +	EL2_SYSREG(      TCR_EL2,    TCR_EL1,        translate_tcr   ),
>>> +	EL2_SYSREG(      VBAR_EL2,   VBAR_EL1,       NULL            ),
>>> +	EL2_SYSREG(      AFSR0_EL2,  AFSR0_EL1,      NULL            ),
>>> +	EL2_SYSREG(      AFSR1_EL2,  AFSR1_EL1,      NULL            ),
>>> +	EL2_SYSREG(      ESR_EL2,    ESR_EL1,        NULL            ),
>>> +	EL2_SYSREG(      FAR_EL2,    FAR_EL1,        NULL            ),
>>> +	EL2_SYSREG(      MAIR_EL2,   MAIR_EL1,       NULL            ),
>>> +	EL2_SYSREG(      AMAIR_EL2,  AMAIR_EL1,      NULL            ),
>>> +};
>> Figuring out which registers are in this map and which aren't and are supposed
>> to be treated differently is really cumbersome because they are split into two
>> types of el2 registers and their order is different from the order in enum
>> vcpu_sysreg (in kvm_host.h). Perhaps adding a comment about what registers will
>> be treated differently would make the code a bit easier to follow?
> I'm not sure what this buys us. We have 3 categories of EL2 sysregs:
> - Purely emulated
> - Directly mapped onto an EL1 sysreg
> - Translated from EL2 to EL1
>
> I think the wrappers represent that pretty well, although we could split
> EL2_SYSREG into DIRECT_EL2_SYSREG and TRANSLATE_EL2_SYSREG. As for the
> order, does it really matter? We also have the trap table order, which
> is also different from the enum. Do you propose we reorder everything?

The wrappers and the naming are fine.

I was trying to figure out which EL2 registers are in the nested_sysreg_map and
which aren't (that's what I meant by "two types of registers") by looking at the
vcpu_sysreg enum. Because the order in the map is different than the order in
the enum, I was having a difficult time figuring out which registers are not in
the nested_sysreg_map to make sure we haven't somehow forgot to emulate a register.

So no, I wasn't asking to reorder everything. I was asking if it would be
appropriate to write a comment stating the intention to treat registers X, Y and
Z separately from the registers in nested_sysreg_map.

>
>>> +
>>> +static
>>> +const struct el2_sysreg_map *find_el2_sysreg(const struct el2_sysreg_map *map,
>>> +					     int reg)
>>> +{
>>> +	const struct el2_sysreg_map *entry;
>>> +
>>> +	if (!sysreg_is_el2(reg))
>>> +		return NULL;
>>> +
>>> +	entry = &nested_sysreg_map[reg - FIRST_EL2_SYSREG];
>>> +	if (entry->sysreg == __INVALID_SYSREG__)
>>> +		return NULL;
>>> +
>>> +	return entry;
>>> +}
>>> +
>>>  u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
>>>  {
>>> +
>>>  	if (!vcpu->arch.sysregs_loaded_on_cpu)
>>>  		goto immediate_read;
>>>  
>>> +	if (unlikely(sysreg_is_el2(reg))) {
>>> +		const struct el2_sysreg_map *el2_reg;
>>> +
>>> +		if (!is_hyp_ctxt(vcpu))
>>> +			goto immediate_read;
>> I'm confused by this. is_hyp_ctxt returns false when the guest is not in vEL2
>> AND HCR_EL.E2H or HCR_EL2.TGE are not set. In this case, the NV bit will not be
>> set and the hardware will raise an undefined instruction exception when
>> accessing an EL2 register from EL1. What am I missing?
> You don't necessarily access an EL2 register just because you run at
> EL2. You also access it because you emulate an EL1 instruction whose
> behaviour is conditioned by an EL2 register.

Got it, now it makes a lot more sense.

>
> Thanks,
>
> 	M.
Marc Zyngier July 4, 2019, 2:39 p.m. UTC | #11
On 03/07/2019 17:32, Alexandru Elisei wrote:

[...]

>>>> +}
>>>> +
>>>> +#define EL2_SYSREG(el2, el1, translate)	\
>>>> +	[el2 - FIRST_EL2_SYSREG] = { el2, el1, translate }
>>>> +#define PURE_EL2_SYSREG(el2) \
>>>> +	[el2 - FIRST_EL2_SYSREG] = { el2,__INVALID_SYSREG__, NULL }
>>>> +/*
>>>> + * Associate vEL2 registers to their EL1 counterparts on the CPU.
>>>> + * The translate function can be NULL, when the register layout is identical.
>>>> + */
>>>> +struct el2_sysreg_map {
>>>> +	int sysreg;	/* EL2 register index into the array above */
>>>> +	int mapping;	/* associated EL1 register */
>>>> +	u64 (*translate)(u64 value);
>>>> +} nested_sysreg_map[NR_SYS_REGS - FIRST_EL2_SYSREG] = {
>>>> +	PURE_EL2_SYSREG( VPIDR_EL2 ),
>>>> +	PURE_EL2_SYSREG( VMPIDR_EL2 ),
>>>> +	PURE_EL2_SYSREG( ACTLR_EL2 ),
>>>> +	PURE_EL2_SYSREG( HCR_EL2 ),
>>>> +	PURE_EL2_SYSREG( MDCR_EL2 ),
>>>> +	PURE_EL2_SYSREG( HSTR_EL2 ),
>>>> +	PURE_EL2_SYSREG( HACR_EL2 ),
>>>> +	PURE_EL2_SYSREG( VTTBR_EL2 ),
>>>> +	PURE_EL2_SYSREG( VTCR_EL2 ),
>>>> +	PURE_EL2_SYSREG( RVBAR_EL2 ),
>>>> +	PURE_EL2_SYSREG( RMR_EL2 ),
>>>> +	PURE_EL2_SYSREG( TPIDR_EL2 ),
>>>> +	PURE_EL2_SYSREG( CNTVOFF_EL2 ),
>>>> +	PURE_EL2_SYSREG( CNTHCTL_EL2 ),
>>>> +	PURE_EL2_SYSREG( HPFAR_EL2 ),
>>>> +	EL2_SYSREG(      SCTLR_EL2,  SCTLR_EL1,      translate_sctlr ),
>>>> +	EL2_SYSREG(      CPTR_EL2,   CPACR_EL1,      translate_cptr  ),
>>>> +	EL2_SYSREG(      TTBR0_EL2,  TTBR0_EL1,      translate_ttbr0 ),
>>>> +	EL2_SYSREG(      TTBR1_EL2,  TTBR1_EL1,      NULL            ),
>>>> +	EL2_SYSREG(      TCR_EL2,    TCR_EL1,        translate_tcr   ),
>>>> +	EL2_SYSREG(      VBAR_EL2,   VBAR_EL1,       NULL            ),
>>>> +	EL2_SYSREG(      AFSR0_EL2,  AFSR0_EL1,      NULL            ),
>>>> +	EL2_SYSREG(      AFSR1_EL2,  AFSR1_EL1,      NULL            ),
>>>> +	EL2_SYSREG(      ESR_EL2,    ESR_EL1,        NULL            ),
>>>> +	EL2_SYSREG(      FAR_EL2,    FAR_EL1,        NULL            ),
>>>> +	EL2_SYSREG(      MAIR_EL2,   MAIR_EL1,       NULL            ),
>>>> +	EL2_SYSREG(      AMAIR_EL2,  AMAIR_EL1,      NULL            ),
>>>> +};
>>> Figuring out which registers are in this map and which aren't and are supposed
>>> to be treated differently is really cumbersome because they are split into two
>>> types of el2 registers and their order is different from the order in enum
>>> vcpu_sysreg (in kvm_host.h). Perhaps adding a comment about what registers will
>>> be treated differently would make the code a bit easier to follow?
>> I'm not sure what this buys us. We have 3 categories of EL2 sysregs:
>> - Purely emulated
>> - Directly mapped onto an EL1 sysreg
>> - Translated from EL2 to EL1
>>
>> I think the wrappers represent that pretty well, although we could split
>> EL2_SYSREG into DIRECT_EL2_SYSREG and TRANSLATE_EL2_SYSREG. As for the
>> order, does it really matter? We also have the trap table order, which
>> is also different from the enum. Do you propose we reorder everything?
> 
> The wrappers and the naming are fine.
> 
> I was trying to figure out which EL2 registers are in the nested_sysreg_map and
> which aren't (that's what I meant by "two types of registers") by looking at the
> vcpu_sysreg enum. Because the order in the map is different than the order in
> the enum, I was having a difficult time figuring out which registers are not in
> the nested_sysreg_map to make sure we haven't somehow forgot to emulate a register.
> 
> So no, I wasn't asking to reorder everything. I was asking if it would be
> appropriate to write a comment stating the intention to treat registers X, Y and
> Z separately from the registers in nested_sysreg_map.

Ah, fair enough. Yes, that's a very reasonable suggestion.

Thanks,

	M.
Marc Zyngier July 4, 2019, 3:05 p.m. UTC | #12
On 26/06/2019 16:04, Alexandru Elisei wrote:
> On 6/21/19 10:37 AM, Marc Zyngier wrote:
>> From: Andre Przywara <andre.przywara@arm.com>
>>
>> KVM internally uses accessor functions when reading or writing the
>> guest's system registers. This takes care of accessing either the stored
>> copy or using the "live" EL1 system registers when the host uses VHE.
>>
>> With the introduction of virtual EL2 we add a bunch of EL2 system
>> registers, which now must also be taken care of:
>> - If the guest is running in vEL2, and we access an EL1 sysreg, we must
>>   revert to the stored version of that, and not use the CPU's copy.
>> - If the guest is running in vEL1, and we access an EL2 sysreg, we must
>>   also use the stored version, since the CPU carries the EL1 copy.
>> - Some EL2 system registers are supposed to affect the current execution
>>   of the system, so we need to put them into their respective EL1
>>   counterparts. For this we need to define a mapping between the two.
>>   This is done using the newly introduced struct el2_sysreg_map.
>> - Some EL2 system registers have a different format than their EL1
>>   counterpart, so we need to translate them before writing them to the
>>   CPU. This is done using an (optional) translate function in the map.
>> - There are the three special registers SP_EL2, SPSR_EL2 and ELR_EL2,
>>   which need some separate handling.
>>
>> All of these cases are now wrapped into the existing accessor functions,
>> so KVM users wouldn't need to care whether they access EL2 or EL1
>> registers and also which state the guest is in.
>>
>> This handles what was formerly known as the "shadow state" dynamically,
>> without requiring a separate copy for each vCPU EL.
>>
>> Signed-off-by: Andre Przywara <andre.przywara@arm.com>
>> Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
>> ---
>>  arch/arm64/include/asm/kvm_emulate.h |   6 +
>>  arch/arm64/include/asm/kvm_host.h    |   5 +
>>  arch/arm64/kvm/sys_regs.c            | 163 +++++++++++++++++++++++++++
>>  3 files changed, 174 insertions(+)
>>
>> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
>> index c43aac5fed69..f37006b6eec4 100644
>> --- a/arch/arm64/include/asm/kvm_emulate.h
>> +++ b/arch/arm64/include/asm/kvm_emulate.h
>> @@ -70,6 +70,12 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu);
>>  int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2);
>>  int kvm_inject_nested_irq(struct kvm_vcpu *vcpu);
>>  
>> +u64 translate_tcr(u64 tcr);
>> +u64 translate_cptr(u64 tcr);
>> +u64 translate_sctlr(u64 tcr);
>> +u64 translate_ttbr0(u64 tcr);
>> +u64 translate_cnthctl(u64 tcr);
>> +
>>  static inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
>>  {
>>  	return !(vcpu->arch.hcr_el2 & HCR_RW);
>> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
>> index 2d4290d2513a..dae9c42a7219 100644
>> --- a/arch/arm64/include/asm/kvm_host.h
>> +++ b/arch/arm64/include/asm/kvm_host.h
>> @@ -217,6 +217,11 @@ enum vcpu_sysreg {
>>  	NR_SYS_REGS	/* Nothing after this line! */
>>  };
>>  
>> +static inline bool sysreg_is_el2(int reg)
>> +{
>> +	return reg >= FIRST_EL2_SYSREG && reg < NR_SYS_REGS;
>> +}
>> +
>>  /* 32bit mapping */
>>  #define c0_MPIDR	(MPIDR_EL1 * 2)	/* MultiProcessor ID Register */
>>  #define c0_CSSELR	(CSSELR_EL1 * 2)/* Cache Size Selection Register */
>> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
>> index 693dd063c9c2..d024114da162 100644
>> --- a/arch/arm64/kvm/sys_regs.c
>> +++ b/arch/arm64/kvm/sys_regs.c
>> @@ -76,11 +76,142 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu,
>>  	return false;
>>  }
>>  
>> +static u64 tcr_el2_ips_to_tcr_el1_ps(u64 tcr_el2)
>> +{
>> +	return ((tcr_el2 & TCR_EL2_PS_MASK) >> TCR_EL2_PS_SHIFT)
>> +		<< TCR_IPS_SHIFT;
>> +}
>> +
>> +u64 translate_tcr(u64 tcr)
>> +{
>> +	return TCR_EPD1_MASK |				/* disable TTBR1_EL1 */
>> +	       ((tcr & TCR_EL2_TBI) ? TCR_TBI0 : 0) |
>> +	       tcr_el2_ips_to_tcr_el1_ps(tcr) |
>> +	       (tcr & TCR_EL2_TG0_MASK) |
>> +	       (tcr & TCR_EL2_ORGN0_MASK) |
>> +	       (tcr & TCR_EL2_IRGN0_MASK) |
>> +	       (tcr & TCR_EL2_T0SZ_MASK);
>> +}
>> +
>> +u64 translate_cptr(u64 cptr_el2)
>> +{
>> +	u64 cpacr_el1 = 0;
>> +
>> +	if (!(cptr_el2 & CPTR_EL2_TFP))
>> +		cpacr_el1 |= CPACR_EL1_FPEN;
>> +	if (cptr_el2 & CPTR_EL2_TTA)
>> +		cpacr_el1 |= CPACR_EL1_TTA;
>> +	if (!(cptr_el2 & CPTR_EL2_TZ))
>> +		cpacr_el1 |= CPACR_EL1_ZEN;
>> +
>> +	return cpacr_el1;
>> +}
>> +
>> +u64 translate_sctlr(u64 sctlr)
>> +{
>> +	/* Bit 20 is RES1 in SCTLR_EL1, but RES0 in SCTLR_EL2 */
>> +	return sctlr | BIT(20);
>> +}
>> +
>> +u64 translate_ttbr0(u64 ttbr0)
>> +{
>> +	/* Force ASID to 0 (ASID 0 or RES0) */
>> +	return ttbr0 & ~GENMASK_ULL(63, 48);
>> +}
>> +
>> +u64 translate_cnthctl(u64 cnthctl)
>> +{
>> +	return ((cnthctl & 0x3) << 10) | (cnthctl & 0xfc);
>> +}
>> +
>> +#define EL2_SYSREG(el2, el1, translate)	\
>> +	[el2 - FIRST_EL2_SYSREG] = { el2, el1, translate }
>> +#define PURE_EL2_SYSREG(el2) \
>> +	[el2 - FIRST_EL2_SYSREG] = { el2,__INVALID_SYSREG__, NULL }
>> +/*
>> + * Associate vEL2 registers to their EL1 counterparts on the CPU.
>> + * The translate function can be NULL, when the register layout is identical.
>> + */
>> +struct el2_sysreg_map {
>> +	int sysreg;	/* EL2 register index into the array above */
>> +	int mapping;	/* associated EL1 register */
>> +	u64 (*translate)(u64 value);
>> +} nested_sysreg_map[NR_SYS_REGS - FIRST_EL2_SYSREG] = {
>> +	PURE_EL2_SYSREG( VPIDR_EL2 ),
>> +	PURE_EL2_SYSREG( VMPIDR_EL2 ),
>> +	PURE_EL2_SYSREG( ACTLR_EL2 ),
>> +	PURE_EL2_SYSREG( HCR_EL2 ),
>> +	PURE_EL2_SYSREG( MDCR_EL2 ),
>> +	PURE_EL2_SYSREG( HSTR_EL2 ),
>> +	PURE_EL2_SYSREG( HACR_EL2 ),
>> +	PURE_EL2_SYSREG( VTTBR_EL2 ),
>> +	PURE_EL2_SYSREG( VTCR_EL2 ),
>> +	PURE_EL2_SYSREG( RVBAR_EL2 ),
>> +	PURE_EL2_SYSREG( RMR_EL2 ),
>> +	PURE_EL2_SYSREG( TPIDR_EL2 ),
>> +	PURE_EL2_SYSREG( CNTVOFF_EL2 ),
>> +	PURE_EL2_SYSREG( CNTHCTL_EL2 ),
>> +	PURE_EL2_SYSREG( HPFAR_EL2 ),
>> +	EL2_SYSREG(      SCTLR_EL2,  SCTLR_EL1,      translate_sctlr ),
>> +	EL2_SYSREG(      CPTR_EL2,   CPACR_EL1,      translate_cptr  ),
>> +	EL2_SYSREG(      TTBR0_EL2,  TTBR0_EL1,      translate_ttbr0 ),
>> +	EL2_SYSREG(      TTBR1_EL2,  TTBR1_EL1,      NULL            ),
>> +	EL2_SYSREG(      TCR_EL2,    TCR_EL1,        translate_tcr   ),
>> +	EL2_SYSREG(      VBAR_EL2,   VBAR_EL1,       NULL            ),
>> +	EL2_SYSREG(      AFSR0_EL2,  AFSR0_EL1,      NULL            ),
>> +	EL2_SYSREG(      AFSR1_EL2,  AFSR1_EL1,      NULL            ),
>> +	EL2_SYSREG(      ESR_EL2,    ESR_EL1,        NULL            ),
>> +	EL2_SYSREG(      FAR_EL2,    FAR_EL1,        NULL            ),
>> +	EL2_SYSREG(      MAIR_EL2,   MAIR_EL1,       NULL            ),
>> +	EL2_SYSREG(      AMAIR_EL2,  AMAIR_EL1,      NULL            ),
>> +};
>> +
>> +static
>> +const struct el2_sysreg_map *find_el2_sysreg(const struct el2_sysreg_map *map,
>> +					     int reg)
>> +{
>> +	const struct el2_sysreg_map *entry;
>> +
>> +	if (!sysreg_is_el2(reg))
>> +		return NULL;
>> +
>> +	entry = &nested_sysreg_map[reg - FIRST_EL2_SYSREG];
>> +	if (entry->sysreg == __INVALID_SYSREG__)
>> +		return NULL;
>> +
>> +	return entry;
>> +}
>> +
>>  u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
>>  {
>> +
>>  	if (!vcpu->arch.sysregs_loaded_on_cpu)
>>  		goto immediate_read;
>>  
>> +	if (unlikely(sysreg_is_el2(reg))) {
>> +		const struct el2_sysreg_map *el2_reg;
>> +
>> +		if (!is_hyp_ctxt(vcpu))
>> +			goto immediate_read;
>> +
>> +		el2_reg = find_el2_sysreg(nested_sysreg_map, reg);
>> +		if (el2_reg) {
>> +			/*
>> +			 * If this register does not have an EL1 counterpart,
>> +			 * then read the stored EL2 version.
>> +			 */
>> +			if (el2_reg->mapping == __INVALID_SYSREG__)
>> +				goto immediate_read;
>> +
>> +			/* Get the current version of the EL1 counterpart. */
>> +			reg = el2_reg->mapping;
>> +		}
>> +	} else {
>> +		/* EL1 register can't be on the CPU if the guest is in vEL2. */
>> +		if (unlikely(is_hyp_ctxt(vcpu)))
>> +			goto immediate_read;
>> +	}
>> +
>>  	/*
>>  	 * System registers listed in the switch are not saved on every
>>  	 * exit from the guest but are only saved on vcpu_put.
>> @@ -114,6 +245,8 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
>>  	case DACR32_EL2:	return read_sysreg_s(SYS_DACR32_EL2);
>>  	case IFSR32_EL2:	return read_sysreg_s(SYS_IFSR32_EL2);
>>  	case DBGVCR32_EL2:	return read_sysreg_s(SYS_DBGVCR32_EL2);
>> +	case SP_EL2:		return read_sysreg(sp_el1);
> From ARM DDI 0487D.b, section Behavior when HCR_EL2.NV == 1: "Reads or writes to
> any allocated and implemented System register or Special-purpose register named
> *_EL2, *_EL02, or *_EL12 in the MRS or MSR instruction, other than SP_EL2, are
> trapped to EL2 rather than being UNDEFINED" (page D5-2480). My interpretation of
> the text is that attempted reads of SP_EL2 from virtual EL2 cause an undefined
> instruction exception.

Sure. Nonetheless, the virtual EL2 has a stack pointer, accessible via
SP_EL1 when it is loaded on the CPU. Somehow, this gets dropped later in
the series (which is a bit wrong). I definitely should bring it back.

Thanks,

	M.
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index c43aac5fed69..f37006b6eec4 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -70,6 +70,12 @@  void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu);
 int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2);
 int kvm_inject_nested_irq(struct kvm_vcpu *vcpu);
 
+u64 translate_tcr(u64 tcr);
+u64 translate_cptr(u64 tcr);
+u64 translate_sctlr(u64 tcr);
+u64 translate_ttbr0(u64 tcr);
+u64 translate_cnthctl(u64 tcr);
+
 static inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
 {
 	return !(vcpu->arch.hcr_el2 & HCR_RW);
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 2d4290d2513a..dae9c42a7219 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -217,6 +217,11 @@  enum vcpu_sysreg {
 	NR_SYS_REGS	/* Nothing after this line! */
 };
 
+static inline bool sysreg_is_el2(int reg)
+{
+	return reg >= FIRST_EL2_SYSREG && reg < NR_SYS_REGS;
+}
+
 /* 32bit mapping */
 #define c0_MPIDR	(MPIDR_EL1 * 2)	/* MultiProcessor ID Register */
 #define c0_CSSELR	(CSSELR_EL1 * 2)/* Cache Size Selection Register */
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 693dd063c9c2..d024114da162 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -76,11 +76,142 @@  static bool write_to_read_only(struct kvm_vcpu *vcpu,
 	return false;
 }
 
+static u64 tcr_el2_ips_to_tcr_el1_ps(u64 tcr_el2)
+{
+	return ((tcr_el2 & TCR_EL2_PS_MASK) >> TCR_EL2_PS_SHIFT)
+		<< TCR_IPS_SHIFT;
+}
+
+u64 translate_tcr(u64 tcr)
+{
+	return TCR_EPD1_MASK |				/* disable TTBR1_EL1 */
+	       ((tcr & TCR_EL2_TBI) ? TCR_TBI0 : 0) |
+	       tcr_el2_ips_to_tcr_el1_ps(tcr) |
+	       (tcr & TCR_EL2_TG0_MASK) |
+	       (tcr & TCR_EL2_ORGN0_MASK) |
+	       (tcr & TCR_EL2_IRGN0_MASK) |
+	       (tcr & TCR_EL2_T0SZ_MASK);
+}
+
+u64 translate_cptr(u64 cptr_el2)
+{
+	u64 cpacr_el1 = 0;
+
+	if (!(cptr_el2 & CPTR_EL2_TFP))
+		cpacr_el1 |= CPACR_EL1_FPEN;
+	if (cptr_el2 & CPTR_EL2_TTA)
+		cpacr_el1 |= CPACR_EL1_TTA;
+	if (!(cptr_el2 & CPTR_EL2_TZ))
+		cpacr_el1 |= CPACR_EL1_ZEN;
+
+	return cpacr_el1;
+}
+
+u64 translate_sctlr(u64 sctlr)
+{
+	/* Bit 20 is RES1 in SCTLR_EL1, but RES0 in SCTLR_EL2 */
+	return sctlr | BIT(20);
+}
+
+u64 translate_ttbr0(u64 ttbr0)
+{
+	/* Force ASID to 0 (ASID 0 or RES0) */
+	return ttbr0 & ~GENMASK_ULL(63, 48);
+}
+
+u64 translate_cnthctl(u64 cnthctl)
+{
+	return ((cnthctl & 0x3) << 10) | (cnthctl & 0xfc);
+}
+
+#define EL2_SYSREG(el2, el1, translate)	\
+	[el2 - FIRST_EL2_SYSREG] = { el2, el1, translate }
+#define PURE_EL2_SYSREG(el2) \
+	[el2 - FIRST_EL2_SYSREG] = { el2,__INVALID_SYSREG__, NULL }
+/*
+ * Associate vEL2 registers to their EL1 counterparts on the CPU.
+ * The translate function can be NULL, when the register layout is identical.
+ */
+struct el2_sysreg_map {
+	int sysreg;	/* EL2 register index into the array above */
+	int mapping;	/* associated EL1 register */
+	u64 (*translate)(u64 value);
+} nested_sysreg_map[NR_SYS_REGS - FIRST_EL2_SYSREG] = {
+	PURE_EL2_SYSREG( VPIDR_EL2 ),
+	PURE_EL2_SYSREG( VMPIDR_EL2 ),
+	PURE_EL2_SYSREG( ACTLR_EL2 ),
+	PURE_EL2_SYSREG( HCR_EL2 ),
+	PURE_EL2_SYSREG( MDCR_EL2 ),
+	PURE_EL2_SYSREG( HSTR_EL2 ),
+	PURE_EL2_SYSREG( HACR_EL2 ),
+	PURE_EL2_SYSREG( VTTBR_EL2 ),
+	PURE_EL2_SYSREG( VTCR_EL2 ),
+	PURE_EL2_SYSREG( RVBAR_EL2 ),
+	PURE_EL2_SYSREG( RMR_EL2 ),
+	PURE_EL2_SYSREG( TPIDR_EL2 ),
+	PURE_EL2_SYSREG( CNTVOFF_EL2 ),
+	PURE_EL2_SYSREG( CNTHCTL_EL2 ),
+	PURE_EL2_SYSREG( HPFAR_EL2 ),
+	EL2_SYSREG(      SCTLR_EL2,  SCTLR_EL1,      translate_sctlr ),
+	EL2_SYSREG(      CPTR_EL2,   CPACR_EL1,      translate_cptr  ),
+	EL2_SYSREG(      TTBR0_EL2,  TTBR0_EL1,      translate_ttbr0 ),
+	EL2_SYSREG(      TTBR1_EL2,  TTBR1_EL1,      NULL            ),
+	EL2_SYSREG(      TCR_EL2,    TCR_EL1,        translate_tcr   ),
+	EL2_SYSREG(      VBAR_EL2,   VBAR_EL1,       NULL            ),
+	EL2_SYSREG(      AFSR0_EL2,  AFSR0_EL1,      NULL            ),
+	EL2_SYSREG(      AFSR1_EL2,  AFSR1_EL1,      NULL            ),
+	EL2_SYSREG(      ESR_EL2,    ESR_EL1,        NULL            ),
+	EL2_SYSREG(      FAR_EL2,    FAR_EL1,        NULL            ),
+	EL2_SYSREG(      MAIR_EL2,   MAIR_EL1,       NULL            ),
+	EL2_SYSREG(      AMAIR_EL2,  AMAIR_EL1,      NULL            ),
+};
+
+static
+const struct el2_sysreg_map *find_el2_sysreg(const struct el2_sysreg_map *map,
+					     int reg)
+{
+	const struct el2_sysreg_map *entry;
+
+	if (!sysreg_is_el2(reg))
+		return NULL;
+
+	entry = &nested_sysreg_map[reg - FIRST_EL2_SYSREG];
+	if (entry->sysreg == __INVALID_SYSREG__)
+		return NULL;
+
+	return entry;
+}
+
 u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
 {
+
 	if (!vcpu->arch.sysregs_loaded_on_cpu)
 		goto immediate_read;
 
+	if (unlikely(sysreg_is_el2(reg))) {
+		const struct el2_sysreg_map *el2_reg;
+
+		if (!is_hyp_ctxt(vcpu))
+			goto immediate_read;
+
+		el2_reg = find_el2_sysreg(nested_sysreg_map, reg);
+		if (el2_reg) {
+			/*
+			 * If this register does not have an EL1 counterpart,
+			 * then read the stored EL2 version.
+			 */
+			if (el2_reg->mapping == __INVALID_SYSREG__)
+				goto immediate_read;
+
+			/* Get the current version of the EL1 counterpart. */
+			reg = el2_reg->mapping;
+		}
+	} else {
+		/* EL1 register can't be on the CPU if the guest is in vEL2. */
+		if (unlikely(is_hyp_ctxt(vcpu)))
+			goto immediate_read;
+	}
+
 	/*
 	 * System registers listed in the switch are not saved on every
 	 * exit from the guest but are only saved on vcpu_put.
@@ -114,6 +245,8 @@  u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
 	case DACR32_EL2:	return read_sysreg_s(SYS_DACR32_EL2);
 	case IFSR32_EL2:	return read_sysreg_s(SYS_IFSR32_EL2);
 	case DBGVCR32_EL2:	return read_sysreg_s(SYS_DBGVCR32_EL2);
+	case SP_EL2:		return read_sysreg(sp_el1);
+	case ELR_EL2:		return read_sysreg_el1(SYS_ELR);
 	}
 
 immediate_read:
@@ -125,6 +258,34 @@  void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
 	if (!vcpu->arch.sysregs_loaded_on_cpu)
 		goto immediate_write;
 
+	if (unlikely(sysreg_is_el2(reg))) {
+		const struct el2_sysreg_map *el2_reg;
+
+		if (!is_hyp_ctxt(vcpu))
+			goto immediate_write;
+
+		/* Store the EL2 version in the sysregs array. */
+		__vcpu_sys_reg(vcpu, reg) = val;
+
+		el2_reg = find_el2_sysreg(nested_sysreg_map, reg);
+		if (el2_reg) {
+			/* Does this register have an EL1 counterpart? */
+			if (el2_reg->mapping == __INVALID_SYSREG__)
+				return;
+
+			if (!vcpu_el2_e2h_is_set(vcpu) &&
+			    el2_reg->translate)
+				val = el2_reg->translate(val);
+
+			/* Redirect this to the EL1 version of the register. */
+			reg = el2_reg->mapping;
+		}
+	} else {
+		/* EL1 register can't be on the CPU if the guest is in vEL2. */
+		if (unlikely(is_hyp_ctxt(vcpu)))
+			goto immediate_write;
+	}
+
 	/*
 	 * System registers listed in the switch are not restored on every
 	 * entry to the guest but are only restored on vcpu_load.
@@ -157,6 +318,8 @@  void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
 	case DACR32_EL2:	write_sysreg_s(val, SYS_DACR32_EL2);	return;
 	case IFSR32_EL2:	write_sysreg_s(val, SYS_IFSR32_EL2);	return;
 	case DBGVCR32_EL2:	write_sysreg_s(val, SYS_DBGVCR32_EL2);	return;
+	case SP_EL2:		write_sysreg(val, sp_el1);		return;
+	case ELR_EL2:		write_sysreg_el1(val, SYS_ELR);		return;
 	}
 
 immediate_write: