
[v11,17/43] KVM: arm64: nv: Support multiple nested Stage-2 mmu structures

Message ID 20231120131027.854038-18-maz@kernel.org (mailing list archive)
State New, archived
Series KVM: arm64: Nested Virtualization support (FEAT_NV2 only)

Commit Message

Marc Zyngier Nov. 20, 2023, 1:10 p.m. UTC
Add Stage-2 mmu data structures for virtual EL2 and for nested guests.
We don't yet populate shadow Stage-2 page tables, but we now have a
framework for getting to a shadow Stage-2 pgd.

We allocate twice as many Stage-2 mmu structures as there are vcpus,
because that is sufficient for each vcpu to run two translation regimes
without having to flush the Stage-2 page tables.

Co-developed-by: Christoffer Dall <christoffer.dall@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/kvm_host.h   |  41 ++++++
 arch/arm64/include/asm/kvm_mmu.h    |   9 ++
 arch/arm64/include/asm/kvm_nested.h |   7 +
 arch/arm64/kvm/arm.c                |  12 ++
 arch/arm64/kvm/mmu.c                |  78 ++++++++---
 arch/arm64/kvm/nested.c             | 207 ++++++++++++++++++++++++++++
 arch/arm64/kvm/reset.c              |   6 +
 7 files changed, 338 insertions(+), 22 deletions(-)
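
To make the sizing rule above concrete, here is a minimal standalone
sketch (illustrative C only, with invented names; it is not the kernel
code in the patch below): each vcpu pins at most one nested stage-2
context at a time via a refcount, so a pool of 2 * nr_vcpus entries
always contains an unpinned entry that a simple round-robin allocator
can hand out without tearing down a context another vcpu is actively
running on.

#include <stddef.h>

struct s2_ctx {
	unsigned long vttbr;	/* lookup key; 1 means "invalid/unused" */
	int refcnt;		/* > 0: some vcpu currently runs on this context */
};

/*
 * Pick a free context, starting the scan at *next so that free slots
 * are cycled through instead of always reusing the same one. With
 * pool_size == 2 * nr_vcpus and at most one pinned context per vcpu,
 * at least nr_vcpus entries are always free, so the scan cannot fail.
 */
static struct s2_ctx *pick_free_ctx(struct s2_ctx *pool, int pool_size, int *next)
{
	for (int n = 0; n < pool_size; n++) {
		struct s2_ctx *c = &pool[(*next + n) % pool_size];

		if (c->refcnt == 0) {
			*next = (*next + n + 1) % pool_size;
			c->refcnt++;
			return c;
		}
	}
	return NULL;	/* unreachable while the sizing invariant holds */
}

The patch implements the same idea with kvm_s2_mmu's refcnt field and
the nested_mmus_next round-robin cursor in get_s2_mmu_nested().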

Comments

Ganapatrao Kulkarni Jan. 23, 2024, 9:55 a.m. UTC | #1
Hi Marc,

On 20-11-2023 06:40 pm, Marc Zyngier wrote:
> Add Stage-2 mmu data structures for virtual EL2 and for nested guests.
> We don't yet populate shadow Stage-2 page tables, but we now have a
> framework for getting to a shadow Stage-2 pgd.
> 
> We allocate twice the number of vcpus as Stage-2 mmu structures because
> that's sufficient for each vcpu running two translation regimes without
> having to flush the Stage-2 page tables.
> 
> Co-developed-by: Christoffer Dall <christoffer.dall@arm.com>
> Signed-off-by: Christoffer Dall <christoffer.dall@arm.com>
> Signed-off-by: Marc Zyngier <maz@kernel.org>
> ---
>   arch/arm64/include/asm/kvm_host.h   |  41 ++++++
>   arch/arm64/include/asm/kvm_mmu.h    |   9 ++
>   arch/arm64/include/asm/kvm_nested.h |   7 +
>   arch/arm64/kvm/arm.c                |  12 ++
>   arch/arm64/kvm/mmu.c                |  78 ++++++++---
>   arch/arm64/kvm/nested.c             | 207 ++++++++++++++++++++++++++++
>   arch/arm64/kvm/reset.c              |   6 +
>   7 files changed, 338 insertions(+), 22 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index f17fb7c42973..eb96fe9b686e 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -188,8 +188,40 @@ struct kvm_s2_mmu {
>   	uint64_t split_page_chunk_size;
>   
>   	struct kvm_arch *arch;
> +
> +	/*
> +	 * For a shadow stage-2 MMU, the virtual vttbr used by the
> +	 * host to parse the guest S2.
> +	 * This either contains:
> +	 * - the virtual VTTBR programmed by the guest hypervisor with
> +         *   CnP cleared
> +	 * - The value 1 (VMID=0, BADDR=0, CnP=1) if invalid
> +	 *
> +	 * We also cache the full VTCR which gets used for TLB invalidation,
> +	 * taking the ARM ARM's "Any of the bits in VTCR_EL2 are permitted
> +	 * to be cached in a TLB" to the letter.
> +	 */
> +	u64	tlb_vttbr;
> +	u64	tlb_vtcr;
> +
> +	/*
> +	 * true when this represents a nested context where virtual
> +	 * HCR_EL2.VM == 1
> +	 */
> +	bool	nested_stage2_enabled;
> +
> +	/*
> +	 *  0: Nobody is currently using this, check vttbr for validity
> +	 * >0: Somebody is actively using this.
> +	 */
> +	atomic_t refcnt;
>   };
>   
> +static inline bool kvm_s2_mmu_valid(struct kvm_s2_mmu *mmu)
> +{
> +	return !(mmu->tlb_vttbr & 1);
> +}
> +
>   struct kvm_arch_memory_slot {
>   };
>   
> @@ -241,6 +273,14 @@ static inline u16 kvm_mpidr_index(struct kvm_mpidr_data *data, u64 mpidr)
>   struct kvm_arch {
>   	struct kvm_s2_mmu mmu;
>   
> +	/*
> +	 * Stage 2 paging state for VMs with nested S2 using a virtual
> +	 * VMID.
> +	 */
> +	struct kvm_s2_mmu *nested_mmus;
> +	size_t nested_mmus_size;
> +	int nested_mmus_next;
> +
>   	/* Interrupt controller */
>   	struct vgic_dist	vgic;
>   
> @@ -1186,6 +1226,7 @@ void kvm_vcpu_load_vhe(struct kvm_vcpu *vcpu);
>   void kvm_vcpu_put_vhe(struct kvm_vcpu *vcpu);
>   
>   int __init kvm_set_ipa_limit(void);
> +u32 kvm_get_pa_bits(struct kvm *kvm);
>   
>   #define __KVM_HAVE_ARCH_VM_ALLOC
>   struct kvm *kvm_arch_alloc_vm(void);
> diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
> index 49e0d4b36bd0..5c6fb2fb8287 100644
> --- a/arch/arm64/include/asm/kvm_mmu.h
> +++ b/arch/arm64/include/asm/kvm_mmu.h
> @@ -119,6 +119,7 @@ alternative_cb_end
>   #include <asm/mmu_context.h>
>   #include <asm/kvm_emulate.h>
>   #include <asm/kvm_host.h>
> +#include <asm/kvm_nested.h>
>   
>   void kvm_update_va_mask(struct alt_instr *alt,
>   			__le32 *origptr, __le32 *updptr, int nr_inst);
> @@ -171,6 +172,7 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
>   int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr);
>   void __init free_hyp_pgds(void);
>   
> +void kvm_unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size);
>   void stage2_unmap_vm(struct kvm *kvm);
>   int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type);
>   void kvm_uninit_stage2_mmu(struct kvm *kvm);
> @@ -339,5 +341,12 @@ static inline struct kvm *kvm_s2_mmu_to_kvm(struct kvm_s2_mmu *mmu)
>   {
>   	return container_of(mmu->arch, struct kvm, arch);
>   }
> +
> +static inline u64 get_vmid(u64 vttbr)
> +{
> +	return (vttbr & VTTBR_VMID_MASK(kvm_get_vmid_bits())) >>
> +		VTTBR_VMID_SHIFT;
> +}
> +
>   #endif /* __ASSEMBLY__ */
>   #endif /* __ARM64_KVM_MMU_H__ */
> diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
> index aa085f2f1947..f421ad294e68 100644
> --- a/arch/arm64/include/asm/kvm_nested.h
> +++ b/arch/arm64/include/asm/kvm_nested.h
> @@ -60,6 +60,13 @@ static inline u64 translate_ttbr0_el2_to_ttbr0_el1(u64 ttbr0)
>   	return ttbr0 & ~GENMASK_ULL(63, 48);
>   }
>   
> +extern void kvm_init_nested(struct kvm *kvm);
> +extern int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu);
> +extern void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu);
> +extern struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu);
> +extern void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu);
> +extern void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu);
> +
>   extern bool forward_smc_trap(struct kvm_vcpu *vcpu);
>   extern bool __check_nv_sr_forward(struct kvm_vcpu *vcpu);
>   
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index b65df612b41b..2e76892c1a56 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -147,6 +147,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
>   	mutex_unlock(&kvm->lock);
>   #endif
>   
> +	kvm_init_nested(kvm);
> +
>   	ret = kvm_share_hyp(kvm, kvm + 1);
>   	if (ret)
>   		return ret;
> @@ -429,6 +431,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>   	struct kvm_s2_mmu *mmu;
>   	int *last_ran;
>   
> +	if (vcpu_has_nv(vcpu))
> +		kvm_vcpu_load_hw_mmu(vcpu);
> +
>   	mmu = vcpu->arch.hw_mmu;
>   	last_ran = this_cpu_ptr(mmu->last_vcpu_ran);
>   
> @@ -479,9 +484,12 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>   	kvm_timer_vcpu_put(vcpu);
>   	kvm_vgic_put(vcpu);
>   	kvm_vcpu_pmu_restore_host(vcpu);
> +	if (vcpu_has_nv(vcpu))
> +		kvm_vcpu_put_hw_mmu(vcpu);
>   	kvm_arm_vmid_clear_active();
>   
>   	vcpu_clear_on_unsupported_cpu(vcpu);
> +
>   	vcpu->cpu = -1;
>   }
>   
> @@ -1336,6 +1344,10 @@ static int kvm_setup_vcpu(struct kvm_vcpu *vcpu)
>   	if (kvm_vcpu_has_pmu(vcpu) && !kvm->arch.arm_pmu)
>   		ret = kvm_arm_set_default_pmu(kvm);
>   
> +	/* Prepare for nested if required */
> +	if (!ret)
> +		ret = kvm_vcpu_init_nested(vcpu);
> +
>   	return ret;
>   }
>   
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index d87c8fcc4c24..588ce46c0ad0 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -305,7 +305,7 @@ static void invalidate_icache_guest_page(void *va, size_t size)
>    * does.
>    */
>   /**
> - * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
> + * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range
>    * @mmu:   The KVM stage-2 MMU pointer
>    * @start: The intermediate physical base address of the range to unmap
>    * @size:  The size of the area to unmap
> @@ -328,7 +328,7 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64
>   				   may_block));
>   }
>   
> -static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
> +void kvm_unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
>   {
>   	__unmap_stage2_range(mmu, start, size, true);
>   }
> @@ -853,21 +853,9 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
>   	.icache_inval_pou	= invalidate_icache_guest_page,
>   };
>   
> -/**
> - * kvm_init_stage2_mmu - Initialise a S2 MMU structure
> - * @kvm:	The pointer to the KVM structure
> - * @mmu:	The pointer to the s2 MMU structure
> - * @type:	The machine type of the virtual machine
> - *
> - * Allocates only the stage-2 HW PGD level table(s).
> - * Note we don't need locking here as this is only called when the VM is
> - * created, which can only be done once.
> - */
> -int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
> +static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
>   {
>   	u32 kvm_ipa_limit = get_kvm_ipa_limit();
> -	int cpu, err;
> -	struct kvm_pgtable *pgt;
>   	u64 mmfr0, mmfr1;
>   	u32 phys_shift;
>   
> @@ -894,11 +882,58 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
>   	mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
>   	mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
>   
> +	return 0;
> +}
> +
> +/**
> + * kvm_init_stage2_mmu - Initialise a S2 MMU structure
> + * @kvm:	The pointer to the KVM structure
> + * @mmu:	The pointer to the s2 MMU structure
> + * @type:	The machine type of the virtual machine
> + *
> + * Allocates only the stage-2 HW PGD level table(s).
> + * Note we don't need locking here as this is only called in two cases:
> + *
> + * - when the VM is created, which can't race against anything
> + *
> + * - when secondary kvm_s2_mmu structures are initialised for NV
> + *   guests, and the caller must hold kvm->lock as this is called on a
> + *   per-vcpu basis.
> + */
> +int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
> +{
> +	int cpu, err;
> +	struct kvm_pgtable *pgt;
> +
> +	/*
> +	 * If we already have our page tables in place, and that the
> +	 * MMU context is the canonical one, we have a bug somewhere,
> +	 * as this is only supposed to ever happen once per VM.
> +	 *
> +	 * Otherwise, we're building nested page tables, and that's
> +	 * probably because userspace called KVM_ARM_VCPU_INIT more
> +	 * than once on the same vcpu. Since that's actually legal,
> +	 * don't kick a fuss and leave gracefully.
> +	 */
>   	if (mmu->pgt != NULL) {
> +		if (&kvm->arch.mmu != mmu)
> +			return 0;
> +
>   		kvm_err("kvm_arch already initialized?\n");
>   		return -EINVAL;
>   	}
>   
> +	/*
> +	 * We only initialise the IPA range on the canonical MMU, so
> +	 * the type is meaningless in all other situations.
> +	 */
> +	if (&kvm->arch.mmu != mmu)
> +		type = kvm_get_pa_bits(kvm);
> +
> +	err = kvm_init_ipa_range(mmu, type);
> +	if (err)
> +		return err;
> +
>   	pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT);
>   	if (!pgt)
>   		return -ENOMEM;
> @@ -923,6 +958,10 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
>   
>   	mmu->pgt = pgt;
>   	mmu->pgd_phys = __pa(pgt->pgd);
> +
> +	if (&kvm->arch.mmu != mmu)
> +		kvm_init_nested_s2_mmu(mmu);
> +
>   	return 0;
>   
>   out_destroy_pgtable:
> @@ -974,7 +1013,7 @@ static void stage2_unmap_memslot(struct kvm *kvm,
>   
>   		if (!(vma->vm_flags & VM_PFNMAP)) {
>   			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
> -			unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
> +			kvm_unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
>   		}
>   		hva = vm_end;
>   	} while (hva < reg_end);
> @@ -2054,11 +2093,6 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
>   {
>   }
>   
> -void kvm_arch_flush_shadow_all(struct kvm *kvm)
> -{
> -	kvm_uninit_stage2_mmu(kvm);
> -}
> -
>   void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
>   				   struct kvm_memory_slot *slot)
>   {
> @@ -2066,7 +2100,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
>   	phys_addr_t size = slot->npages << PAGE_SHIFT;
>   
>   	write_lock(&kvm->mmu_lock);
> -	unmap_stage2_range(&kvm->arch.mmu, gpa, size);
> +	kvm_unmap_stage2_range(&kvm->arch.mmu, gpa, size);
>   	write_unlock(&kvm->mmu_lock);
>   }
>   
> diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
> index 66d05f5d39a2..c5752ab8c3fe 100644
> --- a/arch/arm64/kvm/nested.c
> +++ b/arch/arm64/kvm/nested.c
> @@ -7,7 +7,9 @@
>   #include <linux/kvm.h>
>   #include <linux/kvm_host.h>
>   
> +#include <asm/kvm_arm.h>
>   #include <asm/kvm_emulate.h>
> +#include <asm/kvm_mmu.h>
>   #include <asm/kvm_nested.h>
>   #include <asm/sysreg.h>
>   
> @@ -16,6 +18,211 @@
>   /* Protection against the sysreg repainting madness... */
>   #define NV_FTR(r, f)		ID_AA64##r##_EL1_##f
>   
> +void kvm_init_nested(struct kvm *kvm)
> +{
> +	kvm->arch.nested_mmus = NULL;
> +	kvm->arch.nested_mmus_size = 0;
> +}
> +
> +int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm *kvm = vcpu->kvm;
> +	struct kvm_s2_mmu *tmp;
> +	int num_mmus;
> +	int ret = -ENOMEM;
> +
> +	if (!test_bit(KVM_ARM_VCPU_HAS_EL2, vcpu->kvm->arch.vcpu_features))
> +		return 0;
> +
> +	if (!cpus_have_final_cap(ARM64_HAS_NESTED_VIRT))
> +		return -EINVAL;
> +
> +	/*
> +	 * Let's treat memory allocation failures as benign: If we fail to
> +	 * allocate anything, return an error and keep the allocated array
> +	 * alive. Userspace may try to recover by initializing the vcpu
> +	 * again, and there is no reason to affect the whole VM for this.
> +	 */
> +	num_mmus = atomic_read(&kvm->online_vcpus) * 2;
> +	tmp = krealloc(kvm->arch.nested_mmus,
> +		       num_mmus * sizeof(*kvm->arch.nested_mmus),
> +		       GFP_KERNEL_ACCOUNT | __GFP_ZERO);
> +	if (tmp) {
> +		/*
> +		 * If we went through a reallocation, adjust the MMU
> +		 * back-pointers in the previously initialised
> +		 * pg_table structures.
> +		 */
> +		if (kvm->arch.nested_mmus != tmp) {
> +			int i;
> +
> +			for (i = 0; i < num_mmus - 2; i++)
> +				tmp[i].pgt->mmu = &tmp[i];
> +		}
> +
> +		if (kvm_init_stage2_mmu(kvm, &tmp[num_mmus - 1], 0) ||
> +		    kvm_init_stage2_mmu(kvm, &tmp[num_mmus - 2], 0)) {
> +			kvm_free_stage2_pgd(&tmp[num_mmus - 1]);
> +			kvm_free_stage2_pgd(&tmp[num_mmus - 2]);
> +		} else {
> +			kvm->arch.nested_mmus_size = num_mmus;
> +			ret = 0;
> +		}
> +
> +		kvm->arch.nested_mmus = tmp;
> +	}
> +
> +	return ret;
> +}
> +
> +/* Must be called with kvm->mmu_lock held */
> +struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu)
> +{
> +	bool nested_stage2_enabled;
> +	u64 vttbr, vtcr, hcr;
> +	struct kvm *kvm;
> +	int i;
> +
> +	kvm = vcpu->kvm;
> +
> +	vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
> +	vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2);
> +	hcr = vcpu_read_sys_reg(vcpu, HCR_EL2);
> +
> +	nested_stage2_enabled = hcr & HCR_VM;
> +
> +	/* Don't consider the CnP bit for the vttbr match */
> +	vttbr = vttbr & ~VTTBR_CNP_BIT;
> +
> +	/*
> +	 * Two possibilities when looking up a S2 MMU context:
> +	 *
> +	 * - either S2 is enabled in the guest, and we need a context that is
> +         *   S2-enabled and matches the full VTTBR (VMID+BADDR) and VTCR,
> +         *   which makes it safe from a TLB conflict perspective (a broken
> +         *   guest won't be able to generate them),
> +	 *
> +	 * - or S2 is disabled, and we need a context that is S2-disabled
> +         *   and matches the VMID only, as all TLBs are tagged by VMID even
> +         *   if S2 translation is disabled.
> +	 */
> +	for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
> +		struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
> +
> +		if (!kvm_s2_mmu_valid(mmu))
> +			continue;
> +
> +		if (nested_stage2_enabled &&
> +		    mmu->nested_stage2_enabled &&
> +		    vttbr == mmu->tlb_vttbr &&
> +		    vtcr == mmu->tlb_vtcr)
> +			return mmu;
> +
> +		if (!nested_stage2_enabled &&
> +		    !mmu->nested_stage2_enabled &&
> +		    get_vmid(vttbr) == get_vmid(mmu->tlb_vttbr))
> +			return mmu;
> +	}
> +	return NULL;
> +}
> +
> +/* Must be called with kvm->mmu_lock held */
> +static struct kvm_s2_mmu *get_s2_mmu_nested(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm *kvm = vcpu->kvm;
> +	struct kvm_s2_mmu *s2_mmu;
> +	int i;
> +
> +	s2_mmu = lookup_s2_mmu(vcpu);
> +	if (s2_mmu)
> +		goto out;
> +
> +	/*
> +	 * Make sure we don't always search from the same point, or we
> +	 * will always reuse a potentially active context, leaving
> +	 * free contexts unused.
> +	 */
> +	for (i = kvm->arch.nested_mmus_next;
> +	     i < (kvm->arch.nested_mmus_size + kvm->arch.nested_mmus_next);
> +	     i++) {
> +		s2_mmu = &kvm->arch.nested_mmus[i % kvm->arch.nested_mmus_size];
> +
> +		if (atomic_read(&s2_mmu->refcnt) == 0)
> +			break;
> +	}
> +	BUG_ON(atomic_read(&s2_mmu->refcnt)); /* We have struct MMUs to spare */
> +
> +	/* Set the scene for the next search */
> +	kvm->arch.nested_mmus_next = (i + 1) % kvm->arch.nested_mmus_size;
> +
> +	if (kvm_s2_mmu_valid(s2_mmu)) {
> +		/* Clear the old state */
> +		kvm_unmap_stage2_range(s2_mmu, 0, kvm_phys_size(s2_mmu));
> +		if (atomic64_read(&s2_mmu->vmid.id))
> +			kvm_call_hyp(__kvm_tlb_flush_vmid, s2_mmu);
> +	}
> +
> +	/*
> +	 * The virtual VMID (modulo CnP) will be used as a key when matching
> +	 * an existing kvm_s2_mmu.
> +	 *
> +	 * We cache VTCR at allocation time, once and for all. It'd be great
> +	 * if the guest didn't screw that one up, as this is not very
> +	 * forgiving...
> +	 */
> +	s2_mmu->tlb_vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2) & ~VTTBR_CNP_BIT;
> +	s2_mmu->tlb_vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2);
> +	s2_mmu->nested_stage2_enabled = vcpu_read_sys_reg(vcpu, HCR_EL2) & HCR_VM;
> +
> +out:
> +	atomic_inc(&s2_mmu->refcnt);
> +	return s2_mmu;
> +}
> +
> +void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu)
> +{
> +	mmu->tlb_vttbr = 1;
> +	mmu->nested_stage2_enabled = false;
> +	atomic_set(&mmu->refcnt, 0);
> +}
> +
> +void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu)
> +{
> +	if (is_hyp_ctxt(vcpu)) {
> +		vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
> +	} else {
> +		write_lock(&vcpu->kvm->mmu_lock);
> +		vcpu->arch.hw_mmu = get_s2_mmu_nested(vcpu);
> +		write_unlock(&vcpu->kvm->mmu_lock);
> +	}

Due to a race, a non-existent L2's mmu table is getting loaded for some
of the vCPUs while booting L1 (noticed when booting L1 with a large
number of vCPUs). This happens because, at this early stage, e2h
(hyp-context) is not yet set, so the trap on the ERET of L1's boot-strap
code results in a context switch as if it were returning to L2 (guest
entry), loading an uninitialized mmu table on those vCPUs and causing
unrecoverable traps and aborts.
Adding code (the diff below fixes the issue) to check that stage 2 is
enabled avoids the false ERET handling and continues with L1's mmu
context.

diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 340e2710cdda..1901dd19d770 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -759,7 +759,12 @@ void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu)

  void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu)
  {
-       if (is_hyp_ctxt(vcpu)) {
+       bool nested_stage2_enabled = vcpu_read_sys_reg(vcpu, HCR_EL2) & HCR_VM;
+
+       /* Load L2 mmu only if nested_stage2_enabled, avoid mmu
+        * load due to false ERET trap.
+        */
+       if (is_hyp_ctxt(vcpu) || !nested_stage2_enabled) {
                 vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
         } else {
                 write_lock(&vcpu->kvm->mmu_lock);

Hoping we don't hit this when we move completely to the NV2-based
implementation and e2h is always set?
> +}
> +
> +void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu)
> +{
> +	if (vcpu->arch.hw_mmu != &vcpu->kvm->arch.mmu) {
> +		atomic_dec(&vcpu->arch.hw_mmu->refcnt);
> +		vcpu->arch.hw_mmu = NULL;
> +	}
> +}
> +
> +void kvm_arch_flush_shadow_all(struct kvm *kvm)
> +{
> +	int i;
> +
> +	for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
> +		struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
> +
> +		WARN_ON(atomic_read(&mmu->refcnt));
> +
> +		if (!atomic_read(&mmu->refcnt))
> +			kvm_free_stage2_pgd(mmu);
> +	}
> +	kfree(kvm->arch.nested_mmus);
> +	kvm->arch.nested_mmus = NULL;
> +	kvm->arch.nested_mmus_size = 0;
> +	kvm_uninit_stage2_mmu(kvm);
> +}
> +
>   /*
>    * Our emulated CPU doesn't support all the possible features. For the
>    * sake of simplicity (and probably mental sanity), wipe out a number
> diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
> index 5bb4de162cab..e106ea01598f 100644
> --- a/arch/arm64/kvm/reset.c
> +++ b/arch/arm64/kvm/reset.c
> @@ -266,6 +266,12 @@ void kvm_reset_vcpu(struct kvm_vcpu *vcpu)
>   	preempt_enable();
>   }
>   
> +u32 kvm_get_pa_bits(struct kvm *kvm)
> +{
> +	/* Fixed limit until we can configure ID_AA64MMFR0.PARange */
> +	return kvm_ipa_limit;
> +}
> +
>   u32 get_kvm_ipa_limit(void)
>   {
>   	return kvm_ipa_limit;

Thanks,
Ganapat
Marc Zyngier Jan. 23, 2024, 2:26 p.m. UTC | #2
Hi Ganapatrao,

On Tue, 23 Jan 2024 09:55:32 +0000,
Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> wrote:
> 
> Hi Marc,
> 
> > +void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu)
> > +{
> > +	if (is_hyp_ctxt(vcpu)) {
> > +		vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
> > +	} else {
> > +		write_lock(&vcpu->kvm->mmu_lock);
> > +		vcpu->arch.hw_mmu = get_s2_mmu_nested(vcpu);
> > +		write_unlock(&vcpu->kvm->mmu_lock);
> > +	}
> 
> Due to race, there is a non-existing L2's mmu table is getting loaded
> for some of vCPU while booting L1(noticed with L1 boot using large
> number of vCPUs). This is happening since at the early stage the
> e2h(hyp-context) is not set and trap to eret of L1 boot-strap code
> resulting in context switch as if it is returning to L2(guest enter)
> and loading not initialized mmu table on those vCPUs resulting in
> unrecoverable traps and aborts.

I'm not sure I understand the problem you're describing here.

What is the race exactly? Why isn't the shadow S2 good enough? Not
having HCR_EL2.VM set doesn't mean we can use the same S2, as the TLBs
are tagged by a different VMID, so staying on the canonical S2 seems
wrong.

My expectations are that the L1 ERET from EL2 to EL1 is trapped, and
that we pick an empty S2 and start populating it. What fails in this
process?

> Adding code to check (below diff fixes the issue) stage 2 is enabled
> and avoid the false ERET and continue with L1's mmu context.
> 
> diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
> index 340e2710cdda..1901dd19d770 100644
> --- a/arch/arm64/kvm/nested.c
> +++ b/arch/arm64/kvm/nested.c
> @@ -759,7 +759,12 @@ void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu)
> 
>  void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu)
>  {
> -       if (is_hyp_ctxt(vcpu)) {
> +       bool nested_stage2_enabled = vcpu_read_sys_reg(vcpu, HCR_EL2)
> & HCR_VM;
> +
> +       /* Load L2 mmu only if nested_stage2_enabled, avoid mmu
> +        * load due to false ERET trap.
> +        */
> +       if (is_hyp_ctxt(vcpu) || !nested_stage2_enabled) {
>                 vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
>         } else {
>                 write_lock(&vcpu->kvm->mmu_lock);

As I said above, this doesn't look right.

> Hoping we dont hit this when we move completely NV2 based
> implementation and e2h is always set?

No, the same constraints apply. I don't see why this would change.

Thanks,

	M.
Ganapatrao Kulkarni Jan. 25, 2024, 8:14 a.m. UTC | #3
Hi Marc,

On 23-01-2024 07:56 pm, Marc Zyngier wrote:
> Hi Ganapatrao,
> 
> On Tue, 23 Jan 2024 09:55:32 +0000,
> Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> wrote:
>>
>> Hi Marc,
>>
>>> +void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu)
>>> +{
>>> +	if (is_hyp_ctxt(vcpu)) {
>>> +		vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
>>> +	} else {
>>> +		write_lock(&vcpu->kvm->mmu_lock);
>>> +		vcpu->arch.hw_mmu = get_s2_mmu_nested(vcpu);
>>> +		write_unlock(&vcpu->kvm->mmu_lock);
>>> +	}
>>
>> Due to race, there is a non-existing L2's mmu table is getting loaded
>> for some of vCPU while booting L1(noticed with L1 boot using large
>> number of vCPUs). This is happening since at the early stage the
>> e2h(hyp-context) is not set and trap to eret of L1 boot-strap code
>> resulting in context switch as if it is returning to L2(guest enter)
>> and loading not initialized mmu table on those vCPUs resulting in
>> unrecoverable traps and aborts.
> 
> I'm not sure I understand the problem you're describing here.
> 

IIUC, when the S2 fault happens, the faulting vCPU gets the pages from
the qemu process, maps them in S2 and copies the code to the allocated
memory. Meanwhile, the other vCPUs racing to come online, when they
switch over to the dummy S2, find the mapping and return to L1; the
subsequent execution does not fault but instead fetches from memory
where no code exists yet (for some of them), generates a stage 1
instruction abort and jumps to the abort handler, where again no code
exists, so they keep aborting. This is happening on random vCPUs (no
pattern).

> What is the race exactly? Why isn't the shadow S2 good enough? Not
> having HCR_EL2.VM set doesn't mean we can use the same S2, as the TLBs
> are tagged by a different VMID, so staying on the canonical S2 seems
> wrong.

IMO, it is unnecessary to switch over on the first ERET while L1 is
booting and to repeat the faults and page allocations, which are anyway
redundant once L1 switches to E2H.
Let L1 always use its S2, which is created by L0. We should even
consider avoiding the entry created for L1 in the array of S2-MMUs (the
first entry in the array) and avoid unnecessary iteration/lookup when
unmapping NestedVMs.

I am anticipating that this unwanted switch-over won't happen when we
have NV2-only support in V12?

> 
> My expectations are that the L1 ERET from EL2 to EL1 is trapped, and
> that we pick an empty S2 and start populating it. What fails in this
> process?
> 
>> Adding code to check (below diff fixes the issue) stage 2 is enabled
>> and avoid the false ERET and continue with L1's mmu context.
>>
>> diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
>> index 340e2710cdda..1901dd19d770 100644
>> --- a/arch/arm64/kvm/nested.c
>> +++ b/arch/arm64/kvm/nested.c
>> @@ -759,7 +759,12 @@ void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu)
>>
>>   void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu)
>>   {
>> -       if (is_hyp_ctxt(vcpu)) {
>> +       bool nested_stage2_enabled = vcpu_read_sys_reg(vcpu, HCR_EL2)
>> & HCR_VM;
>> +
>> +       /* Load L2 mmu only if nested_stage2_enabled, avoid mmu
>> +        * load due to false ERET trap.
>> +        */
>> +       if (is_hyp_ctxt(vcpu) || !nested_stage2_enabled) {
>>                  vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
>>          } else {
>>                  write_lock(&vcpu->kvm->mmu_lock);
> 
> As I said above, this doesn't look right.
> 
>> Hoping we dont hit this when we move completely NV2 based
>> implementation and e2h is always set?
> 
> No, the same constraints apply. I don't see why this would change.
> 
> Thanks,
> 
> 	M.
> 

Thanks,
Ganapat
Marc Zyngier Jan. 25, 2024, 8:58 a.m. UTC | #4
On Thu, 25 Jan 2024 08:14:32 +0000,
Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> wrote:
> 
> 
> Hi Marc,
> 
> On 23-01-2024 07:56 pm, Marc Zyngier wrote:
> > Hi Ganapatrao,
> > 
> > On Tue, 23 Jan 2024 09:55:32 +0000,
> > Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> wrote:
> >> 
> >> Hi Marc,
> >> 
> >>> +void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu)
> >>> +{
> >>> +	if (is_hyp_ctxt(vcpu)) {
> >>> +		vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
> >>> +	} else {
> >>> +		write_lock(&vcpu->kvm->mmu_lock);
> >>> +		vcpu->arch.hw_mmu = get_s2_mmu_nested(vcpu);
> >>> +		write_unlock(&vcpu->kvm->mmu_lock);
> >>> +	}
> >> 
> >> Due to race, there is a non-existing L2's mmu table is getting loaded
> >> for some of vCPU while booting L1(noticed with L1 boot using large
> >> number of vCPUs). This is happening since at the early stage the
> >> e2h(hyp-context) is not set and trap to eret of L1 boot-strap code
> >> resulting in context switch as if it is returning to L2(guest enter)
> >> and loading not initialized mmu table on those vCPUs resulting in
> >> unrecoverable traps and aborts.
> > 
> > I'm not sure I understand the problem you're describing here.
> > 
> 
> IIUC, When the S2 fault happens, the faulted vCPU gets the pages from
> qemu process and maps in S2 and copies the code to allocated
> memory. Mean while other vCPUs which are in race to come online, when
> they switches over to dummy S2 finds the mapping and returns to L1 and
> subsequent execution does not fault instead fetches from memory where
> no code exists yet(for some) and generates stage 1 instruction abort
> and jumps to abort handler and even there is no code exist and keeps
> aborting. This is happening on random vCPUs(no pattern).

Why is that any different from the way we handle faults in the
non-nested case? If there is a case where we can map the PTE at S2
before the data is available, this is a generic bug that can trigger
irrespective of NV.

> 
> > What is the race exactly? Why isn't the shadow S2 good enough? Not
> > having HCR_EL2.VM set doesn't mean we can use the same S2, as the TLBs
> > are tagged by a different VMID, so staying on the canonical S2 seems
> > wrong.
> 
> IMO, it is unnecessary to switch-over for first ERET while L1 is
> booting and repeat the faults and page allocation which is anyway
> dummy once L1 switches to E2H.

It is mandated by the architecture. EL1 is, by definition, a different
translation regime from EL2. So we *must* have a different S2, because
that defines the boundaries of TLB creation and invalidation. The
fact that these are the same pages is totally irrelevant.

> Let L1 use its S2 always which is created by L0. Even we should
> consider avoiding the entry created for L1 in array(first entry in the
> array) of S2-MMUs and avoid unnecessary iteration/lookup while unmap
> of NestedVMs.

I'm sorry, but this is just wrong. You are merging the EL1 and EL2
translation regimes, which is not acceptable.

> I am anticipating this unwanted switch-over wont happen when we have
> NV2 only support in V12?

V11 is already NV2 only, so I really don't get what you mean here.
Everything stays the same, and there is nothing to change here.

What you describe looks like a terrible bug somewhere on the
page-fault path that has the potential to impact non-NV, and I'd like
to focus on that.

I've been booting my L1 with a fairly large number of vcpus (32 vcpu
for 6 physical CPUs), and I don't see this.

Since you seem to have a way to trigger it on your HW, can you please
pinpoint the situation where we map the page without having the
corresponding data?

Thanks,

	M.
Ganapatrao Kulkarni Jan. 31, 2024, 9:39 a.m. UTC | #5
Hi Marc,

On 25-01-2024 02:28 pm, Marc Zyngier wrote:
> On Thu, 25 Jan 2024 08:14:32 +0000,
> Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> wrote:
>>
>>
>> Hi Marc,
>>
>> On 23-01-2024 07:56 pm, Marc Zyngier wrote:
>>> Hi Ganapatrao,
>>>
>>> On Tue, 23 Jan 2024 09:55:32 +0000,
>>> Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> wrote:
>>>>
>>>> Hi Marc,
>>>>
>>>>> +void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu)
>>>>> +{
>>>>> +	if (is_hyp_ctxt(vcpu)) {
>>>>> +		vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
>>>>> +	} else {
>>>>> +		write_lock(&vcpu->kvm->mmu_lock);
>>>>> +		vcpu->arch.hw_mmu = get_s2_mmu_nested(vcpu);
>>>>> +		write_unlock(&vcpu->kvm->mmu_lock);
>>>>> +	}
>>>>
>>>> Due to race, there is a non-existing L2's mmu table is getting loaded
>>>> for some of vCPU while booting L1(noticed with L1 boot using large
>>>> number of vCPUs). This is happening since at the early stage the
>>>> e2h(hyp-context) is not set and trap to eret of L1 boot-strap code
>>>> resulting in context switch as if it is returning to L2(guest enter)
>>>> and loading not initialized mmu table on those vCPUs resulting in
>>>> unrecoverable traps and aborts.
>>>
>>> I'm not sure I understand the problem you're describing here.
>>>
>>
>> IIUC, When the S2 fault happens, the faulted vCPU gets the pages from
>> qemu process and maps in S2 and copies the code to allocated
>> memory. Mean while other vCPUs which are in race to come online, when
>> they switches over to dummy S2 finds the mapping and returns to L1 and
>> subsequent execution does not fault instead fetches from memory where
>> no code exists yet(for some) and generates stage 1 instruction abort
>> and jumps to abort handler and even there is no code exist and keeps
>> aborting. This is happening on random vCPUs(no pattern).
> 
> Why is that any different from the way we handle faults in the
> non-nested case? If there is a case where we can map the PTE at S2
> before the data is available, this is a generic bug that can trigger
> irrespective of NV.
> 
>>
>>> What is the race exactly? Why isn't the shadow S2 good enough? Not
>>> having HCR_EL2.VM set doesn't mean we can use the same S2, as the TLBs
>>> are tagged by a different VMID, so staying on the canonical S2 seems
>>> wrong.
>>
>> IMO, it is unnecessary to switch-over for first ERET while L1 is
>> booting and repeat the faults and page allocation which is anyway
>> dummy once L1 switches to E2H.
> 
> It is mandated by the architecture. EL1 is, by definition, a different
> translation regime from EL2. So we *must* have a different S2, because
> that defines the boundaries of TLB creation and invalidation. The
> fact that these are the same pages is totally irrelevant.
> 
>> Let L1 use its S2 always which is created by L0. Even we should
>> consider avoiding the entry created for L1 in array(first entry in the
>> array) of S2-MMUs and avoid unnecessary iteration/lookup while unmap
>> of NestedVMs.
> 
> I'm sorry, but this is just wrong. You are merging the EL1 and EL2
> translation regimes, which is not acceptable.
> 
>> I am anticipating this unwanted switch-over wont happen when we have
>> NV2 only support in V12?
> 
> V11 is already NV2 only, so I really don't get what you mean here.
> Everything stays the same, and there is nothing to change here.
> 

I am still using V10, since V11 (also V12/nv-6.9-sr-enforcement) has
issues booting with QEMU. I tried V11 with my local branch of QEMU,
which is 7.2 based, and also with Eric's QEMU [1], which is rebased on
8.2. The issue is that QEMU crashes at the very beginning. I am not
sure about the cause and have yet to debug it.

[1] https://github.com/eauger/qemu/tree/v8.2-nv

> What you describe looks like a terrible bug somewhere on the
> page-fault path that has the potential to impact non-NV, and I'd like
> to focus on that.

I found the bug/issue and fixed it.
The problem was quite random and was happening when trying to boot L1
with a large number of cores (200 to 300+).

I have implemented (yet to be sent to the ML for review) a fix for the
performance issue [2] caused by the unmapping of shadow tables, by
implementing a lookup table so that only the mapped shadow IPAs are
unmapped instead of unmapping the complete shadow S2 of all active
NestedVMs.

This lookup table was not recording the mappings created for L1 when it
is using the shadow S2-MMU (my bad, I missed noticing that L1 hops
between vEL2 and EL1 during the boot stage), hence when there is a page
migration, the unmap was not getting done for those pages, resulting in
some of L1's vCPUs accessing stale pages/memory.

I have modified the check performed while adding a shadow-IPA to PA
mapping to the lookup table, so that it checks whether the page is
being mapped for a NestedVM or for L1 while it is using the shadow S2.

[2] https://www.spinics.net/lists/kvm/msg326638.html
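
For illustration only, a rough, hypothetical sketch of the general idea
in standalone C with invented names (not the actual, unposted
implementation): record, per shadow S2, which shadow-IPA ranges were
actually mapped and whether each was created for a NestedVM or for L1
running on a shadow S2, so that an unmap only has to walk those ranges
instead of the whole shadow S2 of every active NestedVM.

#include <stdbool.h>
#include <stdlib.h>

/* One contiguous shadow-IPA range that has been mapped in a shadow S2. */
struct shadow_ipa_range {
	unsigned long ipa;
	unsigned long size;
	bool for_l1;		/* created while L1 itself ran on this shadow S2 */
	struct shadow_ipa_range *next;
};

/* Record a new mapping, noting whether it backs a NestedVM or L1 itself. */
static void track_shadow_mapping(struct shadow_ipa_range **head,
				 unsigned long ipa, unsigned long size,
				 bool for_l1)
{
	struct shadow_ipa_range *r = malloc(sizeof(*r));

	if (!r)		/* error handling elided in this sketch */
		return;
	r->ipa = ipa;
	r->size = size;
	r->for_l1 = for_l1;
	r->next = *head;
	*head = r;
}

/* On page migration/unmap, visit only the ranges that were really mapped. */
static void unmap_tracked_ranges(struct shadow_ipa_range **head,
				 void (*unmap)(unsigned long ipa, unsigned long size))
{
	for (struct shadow_ipa_range *r = *head, *next; r; r = next) {
		next = r->next;
		unmap(r->ipa, r->size);
		free(r);
	}
	*head = NULL;
}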

> 
> I've been booting my L1 with a fairly large number of vcpus (32 vcpu
> for 6 physical CPUs), and I don't see this.
> 
> Since you seem to have a way to trigger it on your HW, can you please
> pinpoint the situation where we map the page without having the
> corresponding data?
> 
> Thanks,
> 
> 	M.
> 

Thanks,
Ganapat
Marc Zyngier Jan. 31, 2024, 1:50 p.m. UTC | #6
On Wed, 31 Jan 2024 09:39:34 +0000,
Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> wrote:
> 
> 
> Hi Marc,
> 
> On 25-01-2024 02:28 pm, Marc Zyngier wrote:
> > On Thu, 25 Jan 2024 08:14:32 +0000,
> > Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> wrote:
> >> 
> >> 
> >> Hi Marc,
> >> 
> >> On 23-01-2024 07:56 pm, Marc Zyngier wrote:
> >>> Hi Ganapatrao,
> >>> 
> >>> On Tue, 23 Jan 2024 09:55:32 +0000,
> >>> Ganapatrao Kulkarni <gankulkarni@os.amperecomputing.com> wrote:
> >>>> 
> >>>> Hi Marc,
> >>>> 
> >>>>> +void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu)
> >>>>> +{
> >>>>> +	if (is_hyp_ctxt(vcpu)) {
> >>>>> +		vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
> >>>>> +	} else {
> >>>>> +		write_lock(&vcpu->kvm->mmu_lock);
> >>>>> +		vcpu->arch.hw_mmu = get_s2_mmu_nested(vcpu);
> >>>>> +		write_unlock(&vcpu->kvm->mmu_lock);
> >>>>> +	}
> >>>> 
> >>>> Due to race, there is a non-existing L2's mmu table is getting loaded
> >>>> for some of vCPU while booting L1(noticed with L1 boot using large
> >>>> number of vCPUs). This is happening since at the early stage the
> >>>> e2h(hyp-context) is not set and trap to eret of L1 boot-strap code
> >>>> resulting in context switch as if it is returning to L2(guest enter)
> >>>> and loading not initialized mmu table on those vCPUs resulting in
> >>>> unrecoverable traps and aborts.
> >>> 
> >>> I'm not sure I understand the problem you're describing here.
> >>> 
> >> 
> >> IIUC, When the S2 fault happens, the faulted vCPU gets the pages from
> >> qemu process and maps in S2 and copies the code to allocated
> >> memory. Mean while other vCPUs which are in race to come online, when
> >> they switches over to dummy S2 finds the mapping and returns to L1 and
> >> subsequent execution does not fault instead fetches from memory where
> >> no code exists yet(for some) and generates stage 1 instruction abort
> >> and jumps to abort handler and even there is no code exist and keeps
> >> aborting. This is happening on random vCPUs(no pattern).
> > 
> > Why is that any different from the way we handle faults in the
> > non-nested case? If there is a case where we can map the PTE at S2
> > before the data is available, this is a generic bug that can trigger
> > irrespective of NV.
> > 
> >> 
> >>> What is the race exactly? Why isn't the shadow S2 good enough? Not
> >>> having HCR_EL2.VM set doesn't mean we can use the same S2, as the TLBs
> >>> are tagged by a different VMID, so staying on the canonical S2 seems
> >>> wrong.
> >> 
> >> IMO, it is unnecessary to switch-over for first ERET while L1 is
> >> booting and repeat the faults and page allocation which is anyway
> >> dummy once L1 switches to E2H.
> > 
> > It is mandated by the architecture. EL1 is, by definition, a different
> > translation regime from EL2. So we *must* have a different S2, because
> > that defines the boundaries of TLB creation and invalidation. The
> > fact that these are the same pages is totally irrelevant.
> > 
> >> Let L1 use its S2 always which is created by L0. Even we should
> >> consider avoiding the entry created for L1 in array(first entry in the
> >> array) of S2-MMUs and avoid unnecessary iteration/lookup while unmap
> >> of NestedVMs.
> > 
> > I'm sorry, but this is just wrong. You are merging the EL1 and EL2
> > translation regimes, which is not acceptable.
> > 
> >> I am anticipating this unwanted switch-over wont happen when we have
> >> NV2 only support in V12?
> > 
> > V11 is already NV2 only, so I really don't get what you mean here.
> > Everything stays the same, and there is nothing to change here.
> > 
> 
> I am using still V10 since V11(also V12/nv-6.9-sr-enforcement) has
> issues to boot with QEMU.

Let's be clear: I have no interest in reports against a version that
is older than the current one. If you still use V10, then
congratulations, you are the maintainer of that version.

> Tried V11 with my local branch of QEMU which
> is 7.2 based and also with Eric's QEMU[1] which rebased on 8.2. The
> issue is QEMU crashes at the very beginning itself. Not sure about the
> issue and yet to debug.
> 
> [1] https://github.com/eauger/qemu/tree/v8.2-nv

I have already reported that QEMU was doing some horrible things
behind the kernel's back, and I don't think it is working correctly.

> 
> > What you describe looks like a terrible bug somewhere on the
> > page-fault path that has the potential to impact non-NV, and I'd like
> > to focus on that.
> 
> I found the bug/issue and fixed it.
> The problem was so random and was happening when tried booting L1 with
> large cores(200 to 300+).
> 
> I have implemented(yet to send to ML for review) to fix the
> performance issue[2] due to unmapping of Shadow tables by implementing
> the lookup table to unmap only the mapped Shadow IPAs instead of
> unmapping complete Shadow S2 of all active NestedVMs.

Again, this is irrelevant:

- you develop against an unmaintained version

- you waste time prematurely optimising code that is clearly
  advertised as throw-away

> 
> This lookup table was not adding the mappings created for the L1 when
> it is using the shadow S2-MMU(my bad, missed to notice that the L1
> hops between vEL2 and EL1 at the booting stage), hence when there is a
> page migration, the unmap was not getting done for those pages and
> resulting in access of stale pages/memory by the some of the VCPUs of
> L1.
> 
> I have modified the check while adding the Shadow-IPA to PA mapping to
> a lookup table to check, is this page is getting mapped to NestedVMs
> or to  a L1 while it is using Shadow S2.
> 
> [2] https://www.spinics.net/lists/kvm/msg326638.html

Do I read it correctly that I wasted hours trying to reproduce
something that only exists on an obsolete series together with private
patches?

	M.

Patch

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index f17fb7c42973..eb96fe9b686e 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -188,8 +188,40 @@  struct kvm_s2_mmu {
 	uint64_t split_page_chunk_size;
 
 	struct kvm_arch *arch;
+
+	/*
+	 * For a shadow stage-2 MMU, the virtual vttbr used by the
+	 * host to parse the guest S2.
+	 * This either contains:
+	 * - the virtual VTTBR programmed by the guest hypervisor with
+         *   CnP cleared
+	 * - The value 1 (VMID=0, BADDR=0, CnP=1) if invalid
+	 *
+	 * We also cache the full VTCR which gets used for TLB invalidation,
+	 * taking the ARM ARM's "Any of the bits in VTCR_EL2 are permitted
+	 * to be cached in a TLB" to the letter.
+	 */
+	u64	tlb_vttbr;
+	u64	tlb_vtcr;
+
+	/*
+	 * true when this represents a nested context where virtual
+	 * HCR_EL2.VM == 1
+	 */
+	bool	nested_stage2_enabled;
+
+	/*
+	 *  0: Nobody is currently using this, check vttbr for validity
+	 * >0: Somebody is actively using this.
+	 */
+	atomic_t refcnt;
 };
 
+static inline bool kvm_s2_mmu_valid(struct kvm_s2_mmu *mmu)
+{
+	return !(mmu->tlb_vttbr & 1);
+}
+
 struct kvm_arch_memory_slot {
 };
 
@@ -241,6 +273,14 @@  static inline u16 kvm_mpidr_index(struct kvm_mpidr_data *data, u64 mpidr)
 struct kvm_arch {
 	struct kvm_s2_mmu mmu;
 
+	/*
+	 * Stage 2 paging state for VMs with nested S2 using a virtual
+	 * VMID.
+	 */
+	struct kvm_s2_mmu *nested_mmus;
+	size_t nested_mmus_size;
+	int nested_mmus_next;
+
 	/* Interrupt controller */
 	struct vgic_dist	vgic;
 
@@ -1186,6 +1226,7 @@  void kvm_vcpu_load_vhe(struct kvm_vcpu *vcpu);
 void kvm_vcpu_put_vhe(struct kvm_vcpu *vcpu);
 
 int __init kvm_set_ipa_limit(void);
+u32 kvm_get_pa_bits(struct kvm *kvm);
 
 #define __KVM_HAVE_ARCH_VM_ALLOC
 struct kvm *kvm_arch_alloc_vm(void);
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 49e0d4b36bd0..5c6fb2fb8287 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -119,6 +119,7 @@  alternative_cb_end
 #include <asm/mmu_context.h>
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_host.h>
+#include <asm/kvm_nested.h>
 
 void kvm_update_va_mask(struct alt_instr *alt,
 			__le32 *origptr, __le32 *updptr, int nr_inst);
@@ -171,6 +172,7 @@  int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
 int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr);
 void __init free_hyp_pgds(void);
 
+void kvm_unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size);
 void stage2_unmap_vm(struct kvm *kvm);
 int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type);
 void kvm_uninit_stage2_mmu(struct kvm *kvm);
@@ -339,5 +341,12 @@  static inline struct kvm *kvm_s2_mmu_to_kvm(struct kvm_s2_mmu *mmu)
 {
 	return container_of(mmu->arch, struct kvm, arch);
 }
+
+static inline u64 get_vmid(u64 vttbr)
+{
+	return (vttbr & VTTBR_VMID_MASK(kvm_get_vmid_bits())) >>
+		VTTBR_VMID_SHIFT;
+}
+
 #endif /* __ASSEMBLY__ */
 #endif /* __ARM64_KVM_MMU_H__ */
diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
index aa085f2f1947..f421ad294e68 100644
--- a/arch/arm64/include/asm/kvm_nested.h
+++ b/arch/arm64/include/asm/kvm_nested.h
@@ -60,6 +60,13 @@  static inline u64 translate_ttbr0_el2_to_ttbr0_el1(u64 ttbr0)
 	return ttbr0 & ~GENMASK_ULL(63, 48);
 }
 
+extern void kvm_init_nested(struct kvm *kvm);
+extern int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu);
+extern void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu);
+extern struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu);
+extern void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu);
+extern void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu);
+
 extern bool forward_smc_trap(struct kvm_vcpu *vcpu);
 extern bool __check_nv_sr_forward(struct kvm_vcpu *vcpu);
 
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index b65df612b41b..2e76892c1a56 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -147,6 +147,8 @@  int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	mutex_unlock(&kvm->lock);
 #endif
 
+	kvm_init_nested(kvm);
+
 	ret = kvm_share_hyp(kvm, kvm + 1);
 	if (ret)
 		return ret;
@@ -429,6 +431,9 @@  void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	struct kvm_s2_mmu *mmu;
 	int *last_ran;
 
+	if (vcpu_has_nv(vcpu))
+		kvm_vcpu_load_hw_mmu(vcpu);
+
 	mmu = vcpu->arch.hw_mmu;
 	last_ran = this_cpu_ptr(mmu->last_vcpu_ran);
 
@@ -479,9 +484,12 @@  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	kvm_timer_vcpu_put(vcpu);
 	kvm_vgic_put(vcpu);
 	kvm_vcpu_pmu_restore_host(vcpu);
+	if (vcpu_has_nv(vcpu))
+		kvm_vcpu_put_hw_mmu(vcpu);
 	kvm_arm_vmid_clear_active();
 
 	vcpu_clear_on_unsupported_cpu(vcpu);
+
 	vcpu->cpu = -1;
 }
 
@@ -1336,6 +1344,10 @@  static int kvm_setup_vcpu(struct kvm_vcpu *vcpu)
 	if (kvm_vcpu_has_pmu(vcpu) && !kvm->arch.arm_pmu)
 		ret = kvm_arm_set_default_pmu(kvm);
 
+	/* Prepare for nested if required */
+	if (!ret)
+		ret = kvm_vcpu_init_nested(vcpu);
+
 	return ret;
 }
 
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index d87c8fcc4c24..588ce46c0ad0 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -305,7 +305,7 @@  static void invalidate_icache_guest_page(void *va, size_t size)
  * does.
  */
 /**
- * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
+ * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range
  * @mmu:   The KVM stage-2 MMU pointer
  * @start: The intermediate physical base address of the range to unmap
  * @size:  The size of the area to unmap
@@ -328,7 +328,7 @@  static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64
 				   may_block));
 }
 
-static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
+void kvm_unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
 {
 	__unmap_stage2_range(mmu, start, size, true);
 }
@@ -853,21 +853,9 @@  static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
 	.icache_inval_pou	= invalidate_icache_guest_page,
 };
 
-/**
- * kvm_init_stage2_mmu - Initialise a S2 MMU structure
- * @kvm:	The pointer to the KVM structure
- * @mmu:	The pointer to the s2 MMU structure
- * @type:	The machine type of the virtual machine
- *
- * Allocates only the stage-2 HW PGD level table(s).
- * Note we don't need locking here as this is only called when the VM is
- * created, which can only be done once.
- */
-int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
+static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
 {
 	u32 kvm_ipa_limit = get_kvm_ipa_limit();
-	int cpu, err;
-	struct kvm_pgtable *pgt;
 	u64 mmfr0, mmfr1;
 	u32 phys_shift;
 
@@ -894,11 +882,58 @@  int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
 	mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
 	mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
 
+	return 0;
+}
+
+/**
+ * kvm_init_stage2_mmu - Initialise a S2 MMU structure
+ * @kvm:	The pointer to the KVM structure
+ * @mmu:	The pointer to the s2 MMU structure
+ * @type:	The machine type of the virtual machine
+ *
+ * Allocates only the stage-2 HW PGD level table(s).
+ * Note we don't need locking here as this is only called in two cases:
+ *
+ * - when the VM is created, which can't race against anything
+ *
+ * - when secondary kvm_s2_mmu structures are initialised for NV
+ *   guests, and the caller must hold kvm->lock as this is called on a
+ *   per-vcpu basis.
+ */
+int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
+{
+	int cpu, err;
+	struct kvm_pgtable *pgt;
+
+	/*
+	 * If we already have our page tables in place, and that the
+	 * MMU context is the canonical one, we have a bug somewhere,
+	 * as this is only supposed to ever happen once per VM.
+	 *
+	 * Otherwise, we're building nested page tables, and that's
+	 * probably because userspace called KVM_ARM_VCPU_INIT more
+	 * than once on the same vcpu. Since that's actually legal,
+	 * don't kick a fuss and leave gracefully.
+	 */
 	if (mmu->pgt != NULL) {
+		if (&kvm->arch.mmu != mmu)
+			return 0;
+
 		kvm_err("kvm_arch already initialized?\n");
 		return -EINVAL;
 	}
 
+	/*
+	 * We only initialise the IPA range on the canonical MMU, so
+	 * the type is meaningless in all other situations.
+	 */
+	if (&kvm->arch.mmu != mmu)
+		type = kvm_get_pa_bits(kvm);
+
+	err = kvm_init_ipa_range(mmu, type);
+	if (err)
+		return err;
+
 	pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT);
 	if (!pgt)
 		return -ENOMEM;
@@ -923,6 +958,10 @@  int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
 
 	mmu->pgt = pgt;
 	mmu->pgd_phys = __pa(pgt->pgd);
+
+	if (&kvm->arch.mmu != mmu)
+		kvm_init_nested_s2_mmu(mmu);
+
 	return 0;
 
 out_destroy_pgtable:
@@ -974,7 +1013,7 @@  static void stage2_unmap_memslot(struct kvm *kvm,
 
 		if (!(vma->vm_flags & VM_PFNMAP)) {
 			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
-			unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
+			kvm_unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
 		}
 		hva = vm_end;
 	} while (hva < reg_end);
@@ -2054,11 +2093,6 @@  void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
 {
 }
 
-void kvm_arch_flush_shadow_all(struct kvm *kvm)
-{
-	kvm_uninit_stage2_mmu(kvm);
-}
-
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 				   struct kvm_memory_slot *slot)
 {
@@ -2066,7 +2100,7 @@  void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 	phys_addr_t size = slot->npages << PAGE_SHIFT;
 
 	write_lock(&kvm->mmu_lock);
-	unmap_stage2_range(&kvm->arch.mmu, gpa, size);
+	kvm_unmap_stage2_range(&kvm->arch.mmu, gpa, size);
 	write_unlock(&kvm->mmu_lock);
 }
 
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 66d05f5d39a2..c5752ab8c3fe 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -7,7 +7,9 @@ 
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 
+#include <asm/kvm_arm.h>
 #include <asm/kvm_emulate.h>
+#include <asm/kvm_mmu.h>
 #include <asm/kvm_nested.h>
 #include <asm/sysreg.h>
 
@@ -16,6 +18,211 @@ 
 /* Protection against the sysreg repainting madness... */
 #define NV_FTR(r, f)		ID_AA64##r##_EL1_##f
 
+void kvm_init_nested(struct kvm *kvm)
+{
+	kvm->arch.nested_mmus = NULL;
+	kvm->arch.nested_mmus_size = 0;
+}
+
+int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_s2_mmu *tmp;
+	int num_mmus;
+	int ret = -ENOMEM;
+
+	if (!test_bit(KVM_ARM_VCPU_HAS_EL2, vcpu->kvm->arch.vcpu_features))
+		return 0;
+
+	if (!cpus_have_final_cap(ARM64_HAS_NESTED_VIRT))
+		return -EINVAL;
+
+	/*
+	 * Let's treat memory allocation failures as benign: If we fail to
+	 * allocate anything, return an error and keep the allocated array
+	 * alive. Userspace may try to recover by initializing the vcpu
+	 * again, and there is no reason to affect the whole VM for this.
+	 */
+	num_mmus = atomic_read(&kvm->online_vcpus) * 2;
+	tmp = krealloc(kvm->arch.nested_mmus,
+		       num_mmus * sizeof(*kvm->arch.nested_mmus),
+		       GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	if (tmp) {
+		/*
+		 * If we went through a reallocation, adjust the MMU
+		 * back-pointers in the previously initialised
+		 * pg_table structures.
+		 */
+		if (kvm->arch.nested_mmus != tmp) {
+			int i;
+
+			for (i = 0; i < num_mmus - 2; i++)
+				tmp[i].pgt->mmu = &tmp[i];
+		}
+
+		if (kvm_init_stage2_mmu(kvm, &tmp[num_mmus - 1], 0) ||
+		    kvm_init_stage2_mmu(kvm, &tmp[num_mmus - 2], 0)) {
+			kvm_free_stage2_pgd(&tmp[num_mmus - 1]);
+			kvm_free_stage2_pgd(&tmp[num_mmus - 2]);
+		} else {
+			kvm->arch.nested_mmus_size = num_mmus;
+			ret = 0;
+		}
+
+		kvm->arch.nested_mmus = tmp;
+	}
+
+	return ret;
+}
+
+/* Must be called with kvm->mmu_lock held */
+struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu)
+{
+	bool nested_stage2_enabled;
+	u64 vttbr, vtcr, hcr;
+	struct kvm *kvm;
+	int i;
+
+	kvm = vcpu->kvm;
+
+	vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
+	vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2);
+	hcr = vcpu_read_sys_reg(vcpu, HCR_EL2);
+
+	nested_stage2_enabled = hcr & HCR_VM;
+
+	/* Don't consider the CnP bit for the vttbr match */
+	vttbr = vttbr & ~VTTBR_CNP_BIT;
+
+	/*
+	 * Two possibilities when looking up a S2 MMU context:
+	 *
+	 * - either S2 is enabled in the guest, and we need a context that is
+         *   S2-enabled and matches the full VTTBR (VMID+BADDR) and VTCR,
+         *   which makes it safe from a TLB conflict perspective (a broken
+         *   guest won't be able to generate them),
+	 *
+	 * - or S2 is disabled, and we need a context that is S2-disabled
+         *   and matches the VMID only, as all TLBs are tagged by VMID even
+         *   if S2 translation is disabled.
+	 */
+	for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
+		struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
+
+		if (!kvm_s2_mmu_valid(mmu))
+			continue;
+
+		if (nested_stage2_enabled &&
+		    mmu->nested_stage2_enabled &&
+		    vttbr == mmu->tlb_vttbr &&
+		    vtcr == mmu->tlb_vtcr)
+			return mmu;
+
+		if (!nested_stage2_enabled &&
+		    !mmu->nested_stage2_enabled &&
+		    get_vmid(vttbr) == get_vmid(mmu->tlb_vttbr))
+			return mmu;
+	}
+	return NULL;
+}
+
+/* Must be called with kvm->mmu_lock held */
+static struct kvm_s2_mmu *get_s2_mmu_nested(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_s2_mmu *s2_mmu;
+	int i;
+
+	s2_mmu = lookup_s2_mmu(vcpu);
+	if (s2_mmu)
+		goto out;
+
+	/*
+	 * Make sure we don't always search from the same point, or we
+	 * will always reuse a potentially active context, leaving
+	 * free contexts unused.
+	 */
+	for (i = kvm->arch.nested_mmus_next;
+	     i < (kvm->arch.nested_mmus_size + kvm->arch.nested_mmus_next);
+	     i++) {
+		s2_mmu = &kvm->arch.nested_mmus[i % kvm->arch.nested_mmus_size];
+
+		if (atomic_read(&s2_mmu->refcnt) == 0)
+			break;
+	}
+	BUG_ON(atomic_read(&s2_mmu->refcnt)); /* We have struct MMUs to spare */
+
+	/* Set the scene for the next search */
+	kvm->arch.nested_mmus_next = (i + 1) % kvm->arch.nested_mmus_size;
+
+	if (kvm_s2_mmu_valid(s2_mmu)) {
+		/* Clear the old state */
+		kvm_unmap_stage2_range(s2_mmu, 0, kvm_phys_size(s2_mmu));
+		if (atomic64_read(&s2_mmu->vmid.id))
+			kvm_call_hyp(__kvm_tlb_flush_vmid, s2_mmu);
+	}
+
+	/*
+	 * The virtual VMID (modulo CnP) will be used as a key when matching
+	 * an existing kvm_s2_mmu.
+	 *
+	 * We cache VTCR at allocation time, once and for all. It'd be great
+	 * if the guest didn't screw that one up, as this is not very
+	 * forgiving...
+	 */
+	s2_mmu->tlb_vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2) & ~VTTBR_CNP_BIT;
+	s2_mmu->tlb_vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2);
+	s2_mmu->nested_stage2_enabled = vcpu_read_sys_reg(vcpu, HCR_EL2) & HCR_VM;
+
+out:
+	atomic_inc(&s2_mmu->refcnt);
+	return s2_mmu;
+}
+
+void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu)
+{
+	mmu->tlb_vttbr = 1;
+	mmu->nested_stage2_enabled = false;
+	atomic_set(&mmu->refcnt, 0);
+}
+
+void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu)
+{
+	if (is_hyp_ctxt(vcpu)) {
+		vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
+	} else {
+		write_lock(&vcpu->kvm->mmu_lock);
+		vcpu->arch.hw_mmu = get_s2_mmu_nested(vcpu);
+		write_unlock(&vcpu->kvm->mmu_lock);
+	}
+}
+
+void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.hw_mmu != &vcpu->kvm->arch.mmu) {
+		atomic_dec(&vcpu->arch.hw_mmu->refcnt);
+		vcpu->arch.hw_mmu = NULL;
+	}
+}
+
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
+{
+	int i;
+
+	for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
+		struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
+
+		WARN_ON(atomic_read(&mmu->refcnt));
+
+		if (!atomic_read(&mmu->refcnt))
+			kvm_free_stage2_pgd(mmu);
+	}
+	kfree(kvm->arch.nested_mmus);
+	kvm->arch.nested_mmus = NULL;
+	kvm->arch.nested_mmus_size = 0;
+	kvm_uninit_stage2_mmu(kvm);
+}
+
 /*
  * Our emulated CPU doesn't support all the possible features. For the
  * sake of simplicity (and probably mental sanity), wipe out a number
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index 5bb4de162cab..e106ea01598f 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -266,6 +266,12 @@  void kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 	preempt_enable();
 }
 
+u32 kvm_get_pa_bits(struct kvm *kvm)
+{
+	/* Fixed limit until we can configure ID_AA64MMFR0.PARange */
+	return kvm_ipa_limit;
+}
+
 u32 get_kvm_ipa_limit(void)
 {
 	return kvm_ipa_limit;