Message ID | 20201210160002.1407373-34-maz@kernel.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | KVM: arm64: ARMv8.3/8.4 Nested Virtualization support | expand |
On Fri, 11 Dec 2020 at 00:04, Marc Zyngier <maz@kernel.org> wrote: > > Add Stage-2 mmu data structures for virtual EL2 and for nested guests. > We don't yet populate shadow Stage-2 page tables, but we now have a > framework for getting to a shadow Stage-2 pgd. > > We allocate twice the number of vcpus as Stage-2 mmu structures because > that's sufficient for each vcpu running two translation regimes without > having to flush the Stage-2 page tables. > > Co-developed-by: Christoffer Dall <christoffer.dall@arm.com> > Signed-off-by: Christoffer Dall <christoffer.dall@arm.com> > Signed-off-by: Marc Zyngier <maz@kernel.org> > --- > arch/arm64/include/asm/kvm_host.h | 29 +++++ > arch/arm64/include/asm/kvm_mmu.h | 8 ++ > arch/arm64/include/asm/kvm_nested.h | 7 ++ > arch/arm64/kvm/arm.c | 16 ++- > arch/arm64/kvm/mmu.c | 18 ++- > arch/arm64/kvm/nested.c | 183 ++++++++++++++++++++++++++++ > 6 files changed, 250 insertions(+), 11 deletions(-) > > diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h > index d731cf7a56cb..d99e51e7cbee 100644 > --- a/arch/arm64/include/asm/kvm_host.h > +++ b/arch/arm64/include/asm/kvm_host.h > @@ -95,14 +95,43 @@ struct kvm_s2_mmu { > int __percpu *last_vcpu_ran; > > struct kvm *kvm; > + > + /* > + * For a shadow stage-2 MMU, the virtual vttbr programmed by the guest > + * hypervisor. Unused for kvm_arch->mmu. Set to 1 when the structure > + * contains no valid information. > + */ > + u64 vttbr; > + > + /* true when this represents a nested context where virtual HCR_EL2.VM == 1 */ > + bool nested_stage2_enabled; > + > + /* > + * 0: Nobody is currently using this, check vttbr for validity > + * >0: Somebody is actively using this. > + */ > + atomic_t refcnt; > }; > > +static inline bool kvm_s2_mmu_valid(struct kvm_s2_mmu *mmu) > +{ > + return !(mmu->vttbr & 1); > +} > + > struct kvm_arch_memory_slot { > }; > > struct kvm_arch { > struct kvm_s2_mmu mmu; > > + /* > + * Stage 2 paging stage for VMs with nested virtual using a virtual > + * VMID. > + */ > + struct kvm_s2_mmu *nested_mmus; > + size_t nested_mmus_size; > + int nested_mmus_next; > + > /* VTCR_EL2 value for this VM */ > u64 vtcr; > > diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h > index 76a8a0ca45b8..ec39015bb2a6 100644 > --- a/arch/arm64/include/asm/kvm_mmu.h > +++ b/arch/arm64/include/asm/kvm_mmu.h > @@ -126,6 +126,7 @@ alternative_cb_end > #include <asm/cacheflush.h> > #include <asm/mmu_context.h> > #include <asm/kvm_emulate.h> > +#include <asm/kvm_nested.h> > > void kvm_update_va_mask(struct alt_instr *alt, > __le32 *origptr, __le32 *updptr, int nr_inst); > @@ -184,6 +185,7 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, > void **haddr); > void free_hyp_pgds(void); > > +void kvm_unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size); > void stage2_unmap_vm(struct kvm *kvm); > int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu); > void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu); > @@ -306,5 +308,11 @@ static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu) > asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); > } > > +static inline u64 get_vmid(u64 vttbr) > +{ > + return (vttbr & VTTBR_VMID_MASK(kvm_get_vmid_bits())) >> > + VTTBR_VMID_SHIFT; > +} > + > #endif /* __ASSEMBLY__ */ > #endif /* __ARM64_KVM_MMU_H__ */ > diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h > index 026ddaad972c..473ecd1d60d0 100644 > --- a/arch/arm64/include/asm/kvm_nested.h > +++ b/arch/arm64/include/asm/kvm_nested.h > @@ -61,6 +61,13 @@ static inline u64 translate_cnthctl_el2_to_cntkctl_el1(u64 cnthctl) > (cnthctl & (CNTHCTL_EVNTI | CNTHCTL_EVNTDIR | CNTHCTL_EVNTEN))); > } > > +extern void kvm_init_nested(struct kvm *kvm); > +extern int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu); > +extern void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu); > +extern struct kvm_s2_mmu *lookup_s2_mmu(struct kvm *kvm, u64 vttbr, u64 hcr); > +extern void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu); > +extern void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu); > + > int handle_wfx_nested(struct kvm_vcpu *vcpu, bool is_wfe); > extern bool __forward_traps(struct kvm_vcpu *vcpu, unsigned int reg, > u64 control_bit); > diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c > index 6e637d2b4cfb..1656dd80bbc4 100644 > --- a/arch/arm64/kvm/arm.c > +++ b/arch/arm64/kvm/arm.c > @@ -35,6 +35,7 @@ > #include <asm/kvm_arm.h> > #include <asm/kvm_asm.h> > #include <asm/kvm_mmu.h> > +#include <asm/kvm_nested.h> > #include <asm/kvm_emulate.h> > #include <asm/sections.h> > > @@ -142,6 +143,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) > if (ret) > return ret; > > + kvm_init_nested(kvm); > + > ret = create_hyp_mappings(kvm, kvm + 1, PAGE_HYP); > if (ret) > goto out_free_stage2_pgd; > @@ -385,6 +388,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) > struct kvm_s2_mmu *mmu; > int *last_ran; > > + if (nested_virt_in_use(vcpu)) > + kvm_vcpu_load_hw_mmu(vcpu); > + > mmu = vcpu->arch.hw_mmu; > last_ran = this_cpu_ptr(mmu->last_vcpu_ran); > > @@ -426,6 +432,9 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) > kvm_vgic_put(vcpu); > kvm_vcpu_pmu_restore_host(vcpu); > > + if (nested_virt_in_use(vcpu)) > + kvm_vcpu_put_hw_mmu(vcpu); > + > vcpu->cpu = -1; > } > > @@ -1026,8 +1035,13 @@ static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, > > vcpu->arch.target = phys_target; > > + /* Prepare for nested if required */ > + ret = kvm_vcpu_init_nested(vcpu); > + > /* Now we know what it is, we can reset it. */ > - ret = kvm_reset_vcpu(vcpu); > + if (!ret) > + ret = kvm_reset_vcpu(vcpu); > + > if (ret) { > vcpu->arch.target = -1; > bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c > index 1f41173e6149..2f0302211af3 100644 > --- a/arch/arm64/kvm/mmu.c > +++ b/arch/arm64/kvm/mmu.c > @@ -113,7 +113,7 @@ static bool kvm_is_device_pfn(unsigned long pfn) > * does. > */ > /** > - * unmap_stage2_range -- Clear stage2 page table entries to unmap a range > + * kvm_unmap_stage2_range -- Clear stage2 page table entries to unmap a range > * @mmu: The KVM stage-2 MMU pointer > * @start: The intermediate physical base address of the range to unmap > * @size: The size of the area to unmap > @@ -136,7 +136,7 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 > may_block)); > } > > -static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size) > +void kvm_unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size) > { > __unmap_stage2_range(mmu, start, size, true); > } > @@ -391,6 +391,9 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) > mmu->pgt = pgt; > mmu->pgd_phys = __pa(pgt->pgd); > mmu->vmid.vmid_gen = 0; > + > + kvm_init_nested_s2_mmu(mmu); > + > return 0; > > out_destroy_pgtable: > @@ -435,7 +438,7 @@ static void stage2_unmap_memslot(struct kvm *kvm, > > if (!(vma->vm_flags & VM_PFNMAP)) { > gpa_t gpa = addr + (vm_start - memslot->userspace_addr); > - unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start); > + kvm_unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start); > } > hva = vm_end; > } while (hva < reg_end); > @@ -1360,7 +1363,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, > > spin_lock(&kvm->mmu_lock); > if (ret) > - unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size); > + kvm_unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size); > else if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) > stage2_flush_memslot(kvm, memslot); > spin_unlock(&kvm->mmu_lock); > @@ -1377,11 +1380,6 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) > { > } > > -void kvm_arch_flush_shadow_all(struct kvm *kvm) > -{ > - kvm_free_stage2_pgd(&kvm->arch.mmu); > -} > - > void kvm_arch_flush_shadow_memslot(struct kvm *kvm, > struct kvm_memory_slot *slot) > { > @@ -1389,7 +1387,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, > phys_addr_t size = slot->npages << PAGE_SHIFT; > > spin_lock(&kvm->mmu_lock); > - unmap_stage2_range(&kvm->arch.mmu, gpa, size); > + kvm_unmap_stage2_range(&kvm->arch.mmu, gpa, size); > spin_unlock(&kvm->mmu_lock); > } > > diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c > index 9fb44bc7db3f..8e85d2ef24d9 100644 > --- a/arch/arm64/kvm/nested.c > +++ b/arch/arm64/kvm/nested.c > @@ -19,12 +19,177 @@ > #include <linux/kvm.h> > #include <linux/kvm_host.h> > > +#include <asm/kvm_arm.h> > #include <asm/kvm_emulate.h> > +#include <asm/kvm_mmu.h> > #include <asm/kvm_nested.h> > #include <asm/sysreg.h> > > #include "sys_regs.h" > > +void kvm_init_nested(struct kvm *kvm) > +{ > + kvm->arch.nested_mmus = NULL; > + kvm->arch.nested_mmus_size = 0; > +} > + > +int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) > +{ > + struct kvm *kvm = vcpu->kvm; > + struct kvm_s2_mmu *tmp; > + int num_mmus; > + int ret = -ENOMEM; > + > + if (!test_bit(KVM_ARM_VCPU_HAS_EL2, vcpu->arch.features)) > + return 0; > + > + if (!cpus_have_final_cap(ARM64_HAS_NESTED_VIRT)) > + return -EINVAL; nit: returning a "not supported" kind of errno? > + > + mutex_lock(&kvm->lock); > + > + /* > + * Let's treat memory allocation failures as benign: If we fail to > + * allocate anything, return an error and keep the allocated array > + * alive. Userspace may try to recover by intializing the vcpu > + * again, and there is no reason to affect the whole VM for this. > + */ > + num_mmus = atomic_read(&kvm->online_vcpus) * 2; > + tmp = krealloc(kvm->arch.nested_mmus, > + num_mmus * sizeof(*kvm->arch.nested_mmus), > + GFP_KERNEL | __GFP_ZERO); > + if (tmp) { > + if (kvm_init_stage2_mmu(kvm, &tmp[num_mmus - 1]) || > + kvm_init_stage2_mmu(kvm, &tmp[num_mmus - 2])) { > + kvm_free_stage2_pgd(&tmp[num_mmus - 1]); > + kvm_free_stage2_pgd(&tmp[num_mmus - 2]); > + } else { > + kvm->arch.nested_mmus_size = num_mmus; > + ret = 0; > + } > + > + kvm->arch.nested_mmus = tmp; > + } > + > + mutex_unlock(&kvm->lock); > + return ret; > +} > + > +/* Must be called with kvm->lock held */ > +struct kvm_s2_mmu *lookup_s2_mmu(struct kvm *kvm, u64 vttbr, u64 hcr) > +{ > + bool nested_stage2_enabled = hcr & HCR_VM; > + int i; > + > + /* Don't consider the CnP bit for the vttbr match */ > + vttbr = vttbr & ~VTTBR_CNP_BIT; > + > + /* > + * Two possibilities when looking up a S2 MMU context: > + * > + * - either S2 is enabled in the guest, and we need a context that > + * is S2-enabled and matches the full VTTBR (VMID+BADDR), which > + * makes it safe from a TLB conflict perspective (a broken guest > + * won't be able to generate them), > + * > + * - or S2 is disabled, and we need a context that is S2-disabled > + * and matches the VMID only, as all TLBs are tagged by VMID even > + * if S2 translation is enabled. > + */ > + for (i = 0; i < kvm->arch.nested_mmus_size; i++) { > + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; > + > + if (!kvm_s2_mmu_valid(mmu)) > + continue; > + > + if (nested_stage2_enabled && > + mmu->nested_stage2_enabled && > + vttbr == mmu->vttbr) > + return mmu; > + > + if (!nested_stage2_enabled && > + !mmu->nested_stage2_enabled && > + get_vmid(vttbr) == get_vmid(mmu->vttbr)) > + return mmu; > + } > + return NULL; > +} > + > +static struct kvm_s2_mmu *get_s2_mmu_nested(struct kvm_vcpu *vcpu) > +{ > + struct kvm *kvm = vcpu->kvm; > + u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); > + u64 hcr= vcpu_read_sys_reg(vcpu, HCR_EL2); > + struct kvm_s2_mmu *s2_mmu; > + int i; > + > + s2_mmu = lookup_s2_mmu(kvm, vttbr, hcr); > + if (s2_mmu) > + goto out; > + > + /* > + * Make sure we don't always search from the same point, or we > + * will always reuse a potentially active context, leaving > + * free contexts unused. > + */ > + for (i = kvm->arch.nested_mmus_next; > + i < (kvm->arch.nested_mmus_size + kvm->arch.nested_mmus_next); > + i++) { > + s2_mmu = &kvm->arch.nested_mmus[i % kvm->arch.nested_mmus_size]; > + > + if (atomic_read(&s2_mmu->refcnt) == 0) > + break; > + } > + BUG_ON(atomic_read(&s2_mmu->refcnt)); /* We have struct MMUs to spare */ > + > + /* Set the scene for the next search */ > + kvm->arch.nested_mmus_next = (i + 1) % kvm->arch.nested_mmus_size; > + > + if (kvm_s2_mmu_valid(s2_mmu)) { > + /* Clear the old state */ > + kvm_unmap_stage2_range(s2_mmu, 0, kvm_phys_size(kvm)); > + if (s2_mmu->vmid.vmid_gen) > + kvm_call_hyp(__kvm_tlb_flush_vmid, s2_mmu); > + } > + > + /* > + * The virtual VMID (modulo CnP) will be used as a key when matching > + * an existing kvm_s2_mmu. > + */ > + s2_mmu->vttbr = vttbr & ~VTTBR_CNP_BIT; > + s2_mmu->nested_stage2_enabled = hcr & HCR_VM; > + > +out: > + atomic_inc(&s2_mmu->refcnt); > + return s2_mmu; > +} > + > +void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu) > +{ > + mmu->vttbr = 1; > + mmu->nested_stage2_enabled = false; > + atomic_set(&mmu->refcnt, 0); > +} > + > +void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu) > +{ > + if (is_hyp_ctxt(vcpu)) { > + vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu; > + } else { > + spin_lock(&vcpu->kvm->mmu_lock); > + vcpu->arch.hw_mmu = get_s2_mmu_nested(vcpu); > + spin_unlock(&vcpu->kvm->mmu_lock); > + } > +} > + > +void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu) > +{ > + if (vcpu->arch.hw_mmu != &vcpu->kvm->arch.mmu) { > + atomic_dec(&vcpu->arch.hw_mmu->refcnt); > + vcpu->arch.hw_mmu = NULL; > + } > +} > + > /* > * Inject wfx to the virtual EL2 if this is not from the virtual EL2 and > * the virtual HCR_EL2.TWX is set. Otherwise, let the host hypervisor > @@ -43,6 +208,24 @@ int handle_wfx_nested(struct kvm_vcpu *vcpu, bool is_wfe) > return -EINVAL; > } > > +void kvm_arch_flush_shadow_all(struct kvm *kvm) > +{ > + int i; > + > + for (i = 0; i < kvm->arch.nested_mmus_size; i++) { > + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; > + > + WARN_ON(atomic_read(&mmu->refcnt)); > + > + if (!atomic_read(&mmu->refcnt)) > + kvm_free_stage2_pgd(mmu); > + } > + kfree(kvm->arch.nested_mmus); > + kvm->arch.nested_mmus = NULL; > + kvm->arch.nested_mmus_size = 0; > + kvm_free_stage2_pgd(&kvm->arch.mmu); > +} > + > #define FEATURE(x) (GENMASK_ULL(x##_SHIFT + 3, x##_SHIFT)) > > /* > -- > 2.29.2 > > _______________________________________________ > kvmarm mailing list > kvmarm@lists.cs.columbia.edu > https://lists.cs.columbia.edu/mailman/listinfo/kvmarm
On Thu, 21 Jan 2021 02:59:13 +0000, Haibo Xu <haibo.xu@linaro.org> wrote: > > On Fri, 11 Dec 2020 at 00:04, Marc Zyngier <maz@kernel.org> wrote: [...] > > diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c > > index 9fb44bc7db3f..8e85d2ef24d9 100644 > > --- a/arch/arm64/kvm/nested.c > > +++ b/arch/arm64/kvm/nested.c > > @@ -19,12 +19,177 @@ > > #include <linux/kvm.h> > > #include <linux/kvm_host.h> > > > > +#include <asm/kvm_arm.h> > > #include <asm/kvm_emulate.h> > > +#include <asm/kvm_mmu.h> > > #include <asm/kvm_nested.h> > > #include <asm/sysreg.h> > > > > #include "sys_regs.h" > > > > +void kvm_init_nested(struct kvm *kvm) > > +{ > > + kvm->arch.nested_mmus = NULL; > > + kvm->arch.nested_mmus_size = 0; > > +} > > + > > +int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) > > +{ > > + struct kvm *kvm = vcpu->kvm; > > + struct kvm_s2_mmu *tmp; > > + int num_mmus; > > + int ret = -ENOMEM; > > + > > + if (!test_bit(KVM_ARM_VCPU_HAS_EL2, vcpu->arch.features)) > > + return 0; > > + > > + if (!cpus_have_final_cap(ARM64_HAS_NESTED_VIRT)) > > + return -EINVAL; > > nit: returning a "not supported" kind of errno? I think this is consistent with what we otherwise return when there is a mismatch between requested features from userspace and host capabilities (such as asking for a 32bit guest on a 64bit only CPU). What seems to be missing though is a way to *advertise* the feature to userspace as such: diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 97d377265d8e..0225b81d3434 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -243,6 +243,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_EL1_32BIT: r = cpus_have_const_cap(ARM64_HAS_32BIT_EL1); break; + case KVM_CAP_ARM_EL2: + r = cpus_have_const_cap(ARM64_HAS_NESTED_VIRT); + break; case KVM_CAP_GUEST_DEBUG_HW_BPS: r = get_num_brps(); break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 374c67875cdb..555a60ac00d8 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1058,6 +1058,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190 #define KVM_CAP_SYS_HYPERV_CPUID 191 #define KVM_CAP_DIRTY_LOG_RING 192 +#define KVM_CAP_ARM_EL2 193 #ifdef KVM_CAP_IRQ_ROUTING Thanks, M.
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index d731cf7a56cb..d99e51e7cbee 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -95,14 +95,43 @@ struct kvm_s2_mmu { int __percpu *last_vcpu_ran; struct kvm *kvm; + + /* + * For a shadow stage-2 MMU, the virtual vttbr programmed by the guest + * hypervisor. Unused for kvm_arch->mmu. Set to 1 when the structure + * contains no valid information. + */ + u64 vttbr; + + /* true when this represents a nested context where virtual HCR_EL2.VM == 1 */ + bool nested_stage2_enabled; + + /* + * 0: Nobody is currently using this, check vttbr for validity + * >0: Somebody is actively using this. + */ + atomic_t refcnt; }; +static inline bool kvm_s2_mmu_valid(struct kvm_s2_mmu *mmu) +{ + return !(mmu->vttbr & 1); +} + struct kvm_arch_memory_slot { }; struct kvm_arch { struct kvm_s2_mmu mmu; + /* + * Stage 2 paging stage for VMs with nested virtual using a virtual + * VMID. + */ + struct kvm_s2_mmu *nested_mmus; + size_t nested_mmus_size; + int nested_mmus_next; + /* VTCR_EL2 value for this VM */ u64 vtcr; diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 76a8a0ca45b8..ec39015bb2a6 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -126,6 +126,7 @@ alternative_cb_end #include <asm/cacheflush.h> #include <asm/mmu_context.h> #include <asm/kvm_emulate.h> +#include <asm/kvm_nested.h> void kvm_update_va_mask(struct alt_instr *alt, __le32 *origptr, __le32 *updptr, int nr_inst); @@ -184,6 +185,7 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, void **haddr); void free_hyp_pgds(void); +void kvm_unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size); void stage2_unmap_vm(struct kvm *kvm); int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu); void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu); @@ -306,5 +308,11 @@ static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu) asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); } +static inline u64 get_vmid(u64 vttbr) +{ + return (vttbr & VTTBR_VMID_MASK(kvm_get_vmid_bits())) >> + VTTBR_VMID_SHIFT; +} + #endif /* __ASSEMBLY__ */ #endif /* __ARM64_KVM_MMU_H__ */ diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index 026ddaad972c..473ecd1d60d0 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -61,6 +61,13 @@ static inline u64 translate_cnthctl_el2_to_cntkctl_el1(u64 cnthctl) (cnthctl & (CNTHCTL_EVNTI | CNTHCTL_EVNTDIR | CNTHCTL_EVNTEN))); } +extern void kvm_init_nested(struct kvm *kvm); +extern int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu); +extern void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu); +extern struct kvm_s2_mmu *lookup_s2_mmu(struct kvm *kvm, u64 vttbr, u64 hcr); +extern void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu); +extern void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu); + int handle_wfx_nested(struct kvm_vcpu *vcpu, bool is_wfe); extern bool __forward_traps(struct kvm_vcpu *vcpu, unsigned int reg, u64 control_bit); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 6e637d2b4cfb..1656dd80bbc4 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -35,6 +35,7 @@ #include <asm/kvm_arm.h> #include <asm/kvm_asm.h> #include <asm/kvm_mmu.h> +#include <asm/kvm_nested.h> #include <asm/kvm_emulate.h> #include <asm/sections.h> @@ -142,6 +143,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (ret) return ret; + kvm_init_nested(kvm); + ret = create_hyp_mappings(kvm, kvm + 1, PAGE_HYP); if (ret) goto out_free_stage2_pgd; @@ -385,6 +388,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) struct kvm_s2_mmu *mmu; int *last_ran; + if (nested_virt_in_use(vcpu)) + kvm_vcpu_load_hw_mmu(vcpu); + mmu = vcpu->arch.hw_mmu; last_ran = this_cpu_ptr(mmu->last_vcpu_ran); @@ -426,6 +432,9 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) kvm_vgic_put(vcpu); kvm_vcpu_pmu_restore_host(vcpu); + if (nested_virt_in_use(vcpu)) + kvm_vcpu_put_hw_mmu(vcpu); + vcpu->cpu = -1; } @@ -1026,8 +1035,13 @@ static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, vcpu->arch.target = phys_target; + /* Prepare for nested if required */ + ret = kvm_vcpu_init_nested(vcpu); + /* Now we know what it is, we can reset it. */ - ret = kvm_reset_vcpu(vcpu); + if (!ret) + ret = kvm_reset_vcpu(vcpu); + if (ret) { vcpu->arch.target = -1; bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 1f41173e6149..2f0302211af3 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -113,7 +113,7 @@ static bool kvm_is_device_pfn(unsigned long pfn) * does. */ /** - * unmap_stage2_range -- Clear stage2 page table entries to unmap a range + * kvm_unmap_stage2_range -- Clear stage2 page table entries to unmap a range * @mmu: The KVM stage-2 MMU pointer * @start: The intermediate physical base address of the range to unmap * @size: The size of the area to unmap @@ -136,7 +136,7 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 may_block)); } -static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size) +void kvm_unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size) { __unmap_stage2_range(mmu, start, size, true); } @@ -391,6 +391,9 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) mmu->pgt = pgt; mmu->pgd_phys = __pa(pgt->pgd); mmu->vmid.vmid_gen = 0; + + kvm_init_nested_s2_mmu(mmu); + return 0; out_destroy_pgtable: @@ -435,7 +438,7 @@ static void stage2_unmap_memslot(struct kvm *kvm, if (!(vma->vm_flags & VM_PFNMAP)) { gpa_t gpa = addr + (vm_start - memslot->userspace_addr); - unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start); + kvm_unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start); } hva = vm_end; } while (hva < reg_end); @@ -1360,7 +1363,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, spin_lock(&kvm->mmu_lock); if (ret) - unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size); + kvm_unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size); else if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) stage2_flush_memslot(kvm, memslot); spin_unlock(&kvm->mmu_lock); @@ -1377,11 +1380,6 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) { } -void kvm_arch_flush_shadow_all(struct kvm *kvm) -{ - kvm_free_stage2_pgd(&kvm->arch.mmu); -} - void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { @@ -1389,7 +1387,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, phys_addr_t size = slot->npages << PAGE_SHIFT; spin_lock(&kvm->mmu_lock); - unmap_stage2_range(&kvm->arch.mmu, gpa, size); + kvm_unmap_stage2_range(&kvm->arch.mmu, gpa, size); spin_unlock(&kvm->mmu_lock); } diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 9fb44bc7db3f..8e85d2ef24d9 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -19,12 +19,177 @@ #include <linux/kvm.h> #include <linux/kvm_host.h> +#include <asm/kvm_arm.h> #include <asm/kvm_emulate.h> +#include <asm/kvm_mmu.h> #include <asm/kvm_nested.h> #include <asm/sysreg.h> #include "sys_regs.h" +void kvm_init_nested(struct kvm *kvm) +{ + kvm->arch.nested_mmus = NULL; + kvm->arch.nested_mmus_size = 0; +} + +int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_s2_mmu *tmp; + int num_mmus; + int ret = -ENOMEM; + + if (!test_bit(KVM_ARM_VCPU_HAS_EL2, vcpu->arch.features)) + return 0; + + if (!cpus_have_final_cap(ARM64_HAS_NESTED_VIRT)) + return -EINVAL; + + mutex_lock(&kvm->lock); + + /* + * Let's treat memory allocation failures as benign: If we fail to + * allocate anything, return an error and keep the allocated array + * alive. Userspace may try to recover by intializing the vcpu + * again, and there is no reason to affect the whole VM for this. + */ + num_mmus = atomic_read(&kvm->online_vcpus) * 2; + tmp = krealloc(kvm->arch.nested_mmus, + num_mmus * sizeof(*kvm->arch.nested_mmus), + GFP_KERNEL | __GFP_ZERO); + if (tmp) { + if (kvm_init_stage2_mmu(kvm, &tmp[num_mmus - 1]) || + kvm_init_stage2_mmu(kvm, &tmp[num_mmus - 2])) { + kvm_free_stage2_pgd(&tmp[num_mmus - 1]); + kvm_free_stage2_pgd(&tmp[num_mmus - 2]); + } else { + kvm->arch.nested_mmus_size = num_mmus; + ret = 0; + } + + kvm->arch.nested_mmus = tmp; + } + + mutex_unlock(&kvm->lock); + return ret; +} + +/* Must be called with kvm->lock held */ +struct kvm_s2_mmu *lookup_s2_mmu(struct kvm *kvm, u64 vttbr, u64 hcr) +{ + bool nested_stage2_enabled = hcr & HCR_VM; + int i; + + /* Don't consider the CnP bit for the vttbr match */ + vttbr = vttbr & ~VTTBR_CNP_BIT; + + /* + * Two possibilities when looking up a S2 MMU context: + * + * - either S2 is enabled in the guest, and we need a context that + * is S2-enabled and matches the full VTTBR (VMID+BADDR), which + * makes it safe from a TLB conflict perspective (a broken guest + * won't be able to generate them), + * + * - or S2 is disabled, and we need a context that is S2-disabled + * and matches the VMID only, as all TLBs are tagged by VMID even + * if S2 translation is enabled. + */ + for (i = 0; i < kvm->arch.nested_mmus_size; i++) { + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; + + if (!kvm_s2_mmu_valid(mmu)) + continue; + + if (nested_stage2_enabled && + mmu->nested_stage2_enabled && + vttbr == mmu->vttbr) + return mmu; + + if (!nested_stage2_enabled && + !mmu->nested_stage2_enabled && + get_vmid(vttbr) == get_vmid(mmu->vttbr)) + return mmu; + } + return NULL; +} + +static struct kvm_s2_mmu *get_s2_mmu_nested(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2); + u64 hcr= vcpu_read_sys_reg(vcpu, HCR_EL2); + struct kvm_s2_mmu *s2_mmu; + int i; + + s2_mmu = lookup_s2_mmu(kvm, vttbr, hcr); + if (s2_mmu) + goto out; + + /* + * Make sure we don't always search from the same point, or we + * will always reuse a potentially active context, leaving + * free contexts unused. + */ + for (i = kvm->arch.nested_mmus_next; + i < (kvm->arch.nested_mmus_size + kvm->arch.nested_mmus_next); + i++) { + s2_mmu = &kvm->arch.nested_mmus[i % kvm->arch.nested_mmus_size]; + + if (atomic_read(&s2_mmu->refcnt) == 0) + break; + } + BUG_ON(atomic_read(&s2_mmu->refcnt)); /* We have struct MMUs to spare */ + + /* Set the scene for the next search */ + kvm->arch.nested_mmus_next = (i + 1) % kvm->arch.nested_mmus_size; + + if (kvm_s2_mmu_valid(s2_mmu)) { + /* Clear the old state */ + kvm_unmap_stage2_range(s2_mmu, 0, kvm_phys_size(kvm)); + if (s2_mmu->vmid.vmid_gen) + kvm_call_hyp(__kvm_tlb_flush_vmid, s2_mmu); + } + + /* + * The virtual VMID (modulo CnP) will be used as a key when matching + * an existing kvm_s2_mmu. + */ + s2_mmu->vttbr = vttbr & ~VTTBR_CNP_BIT; + s2_mmu->nested_stage2_enabled = hcr & HCR_VM; + +out: + atomic_inc(&s2_mmu->refcnt); + return s2_mmu; +} + +void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu) +{ + mmu->vttbr = 1; + mmu->nested_stage2_enabled = false; + atomic_set(&mmu->refcnt, 0); +} + +void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu) +{ + if (is_hyp_ctxt(vcpu)) { + vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu; + } else { + spin_lock(&vcpu->kvm->mmu_lock); + vcpu->arch.hw_mmu = get_s2_mmu_nested(vcpu); + spin_unlock(&vcpu->kvm->mmu_lock); + } +} + +void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.hw_mmu != &vcpu->kvm->arch.mmu) { + atomic_dec(&vcpu->arch.hw_mmu->refcnt); + vcpu->arch.hw_mmu = NULL; + } +} + /* * Inject wfx to the virtual EL2 if this is not from the virtual EL2 and * the virtual HCR_EL2.TWX is set. Otherwise, let the host hypervisor @@ -43,6 +208,24 @@ int handle_wfx_nested(struct kvm_vcpu *vcpu, bool is_wfe) return -EINVAL; } +void kvm_arch_flush_shadow_all(struct kvm *kvm) +{ + int i; + + for (i = 0; i < kvm->arch.nested_mmus_size; i++) { + struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; + + WARN_ON(atomic_read(&mmu->refcnt)); + + if (!atomic_read(&mmu->refcnt)) + kvm_free_stage2_pgd(mmu); + } + kfree(kvm->arch.nested_mmus); + kvm->arch.nested_mmus = NULL; + kvm->arch.nested_mmus_size = 0; + kvm_free_stage2_pgd(&kvm->arch.mmu); +} + #define FEATURE(x) (GENMASK_ULL(x##_SHIFT + 3, x##_SHIFT)) /*