@@ -649,4 +649,11 @@ static __always_inline void kvm_reset_cptr_el2(struct kvm_vcpu *vcpu)
kvm_write_cptr_el2(val);
}
+
+static inline bool kvm_is_shadow_s2_fault(struct kvm_vcpu *vcpu)
+{
+ return (vcpu->arch.hw_mmu != &vcpu->kvm->arch.mmu &&
+ vcpu->arch.hw_mmu->nested_stage2_enabled);
+}
+
#endif /* __ARM64_KVM_EMULATE_H__ */
@@ -77,8 +77,27 @@ struct kvm_s2_trans {
u64 upper_attr;
};
+static inline phys_addr_t kvm_s2_trans_output(struct kvm_s2_trans *trans)
+{
+ return trans->output;
+}
+
+static inline unsigned long kvm_s2_trans_size(struct kvm_s2_trans *trans)
+{
+ return trans->block_size;
+}
+
+static inline u32 kvm_s2_trans_esr(struct kvm_s2_trans *trans)
+{
+ return trans->esr;
+}
+
extern int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
struct kvm_s2_trans *result);
+extern int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu,
+ struct kvm_s2_trans *trans);
+extern int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2);
+int handle_wfx_nested(struct kvm_vcpu *vcpu, bool is_wfe);
extern bool forward_smc_trap(struct kvm_vcpu *vcpu);
extern bool __check_nv_sr_forward(struct kvm_vcpu *vcpu);
@@ -1412,14 +1412,16 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
}
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
- struct kvm_memory_slot *memslot, unsigned long hva,
- unsigned long fault_status)
+ struct kvm_s2_trans *nested,
+ struct kvm_memory_slot *memslot,
+ unsigned long hva, unsigned long fault_status)
{
int ret = 0;
bool write_fault, writable, force_pte = false;
bool exec_fault, mte_allowed;
bool device = false;
unsigned long mmu_seq;
+ phys_addr_t ipa = fault_ipa;
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
struct vm_area_struct *vma;
@@ -1504,10 +1506,38 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
}
vma_pagesize = 1UL << vma_shift;
+
+ if (nested) {
+ unsigned long max_map_size;
+
+ max_map_size = force_pte ? PUD_SIZE : PAGE_SIZE;
+
+ ipa = kvm_s2_trans_output(nested);
+
+ /*
+ * If we're about to create a shadow stage 2 entry, then we
+ * can only create a block mapping if the guest stage 2 page
+ * table uses at least as big a mapping.
+ */
+ max_map_size = min(kvm_s2_trans_size(nested), max_map_size);
+
+ /*
+ * Be careful that if the mapping size falls between
+ * two host sizes, take the smallest of the two.
+ */
+ if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE)
+ max_map_size = PMD_SIZE;
+ else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE)
+ max_map_size = PAGE_SIZE;
+
+ force_pte = (max_map_size == PAGE_SIZE);
+ vma_pagesize = min(vma_pagesize, (long)max_map_size);
+ }
+
if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
fault_ipa &= ~(vma_pagesize - 1);
- gfn = fault_ipa >> PAGE_SHIFT;
+ gfn = ipa >> PAGE_SHIFT;
mte_allowed = kvm_vma_mte_allowed(vma);
/* Don't use the VMA after the unlock -- it may have vanished */
@@ -1657,8 +1687,10 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
*/
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
{
+ struct kvm_s2_trans nested_trans, *nested = NULL;
unsigned long fault_status;
- phys_addr_t fault_ipa;
+ phys_addr_t fault_ipa; /* The address we faulted on */
+ phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */
struct kvm_memory_slot *memslot;
unsigned long hva;
bool is_iabt, write_fault, writable;
@@ -1667,7 +1699,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
- fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
+ ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
if (fault_status == ESR_ELx_FSC_FAULT) {
@@ -1708,6 +1740,12 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
if (fault_status != ESR_ELx_FSC_FAULT &&
fault_status != ESR_ELx_FSC_PERM &&
fault_status != ESR_ELx_FSC_ACCESS) {
+ /*
+ * We must never see an address size fault on shadow stage 2
+ * page table walk, because we would have injected an addr
+ * size fault when we walked the nested s2 page and not
+ * create the shadow entry.
+ */
kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
kvm_vcpu_trap_get_class(vcpu),
(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
@@ -1717,7 +1755,37 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
idx = srcu_read_lock(&vcpu->kvm->srcu);
- gfn = fault_ipa >> PAGE_SHIFT;
+ /*
+ * We may have faulted on a shadow stage 2 page table if we are
+ * running a nested guest. In this case, we have to resolve the L2
+ * IPA to the L1 IPA first, before knowing what kind of memory should
+ * back the L1 IPA.
+ *
+ * If the shadow stage 2 page table walk faults, then we simply inject
+ * this to the guest and carry on.
+ */
+ if (kvm_is_shadow_s2_fault(vcpu)) {
+ u32 esr;
+
+ ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
+ if (ret) {
+ esr = kvm_s2_trans_esr(&nested_trans);
+ kvm_inject_s2_fault(vcpu, esr);
+ goto out_unlock;
+ }
+
+ ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans);
+ if (ret) {
+ esr = kvm_s2_trans_esr(&nested_trans);
+ kvm_inject_s2_fault(vcpu, esr);
+ goto out_unlock;
+ }
+
+ ipa = kvm_s2_trans_output(&nested_trans);
+ nested = &nested_trans;
+ }
+
+ gfn = ipa >> PAGE_SHIFT;
memslot = gfn_to_memslot(vcpu->kvm, gfn);
hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
write_fault = kvm_is_write_fault(vcpu);
@@ -1761,13 +1829,13 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
* faulting VA. This is always 12 bits, irrespective
* of the page size.
*/
- fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
- ret = io_mem_abort(vcpu, fault_ipa);
+ ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
+ ret = io_mem_abort(vcpu, ipa);
goto out_unlock;
}
/* Userspace should not be able to register out-of-bounds IPAs */
- VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->arch.hw_mmu));
+ VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu));
if (fault_status == ESR_ELx_FSC_ACCESS) {
handle_access_fault(vcpu, fault_ipa);
@@ -1775,7 +1843,8 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
goto out_unlock;
}
- ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
+ ret = user_mem_abort(vcpu, fault_ipa, nested,
+ memslot, hva, fault_status);
if (ret == 0)
ret = 1;
out:
@@ -108,6 +108,15 @@ static u32 compute_fsc(int level, u32 fsc)
return fsc | (level & 0x3);
}
+static int esr_s2_fault(struct kvm_vcpu *vcpu, int level, u32 fsc)
+{
+ u32 esr;
+
+ esr = kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC;
+ esr |= compute_fsc(level, fsc);
+ return esr;
+}
+
static int check_base_s2_limits(struct s2_walk_info *wi,
int level, int input_size, int stride)
{
@@ -472,6 +481,45 @@ void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu)
}
}
+/*
+ * Returns non-zero if permission fault is handled by injecting it to the next
+ * level hypervisor.
+ */
+int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, struct kvm_s2_trans *trans)
+{
+ unsigned long fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
+ bool forward_fault = false;
+
+ trans->esr = 0;
+
+ if (fault_status != ESR_ELx_FSC_PERM)
+ return 0;
+
+ if (kvm_vcpu_trap_is_iabt(vcpu)) {
+ forward_fault = (trans->upper_attr & BIT(54));
+ } else {
+ bool write_fault = kvm_is_write_fault(vcpu);
+
+ forward_fault = ((write_fault && !trans->writable) ||
+ (!write_fault && !trans->readable));
+ }
+
+ if (forward_fault) {
+ trans->esr = esr_s2_fault(vcpu, trans->level, ESR_ELx_FSC_PERM);
+ return 1;
+ }
+
+ return 0;
+}
+
+int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2)
+{
+ vcpu_write_sys_reg(vcpu, vcpu->arch.fault.far_el2, FAR_EL2);
+ vcpu_write_sys_reg(vcpu, vcpu->arch.fault.hpfar_el2, HPFAR_EL2);
+
+ return kvm_inject_nested_sync(vcpu, esr_el2);
+}
+
void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
int i;