@@ -2019,33 +2019,15 @@ static void clear_sp_write_flooding_count(u64 *spte)
__clear_sp_write_flooding_count(sptep_to_sp(spte));
}
-static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
- gfn_t gfn,
- gva_t gaddr,
- unsigned level,
- bool direct,
- unsigned int access)
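+/*
+ * Look up a shadow page matching @gfn and @role in the page hash, or allocate
+ * a new one if no match is found.  The caller must supply the complete role.
+ */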
+static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn,
+ union kvm_mmu_page_role role)
{
- union kvm_mmu_page_role role;
struct hlist_head *sp_list;
- unsigned quadrant;
struct kvm_mmu_page *sp;
int ret;
int collisions = 0;
LIST_HEAD(invalid_list);
- role = vcpu->arch.mmu->root_role;
- role.level = level;
- role.direct = direct;
- role.access = access;
- if (role.has_4_byte_gpte) {
- quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
- quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
- role.quadrant = quadrant;
- }
- if (level <= vcpu->arch.mmu->cpu_role.base.level)
- role.passthrough = 0;
-
sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
for_each_valid_sp(vcpu->kvm, sp, sp_list) {
if (sp->gfn != gfn) {
@@ -2063,7 +2045,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
* Unsync pages must not be left as is, because the new
* upper-level page will be write-protected.
*/
- if (level > PG_LEVEL_4K && sp->unsync)
+ if (role.level > PG_LEVEL_4K && sp->unsync)
kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
&invalid_list);
continue;
@@ -2104,14 +2086,14 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
++vcpu->kvm->stat.mmu_cache_miss;
- sp = kvm_mmu_alloc_page(vcpu, direct);
+ sp = kvm_mmu_alloc_page(vcpu, role.direct);
sp->gfn = gfn;
sp->role = role;
hlist_add_head(&sp->hash_link, sp_list);
if (sp_has_gptes(sp)) {
account_shadowed(vcpu->kvm, sp);
- if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn))
+ if (role.level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn))
kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
}
trace_kvm_mmu_get_page(sp, true);
@@ -2123,6 +2105,55 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
return sp;
}
+static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct, u32 access)
+{
+ struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep);
+ union kvm_mmu_page_role role;
+
+ role = parent_sp->role;
+ role.level--;
+ role.access = access;
+ role.direct = direct;
+ role.passthrough = 0;
+
+ /*
+ * If the guest has 4-byte PTEs then that means it's using 32-bit,
+ * 2-level, non-PAE paging. KVM shadows such guests with PAE paging
+ * (i.e. 8-byte PTEs). The difference in PTE size means that KVM must
+ * shadow each guest page table with multiple shadow page tables, which
+ * requires extra bookkeeping in the role.
+ *
+ * Specifically, to shadow the guest's page directory (which covers a
+ * 4GiB address space), KVM uses 4 PAE page directories, each mapping
+ * 1GiB of the address space. @role.quadrant encodes which quarter of
+ * the address space each maps.
+ *
+ * To shadow the guest's page tables (which each map a 4MiB region), KVM
+ * uses 2 PAE page tables, each mapping a 2MiB region. For these,
+ * @role.quadrant encodes which half of the region they map.
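+ *
+ * For example, within the PAE page directory that maps GVAs [0, 1GiB),
+ * the page table installed at index 3 shadows the upper half of the
+ * guest page table covering GVAs [4MiB, 8MiB), so it is assigned
+ * quadrant 1 (3 % 2); the page table at index 2 shadows the lower half
+ * and is assigned quadrant 0.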
+ *
+ * Note, the 4 PAE page directories are pre-allocated and the quadrant
+ * assigned in mmu_alloc_root(). So only page tables need to be handled
+ * here.
+ */
+ if (role.has_4_byte_gpte) {
+ WARN_ON_ONCE(role.level != PG_LEVEL_4K);
+ role.quadrant = (sptep - parent_sp->spt) % 2;
+ }
+
+ return role;
+}
+
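+/*
+ * Find or create the shadow page that @sptep will point at, deriving its role
+ * from the parent shadow page that contains @sptep.
+ */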
+static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu,
+ u64 *sptep, gfn_t gfn,
+ bool direct, u32 access)
+{
+ union kvm_mmu_page_role role;
+
+ role = kvm_mmu_child_role(sptep, direct, access);
+ return kvm_mmu_get_page(vcpu, gfn, role);
+}
+
static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
struct kvm_vcpu *vcpu, hpa_t root,
u64 addr)
@@ -2965,8 +2996,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
if (is_shadow_present_pte(*it.sptep))
continue;
- sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
- it.level - 1, true, ACC_ALL);
+ sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL);
link_shadow_page(vcpu, it.sptep, sp);
if (fault->is_tdp && fault->huge_page_disallowed &&
@@ -3369,12 +3399,24 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
return ret;
}
-static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
+static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant,
u8 level, bool direct)
{
+ union kvm_mmu_page_role role;
struct kvm_mmu_page *sp;
- sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
+ role = vcpu->arch.mmu->root_role;
+ role.level = level;
+ role.direct = direct;
+ role.access = ACC_ALL;
+
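+ /*
+ * The quadrant is only relevant when shadowing a 32-bit, non-PAE guest,
+ * in which case it selects the 1GiB quarter of the guest address space
+ * that this PAE page directory maps.
+ */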
+ if (role.has_4_byte_gpte)
+ role.quadrant = quadrant;
+
+ if (level <= vcpu->arch.mmu->cpu_role.base.level)
+ role.passthrough = 0;
+
+ sp = kvm_mmu_get_page(vcpu, gfn, role);
++sp->root_count;
return __pa(sp->spt);
@@ -3408,8 +3450,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
for (i = 0; i < 4; ++i) {
WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
- root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
- i << 30, PT32_ROOT_LEVEL, true);
+ root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), i,
+ PT32_ROOT_LEVEL, true);
mmu->pae_root[i] = root | PT_PRESENT_MASK |
shadow_me_mask;
}
@@ -3578,8 +3620,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
root_gfn = pdptrs[i] >> PAGE_SHIFT;
}
- root = mmu_alloc_root(vcpu, root_gfn, i << 30,
- PT32_ROOT_LEVEL, false);
+ root = mmu_alloc_root(vcpu, root_gfn, i, PT32_ROOT_LEVEL, false);
mmu->pae_root[i] = root | pm_mask;
}
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -648,8 +648,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
if (!is_shadow_present_pte(*it.sptep)) {
table_gfn = gw->table_gfn[it.level - 2];
access = gw->pt_access[it.level - 2];
- sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr,
- it.level-1, false, access);
+ sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn,
+ false, access);
+
/*
* We must synchronize the pagetable before linking it
* because the guest doesn't need to flush tlb when
@@ -705,8 +706,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
drop_large_spte(vcpu, it.sptep);
if (!is_shadow_present_pte(*it.sptep)) {
- sp = kvm_mmu_get_page(vcpu, base_gfn, fault->addr,
- it.level - 1, true, direct_access);
+ sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn,
+ true, direct_access);
link_shadow_page(vcpu, it.sptep, sp);
if (fault->huge_page_disallowed &&
fault->req_level >= it.level)