@@ -1722,6 +1722,58 @@ static bool using_special_root_page(struct kvm_mmu *mmu)
return mmu->root_level <= PT32E_ROOT_LEVEL;
}
+/*
+ * Special pages are pages that hold the PAE PDPTEs for a 32bit guest, or
+ * the higher level pages linked to a special page when shadowing NPT.
+ *
+ * Special pages are specially allocated.  If sp->spt is to be used as a
+ * PAE PDPT loaded via a 32bit CR3, it reuses the preallocated
+ * mmu->pae_root.
+ *
+ * Special pages are only visible to the local VCPU, except through the
+ * rmap of their children, so they are neither on
+ * kvm->arch.active_mmu_pages nor in the hash list.
+ *
+ * They are neither accounted nor write-protected since they have no gfn
+ * associated with them.
+ *
+ * Because of the above, special pages can not be freed or zapped like
+ * normal shadow pages.  They are freed directly when the special root is
+ * freed, see mmu_free_special_root_page().
+ *
+ * A special root page can not be put on mmu->prev_roots because the
+ * comparison would have to use the PDPTEs instead of CR3, and
+ * mmu->pae_root can not be shared across multiple root pages.
+ *
+ * Apart from the above limitations, special pages support all the other
+ * abilities of shadow pages: link, rmap, sync, unsync, etc.
+ *
+ * Special pages can be obsoleted but may be reused later.  When the
+ * obsoleting process is done, all the obsoleted shadow pages are unlinked
+ * from the special pages with the help of their children's rmap, and the
+ * special pages become theoretically valid again.  If no other event
+ * causes the VCPU to free the root, and the VCPU was merely preempted by
+ * the host while the obsoleting process ran, the VCPU can reuse its
+ * special pages when it is scheduled back in.
+ */
+static struct kvm_mmu_page *kvm_mmu_alloc_special_page(struct kvm_vcpu *vcpu,
+ union kvm_mmu_page_role role)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
+ sp->gfn = 0;
+ sp->role = role;
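+ /*
+ * A PAE PDPT is loaded via a 32bit CR3 and thus must sit below 4GB, so
+ * reuse the preallocated mmu->pae_root instead of a cache page.
+ */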
+ if (role.level == PT32E_ROOT_LEVEL &&
+ vcpu->arch.mmu->shadow_root_level == PT32E_ROOT_LEVEL)
+ sp->spt = vcpu->arch.mmu->pae_root;
+ else
+ sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
+ /* sp->gfns is not used for special sp */
+ set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
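+ /* Link the table page back to its kvm_mmu_page for to_shadow_page(). */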
+ sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
+
+ return sp;
+}
+
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
{
struct kvm_mmu_page *sp;
@@ -2081,6 +2133,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
if (level <= vcpu->arch.mmu->root_level)
role.passthrough = 0;
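+ /*
+ * Special pages are not tracked on active_mmu_pages or in the hash
+ * list, so skip the lookup and allocate (or reuse) one directly.
+ */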
+ if (unlikely(level >= PT32E_ROOT_LEVEL && using_special_root_page(vcpu->arch.mmu)))
+ return kvm_mmu_alloc_special_page(vcpu, role);
+
sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
for_each_valid_sp(vcpu->kvm, sp, sp_list) {
if (sp->gfn != gfn) {
@@ -3250,6 +3305,37 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
*root_hpa = INVALID_PAGE;
}
+static void mmu_free_special_root_page(struct kvm *kvm, struct kvm_mmu *mmu)
+{
+ u64 spte = mmu->root.hpa;
+ struct kvm_mmu_page *sp = to_shadow_page(spte & PT64_BASE_ADDR_MASK);
+ int i;
+
+ /* Free the level 5 or level 4 roots used to shadow NPT for a 32bit L1 */
+ while (sp->role.level > PT32E_ROOT_LEVEL) {
+ spte = sp->spt[0];
+ mmu_page_zap_pte(kvm, sp, sp->spt + 0, NULL);
+ free_page((unsigned long)sp->spt);
+ kmem_cache_free(mmu_page_header_cache, sp);
+ if (!is_shadow_present_pte(spte))
+ return;
+ sp = to_shadow_page(spte & PT64_BASE_ADDR_MASK);
+ }
+
+ if (WARN_ON_ONCE(sp->role.level != PT32E_ROOT_LEVEL))
+ return;
+
+ /* Zap the four PAE PDPTEs */
+ for (i = 0; i < 4; i++)
+ mmu_page_zap_pte(kvm, sp, sp->spt + i, NULL);
+
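+ /* mmu->pae_root is preallocated; it is freed in free_mmu_pages(). */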
+ if (sp->spt != mmu->pae_root)
+ free_page((unsigned long)sp->spt);
+
+ kmem_cache_free(mmu_page_header_cache, sp);
+}
+
/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
ulong roots_to_free)
@@ -3283,7 +3369,10 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
if (free_active_root) {
if (to_shadow_page(mmu->root.hpa)) {
- mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
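+ /*
+ * Special roots are not on active_mmu_pages, so they are
+ * freed directly instead of being zapped via the invalid
+ * list.
+ */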
+ if (using_special_root_page(mmu))
+ mmu_free_special_root_page(kvm, mmu);
+ else
+ mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
} else if (mmu->pae_root) {
for (i = 0; i < 4; ++i) {
if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))