| Message ID | 20221208193857.4090582-7-dmatlack@google.com (mailing list archive) |
|---|---|
| State | Handled Elsewhere |
| Series | KVM: Refactor the KVM/x86 TDP MMU into common code |
On Thu, Dec 8, 2022 at 11:39 AM David Matlack <dmatlack@google.com> wrote: > > Move struct kvm_mmu_page to common code and all x86-specific fields into > kvm_mmu_page_arch. > > This commit increases the size of struct kvm_mmu_page by 64 bytes on > x86_64 (184 bytes -> 248 bytes). The size of this struct can be reduced > in future commits by moving TDP MMU root fields into a separate struct > and by dynamically allocating fields only used by the Shadow MMU. > > No functional change intended. > > Signed-off-by: David Matlack <dmatlack@google.com> I haven't reviewed every line of the mechanical refactor of adding "arch." all over the place, but at a high level, this looks like a good split on struct kvm_mmu_page. > --- > arch/x86/include/asm/kvm/mmu_types.h | 62 ++++++++++++++ > arch/x86/include/asm/kvm_host.h | 4 - > arch/x86/kvm/mmu/mmu.c | 122 ++++++++++++++------------- > arch/x86/kvm/mmu/mmu_internal.h | 83 ------------------ > arch/x86/kvm/mmu/mmutrace.h | 4 +- > arch/x86/kvm/mmu/paging_tmpl.h | 10 +-- > arch/x86/kvm/mmu/tdp_mmu.c | 8 +- > arch/x86/kvm/mmu/tdp_mmu.h | 2 +- > include/kvm/mmu_types.h | 32 ++++++- > 9 files changed, 167 insertions(+), 160 deletions(-) > > diff --git a/arch/x86/include/asm/kvm/mmu_types.h b/arch/x86/include/asm/kvm/mmu_types.h > index 35f893ebab5a..affcb520b482 100644 > --- a/arch/x86/include/asm/kvm/mmu_types.h > +++ b/arch/x86/include/asm/kvm/mmu_types.h > @@ -2,6 +2,8 @@ > #ifndef __ASM_KVM_MMU_TYPES_H > #define __ASM_KVM_MMU_TYPES_H > > +#include <linux/bitmap.h> > +#include <linux/list.h> > #include <linux/types.h> > > /* > @@ -53,4 +55,64 @@ struct kvm_mmu_page_role_arch { > u16 passthrough:1; > }; > > +struct kvm_rmap_head { > + unsigned long val; > +}; > + > +struct kvm_mmu_page_arch { > + struct hlist_node hash_link; > + > + bool shadow_mmu_page; > + bool unsync; > + u8 mmu_valid_gen; > + > + /* > + * The shadow page can't be replaced by an equivalent huge page > + * because it is being used to map an executable page in the guest > + * and the NX huge page mitigation is enabled. > + */ > + bool nx_huge_page_disallowed; > + > + /* > + * Stores the result of the guest translation being shadowed by each > + * SPTE. KVM shadows two types of guest translations: nGPA -> GPA > + * (shadow EPT/NPT) and GVA -> GPA (traditional shadow paging). In both > + * cases the result of the translation is a GPA and a set of access > + * constraints. > + * > + * The GFN is stored in the upper bits (PAGE_SHIFT) and the shadowed > + * access permissions are stored in the lower bits. Note, for > + * convenience and uniformity across guests, the access permissions are > + * stored in KVM format (e.g. ACC_EXEC_MASK) not the raw guest format. > + */ > + u64 *shadowed_translation; > + > + unsigned int unsync_children; > + > + /* Rmap pointers to all parent sptes. */ > + struct kvm_rmap_head parent_ptes; > + > + DECLARE_BITMAP(unsync_child_bitmap, 512); > + > + /* > + * Tracks shadow pages that, if zapped, would allow KVM to create an NX > + * huge page. A shadow page will have nx_huge_page_disallowed set but > + * not be on the list if a huge page is disallowed for other reasons, > + * e.g. because KVM is shadowing a PTE at the same gfn, the memslot > + * isn't properly aligned, etc... > + */ > + struct list_head possible_nx_huge_page_link; > + > +#ifdef CONFIG_X86_32 > + /* > + * Used out of the mmu-lock to avoid reading spte values while an > + * update is in progress; see the comments in __get_spte_lockless(). 
> + */ > + int clear_spte_count; > +#endif > + > + /* Number of writes since the last time traversal visited this page. */ > + atomic_t write_flooding_count; > +}; > + > #endif /* !__ASM_KVM_MMU_TYPES_H */ > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index ebcd7a0dabef..f5743a652e10 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -329,10 +329,6 @@ union kvm_cpu_role { > }; > }; > > -struct kvm_rmap_head { > - unsigned long val; > -}; > - > struct kvm_pio_request { > unsigned long linear_rip; > unsigned long count; > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > index 11cef930d5ed..e47f35878ab5 100644 > --- a/arch/x86/kvm/mmu/mmu.c > +++ b/arch/x86/kvm/mmu/mmu.c > @@ -350,7 +350,7 @@ static void count_spte_clear(u64 *sptep, u64 spte) > > /* Ensure the spte is completely set before we increase the count */ > smp_wmb(); > - sp->clear_spte_count++; > + sp->arch.clear_spte_count++; > } > > static void __set_spte(u64 *sptep, u64 spte) > @@ -432,7 +432,7 @@ static u64 __get_spte_lockless(u64 *sptep) > int count; > > retry: > - count = sp->clear_spte_count; > + count = sp->arch.clear_spte_count; > smp_rmb(); > > spte.spte_low = orig->spte_low; > @@ -442,7 +442,7 @@ static u64 __get_spte_lockless(u64 *sptep) > smp_rmb(); > > if (unlikely(spte.spte_low != orig->spte_low || > - count != sp->clear_spte_count)) > + count != sp->arch.clear_spte_count)) > goto retry; > > return spte.spte; > @@ -699,7 +699,7 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) > return sp->gfn; > > if (!sp->role.arch.direct) > - return sp->shadowed_translation[index] >> PAGE_SHIFT; > + return sp->arch.shadowed_translation[index] >> PAGE_SHIFT; > > return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS)); > } > @@ -713,7 +713,7 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) > static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index) > { > if (sp_has_gptes(sp)) > - return sp->shadowed_translation[index] & ACC_ALL; > + return sp->arch.shadowed_translation[index] & ACC_ALL; > > /* > * For direct MMUs (e.g. TDP or non-paging guests) or passthrough SPs, > @@ -734,7 +734,7 @@ static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index, > gfn_t gfn, unsigned int access) > { > if (sp_has_gptes(sp)) { > - sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access; > + sp->arch.shadowed_translation[index] = (gfn << PAGE_SHIFT) | access; > return; > } > > @@ -825,18 +825,18 @@ void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) > * on the list if KVM is reusing an existing shadow page, i.e. if KVM > * links a shadow page at multiple points. 
> */ > - if (!list_empty(&sp->possible_nx_huge_page_link)) > + if (!list_empty(&sp->arch.possible_nx_huge_page_link)) > return; > > ++kvm->stat.nx_lpage_splits; > - list_add_tail(&sp->possible_nx_huge_page_link, > + list_add_tail(&sp->arch.possible_nx_huge_page_link, > &kvm->arch.possible_nx_huge_pages); > } > > static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, > bool nx_huge_page_possible) > { > - sp->nx_huge_page_disallowed = true; > + sp->arch.nx_huge_page_disallowed = true; > > if (nx_huge_page_possible) > track_possible_nx_huge_page(kvm, sp); > @@ -861,16 +861,16 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) > > void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) > { > - if (list_empty(&sp->possible_nx_huge_page_link)) > + if (list_empty(&sp->arch.possible_nx_huge_page_link)) > return; > > --kvm->stat.nx_lpage_splits; > - list_del_init(&sp->possible_nx_huge_page_link); > + list_del_init(&sp->arch.possible_nx_huge_page_link); > } > > static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) > { > - sp->nx_huge_page_disallowed = false; > + sp->arch.nx_huge_page_disallowed = false; > > untrack_possible_nx_huge_page(kvm, sp); > } > @@ -1720,11 +1720,11 @@ static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) > static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp) > { > MMU_WARN_ON(!is_empty_shadow_page(sp->spt)); > - hlist_del(&sp->hash_link); > + hlist_del(&sp->arch.hash_link); > list_del(&sp->link); > free_page((unsigned long)sp->spt); > if (!sp->role.arch.direct) > - free_page((unsigned long)sp->shadowed_translation); > + free_page((unsigned long)sp->arch.shadowed_translation); > kmem_cache_free(mmu_page_header_cache, sp); > } > > @@ -1739,13 +1739,13 @@ static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache, > if (!parent_pte) > return; > > - pte_list_add(cache, parent_pte, &sp->parent_ptes); > + pte_list_add(cache, parent_pte, &sp->arch.parent_ptes); > } > > static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, > u64 *parent_pte) > { > - pte_list_remove(parent_pte, &sp->parent_ptes); > + pte_list_remove(parent_pte, &sp->arch.parent_ptes); > } > > static void drop_parent_pte(struct kvm_mmu_page *sp, > @@ -1761,7 +1761,7 @@ static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) > u64 *sptep; > struct rmap_iterator iter; > > - for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) { > + for_each_rmap_spte(&sp->arch.parent_ptes, &iter, sptep) { > mark_unsync(sptep); > } > } > @@ -1771,9 +1771,9 @@ static void mark_unsync(u64 *spte) > struct kvm_mmu_page *sp; > > sp = sptep_to_sp(spte); > - if (__test_and_set_bit(spte_index(spte), sp->unsync_child_bitmap)) > + if (__test_and_set_bit(spte_index(spte), sp->arch.unsync_child_bitmap)) > return; > - if (sp->unsync_children++) > + if (sp->arch.unsync_children++) > return; > kvm_mmu_mark_parents_unsync(sp); > } > @@ -1799,7 +1799,7 @@ static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, > { > int i; > > - if (sp->unsync) > + if (sp->arch.unsync) > for (i=0; i < pvec->nr; i++) > if (pvec->page[i].sp == sp) > return 0; > @@ -1812,9 +1812,9 @@ static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, > > static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx) > { > - --sp->unsync_children; > - WARN_ON((int)sp->unsync_children < 0); > - __clear_bit(idx, sp->unsync_child_bitmap); > + --sp->arch.unsync_children; > + 
WARN_ON((int)sp->arch.unsync_children < 0); > + __clear_bit(idx, sp->arch.unsync_child_bitmap); > } > > static int __mmu_unsync_walk(struct kvm_mmu_page *sp, > @@ -1822,7 +1822,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, > { > int i, ret, nr_unsync_leaf = 0; > > - for_each_set_bit(i, sp->unsync_child_bitmap, 512) { > + for_each_set_bit(i, sp->arch.unsync_child_bitmap, 512) { > struct kvm_mmu_page *child; > u64 ent = sp->spt[i]; > > @@ -1833,7 +1833,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, > > child = spte_to_child_sp(ent); > > - if (child->unsync_children) { > + if (child->arch.unsync_children) { > if (mmu_pages_add(pvec, child, i)) > return -ENOSPC; > > @@ -1845,7 +1845,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, > nr_unsync_leaf += ret; > } else > return ret; > - } else if (child->unsync) { > + } else if (child->arch.unsync) { > nr_unsync_leaf++; > if (mmu_pages_add(pvec, child, i)) > return -ENOSPC; > @@ -1862,7 +1862,7 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp, > struct kvm_mmu_pages *pvec) > { > pvec->nr = 0; > - if (!sp->unsync_children) > + if (!sp->arch.unsync_children) > return 0; > > mmu_pages_add(pvec, sp, INVALID_INDEX); > @@ -1871,9 +1871,9 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp, > > static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) > { > - WARN_ON(!sp->unsync); > + WARN_ON(!sp->arch.unsync); > trace_kvm_mmu_sync_page(sp); > - sp->unsync = 0; > + sp->arch.unsync = 0; > --kvm->stat.mmu_unsync; > } > > @@ -1894,7 +1894,7 @@ static bool sp_has_gptes(struct kvm_mmu_page *sp) > } > > #define for_each_valid_sp(_kvm, _sp, _list) \ > - hlist_for_each_entry(_sp, _list, hash_link) \ > + hlist_for_each_entry(_sp, _list, arch.hash_link) \ > if (is_obsolete_sp((_kvm), (_sp))) { \ > } else > > @@ -1934,7 +1934,7 @@ static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) > > /* TDP MMU pages do not use the MMU generation. */ > return !is_tdp_mmu_page(sp) && > - unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); > + unlikely(sp->arch.mmu_valid_gen != kvm->arch.mmu_valid_gen); > } > > struct mmu_page_path { > @@ -2006,7 +2006,7 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents) > WARN_ON(idx == INVALID_INDEX); > clear_unsync_child_bit(sp, idx); > level++; > - } while (!sp->unsync_children); > + } while (!sp->arch.unsync_children); > } > > static int mmu_sync_children(struct kvm_vcpu *vcpu, > @@ -2053,7 +2053,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu, > > static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) > { > - atomic_set(&sp->write_flooding_count, 0); > + atomic_set(&sp->arch.write_flooding_count, 0); > } > > static void clear_sp_write_flooding_count(u64 *spte) > @@ -2094,7 +2094,7 @@ static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm, > * Unsync pages must not be left as is, because the new > * upper-level page will be write-protected. 
> */ > - if (role.level > PG_LEVEL_4K && sp->unsync) > + if (role.level > PG_LEVEL_4K && sp->arch.unsync) > kvm_mmu_prepare_zap_page(kvm, sp, > &invalid_list); > continue; > @@ -2104,7 +2104,7 @@ static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm, > if (sp->role.arch.direct) > goto out; > > - if (sp->unsync) { > + if (sp->arch.unsync) { > if (KVM_BUG_ON(!vcpu, kvm)) > break; > > @@ -2163,25 +2163,26 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm, > sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache); > sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache); > if (!role.arch.direct) > - sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache); > + sp->arch.shadowed_translation = > + kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache); > > set_page_private(virt_to_page(sp->spt), (unsigned long)sp); > > - INIT_LIST_HEAD(&sp->possible_nx_huge_page_link); > + INIT_LIST_HEAD(&sp->arch.possible_nx_huge_page_link); > > /* > * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() > * depends on valid pages being added to the head of the list. See > * comments in kvm_zap_obsolete_pages(). > */ > - sp->mmu_valid_gen = kvm->arch.mmu_valid_gen; > + sp->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen; > list_add(&sp->link, &kvm->arch.active_mmu_pages); > kvm_account_mmu_page(kvm, sp); > > sp->gfn = gfn; > sp->role = role; > - sp->shadow_mmu_page = true; > - hlist_add_head(&sp->hash_link, sp_list); > + sp->arch.shadow_mmu_page = true; > + hlist_add_head(&sp->arch.hash_link, sp_list); > if (sp_has_gptes(sp)) > account_shadowed(kvm, sp); > > @@ -2368,7 +2369,7 @@ static void __link_shadow_page(struct kvm *kvm, > > mmu_page_add_parent_pte(cache, sp, sptep); > > - if (sp->unsync_children || sp->unsync) > + if (sp->arch.unsync_children || sp->arch.unsync) > mark_unsync(sptep); > } > > @@ -2421,7 +2422,8 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, > * avoids retaining a large number of stale nested SPs. 
> */ > if (tdp_enabled && invalid_list && > - child->role.arch.guest_mode && !child->parent_ptes.val) > + child->role.arch.guest_mode && > + !child->arch.parent_ptes.val) > return kvm_mmu_prepare_zap_page(kvm, child, > invalid_list); > } > @@ -2449,7 +2451,7 @@ static void kvm_mmu_unlink_parents(struct kvm_mmu_page *sp) > u64 *sptep; > struct rmap_iterator iter; > > - while ((sptep = rmap_get_first(&sp->parent_ptes, &iter))) > + while ((sptep = rmap_get_first(&sp->arch.parent_ptes, &iter))) > drop_parent_pte(sp, sptep); > } > > @@ -2496,7 +2498,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, > if (!sp->role.invalid && sp_has_gptes(sp)) > unaccount_shadowed(kvm, sp); > > - if (sp->unsync) > + if (sp->arch.unsync) > kvm_unlink_unsync_page(kvm, sp); > if (!refcount_read(&sp->root_refcount)) { > /* Count self */ > @@ -2527,7 +2529,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, > zapped_root = !is_obsolete_sp(kvm, sp); > } > > - if (sp->nx_huge_page_disallowed) > + if (sp->arch.nx_huge_page_disallowed) > unaccount_nx_huge_page(kvm, sp); > > sp->role.invalid = 1; > @@ -2704,7 +2706,7 @@ static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) > { > trace_kvm_mmu_unsync_page(sp); > ++kvm->stat.mmu_unsync; > - sp->unsync = 1; > + sp->arch.unsync = 1; > > kvm_mmu_mark_parents_unsync(sp); > } > @@ -2739,7 +2741,7 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, > if (!can_unsync) > return -EPERM; > > - if (sp->unsync) > + if (sp->arch.unsync) > continue; > > if (prefetch) > @@ -2764,7 +2766,7 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, > * for write, i.e. unsync cannot transition from 0->1 > * while this CPU holds mmu_lock for read (or write). > */ > - if (READ_ONCE(sp->unsync)) > + if (READ_ONCE(sp->arch.unsync)) > continue; > } > > @@ -2796,8 +2798,8 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, > * 2.2 Guest issues TLB flush. > * That causes a VM Exit. > * > - * 2.3 Walking of unsync pages sees sp->unsync is > - * false and skips the page. > + * 2.3 Walking of unsync pages sees sp->arch.unsync > + * is false and skips the page. > * > * 2.4 Guest accesses GVA X. > * Since the mapping in the SP was not updated, > @@ -2805,7 +2807,7 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, > * gets used. > * 1.1 Host marks SP > * as unsync > - * (sp->unsync = true) > + * (sp->arch.unsync = true) > * > * The write barrier below ensures that 1.1 happens before 1.2 and thus > * the situation in 2.4 does not arise. It pairs with the read barrier > @@ -3126,7 +3128,7 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_ > cur_level == fault->goal_level && > is_shadow_present_pte(spte) && > !is_large_pte(spte) && > - spte_to_child_sp(spte)->nx_huge_page_disallowed) { > + spte_to_child_sp(spte)->arch.nx_huge_page_disallowed) { > /* > * A small SPTE exists for this pfn, but FNAME(fetch), > * direct_map(), or kvm_tdp_mmu_map() would like to create a > @@ -3902,7 +3904,7 @@ static bool is_unsync_root(hpa_t root) > > /* > * The read barrier orders the CPU's read of SPTE.W during the page table > - * walk before the reads of sp->unsync/sp->unsync_children here. > + * walk before the reads of sp->arch.{unsync,unsync_children} here. 
> * > * Even if another CPU was marking the SP as unsync-ed simultaneously, > * any guest page table changes are not guaranteed to be visible anyway > @@ -3922,7 +3924,7 @@ static bool is_unsync_root(hpa_t root) > if (WARN_ON_ONCE(!sp)) > return false; > > - if (sp->unsync || sp->unsync_children) > + if (sp->arch.unsync || sp->arch.unsync_children) > return true; > > return false; > @@ -5510,8 +5512,8 @@ static bool detect_write_flooding(struct kvm_mmu_page *sp) > if (sp->role.level == PG_LEVEL_4K) > return false; > > - atomic_inc(&sp->write_flooding_count); > - return atomic_read(&sp->write_flooding_count) >= 3; > + atomic_inc(&sp->arch.write_flooding_count); > + return atomic_read(&sp->arch.write_flooding_count) >= 3; > } > > /* > @@ -6389,7 +6391,7 @@ static bool shadow_mmu_try_split_huge_pages(struct kvm *kvm, > continue; > > /* SPs with level >PG_LEVEL_4K should never by unsync. */ > - if (WARN_ON_ONCE(sp->unsync)) > + if (WARN_ON_ONCE(sp->arch.unsync)) > continue; > > /* Don't bother splitting huge pages on invalid SPs. */ > @@ -6941,8 +6943,8 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm) > */ > sp = list_first_entry(&kvm->arch.possible_nx_huge_pages, > struct kvm_mmu_page, > - possible_nx_huge_page_link); > - WARN_ON_ONCE(!sp->nx_huge_page_disallowed); > + arch.possible_nx_huge_page_link); > + WARN_ON_ONCE(!sp->arch.nx_huge_page_disallowed); > WARN_ON_ONCE(!sp->role.arch.direct); > > /* > @@ -6977,7 +6979,7 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm) > flush |= kvm_tdp_mmu_zap_sp(kvm, sp); > else > kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); > - WARN_ON_ONCE(sp->nx_huge_page_disallowed); > + WARN_ON_ONCE(sp->arch.nx_huge_page_disallowed); > > if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { > kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); > diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h > index fd4990c8b0e9..af2ae4887e35 100644 > --- a/arch/x86/kvm/mmu/mmu_internal.h > +++ b/arch/x86/kvm/mmu/mmu_internal.h > @@ -44,89 +44,6 @@ extern bool dbg; > #define INVALID_PAE_ROOT 0 > #define IS_VALID_PAE_ROOT(x) (!!(x)) > > -struct kvm_mmu_page { > - /* > - * Note, "link" through "spt" fit in a single 64 byte cache line on > - * 64-bit kernels, keep it that way unless there's a reason not to. > - */ > - struct list_head link; > - struct hlist_node hash_link; > - > - bool shadow_mmu_page; > - bool unsync; > - u8 mmu_valid_gen; > - > - /* > - * The shadow page can't be replaced by an equivalent huge page > - * because it is being used to map an executable page in the guest > - * and the NX huge page mitigation is enabled. > - */ > - bool nx_huge_page_disallowed; > - > - /* > - * The following two entries are used to key the shadow page in the > - * hash table. > - */ > - union kvm_mmu_page_role role; > - gfn_t gfn; > - > - u64 *spt; > - > - /* > - * Stores the result of the guest translation being shadowed by each > - * SPTE. KVM shadows two types of guest translations: nGPA -> GPA > - * (shadow EPT/NPT) and GVA -> GPA (traditional shadow paging). In both > - * cases the result of the translation is a GPA and a set of access > - * constraints. > - * > - * The GFN is stored in the upper bits (PAGE_SHIFT) and the shadowed > - * access permissions are stored in the lower bits. Note, for > - * convenience and uniformity across guests, the access permissions are > - * stored in KVM format (e.g. ACC_EXEC_MASK) not the raw guest format. 
> - */ > - u64 *shadowed_translation; > - > - /* Currently serving as active root */ > - refcount_t root_refcount; > - > - unsigned int unsync_children; > - union { > - struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ > - tdp_ptep_t ptep; > - }; > - union { > - DECLARE_BITMAP(unsync_child_bitmap, 512); > - struct { > - struct work_struct tdp_mmu_async_work; > - void *tdp_mmu_async_data; > - }; > - }; > - > - /* > - * Tracks shadow pages that, if zapped, would allow KVM to create an NX > - * huge page. A shadow page will have nx_huge_page_disallowed set but > - * not be on the list if a huge page is disallowed for other reasons, > - * e.g. because KVM is shadowing a PTE at the same gfn, the memslot > - * isn't properly aligned, etc... > - */ > - struct list_head possible_nx_huge_page_link; > -#ifdef CONFIG_X86_32 > - /* > - * Used out of the mmu-lock to avoid reading spte values while an > - * update is in progress; see the comments in __get_spte_lockless(). > - */ > - int clear_spte_count; > -#endif > - > - /* Number of writes since the last time traversal visited this page. */ > - atomic_t write_flooding_count; > - > -#ifdef CONFIG_X86_64 > - /* Used for freeing the page asynchronously if it is a TDP MMU page. */ > - struct rcu_head rcu_head; > -#endif > -}; > - > extern struct kmem_cache *mmu_page_header_cache; > > static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp) > diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h > index ffd10ce3eae3..335f26dabdf3 100644 > --- a/arch/x86/kvm/mmu/mmutrace.h > +++ b/arch/x86/kvm/mmu/mmutrace.h > @@ -16,11 +16,11 @@ > __field(bool, unsync) > > #define KVM_MMU_PAGE_ASSIGN(sp) \ > - __entry->mmu_valid_gen = sp->mmu_valid_gen; \ > + __entry->mmu_valid_gen = sp->arch.mmu_valid_gen; \ > __entry->gfn = sp->gfn; \ > __entry->role = sp->role.word; \ > __entry->root_count = refcount_read(&sp->root_refcount); \ > - __entry->unsync = sp->unsync; > + __entry->unsync = sp->arch.unsync; > > #define KVM_MMU_PAGE_PRINTK() ({ \ > const char *saved_ptr = trace_seq_buffer_ptr(p); \ > diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h > index e15ec1c473da..18bb92b70a01 100644 > --- a/arch/x86/kvm/mmu/paging_tmpl.h > +++ b/arch/x86/kvm/mmu/paging_tmpl.h > @@ -671,7 +671,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, > * KVM_REQ_MMU_SYNC is not necessary but it > * expedites the process. 
> */ > - if (sp->unsync_children && > + if (sp->arch.unsync_children && > mmu_sync_children(vcpu, sp, false)) > return RET_PF_RETRY; > } > @@ -921,7 +921,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) > pt_element_t gpte; > gpa_t pte_gpa; > > - if (!sp->unsync) > + if (!sp->arch.unsync) > break; > > pte_gpa = FNAME(get_level1_sp_gpa)(sp); > @@ -942,7 +942,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) > FNAME(prefetch_gpte)(vcpu, sp, sptep, gpte, false); > } > > - if (!sp->unsync_children) > + if (!sp->arch.unsync_children) > break; > } > write_unlock(&vcpu->kvm->mmu_lock); > @@ -974,8 +974,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, > } > > /* > - * Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()) is > - * safe because: > + * Using the information in sp->arch.shadowed_translation > + * (kvm_mmu_page_get_gfn()) is safe because: > * - The spte has a reference to the struct page, so the pfn for a given gfn > * can't change unless all sptes pointing to it are nuked first. > * > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c > index 34d674080170..66231c7ed31e 100644 > --- a/arch/x86/kvm/mmu/tdp_mmu.c > +++ b/arch/x86/kvm/mmu/tdp_mmu.c > @@ -270,7 +270,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu) > static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep, > gfn_t gfn, union kvm_mmu_page_role role) > { > - INIT_LIST_HEAD(&sp->possible_nx_huge_page_link); > + INIT_LIST_HEAD(&sp->arch.possible_nx_huge_page_link); > > set_page_private(virt_to_page(sp->spt), (unsigned long)sp); > > @@ -385,7 +385,7 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp, > { > tdp_unaccount_mmu_page(kvm, sp); > > - if (!sp->nx_huge_page_disallowed) > + if (!sp->arch.nx_huge_page_disallowed) > return; > > if (shared) > @@ -393,7 +393,7 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp, > else > lockdep_assert_held_write(&kvm->mmu_lock); > > - sp->nx_huge_page_disallowed = false; > + sp->arch.nx_huge_page_disallowed = false; > untrack_possible_nx_huge_page(kvm, sp); > > if (shared) > @@ -1181,7 +1181,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) > sp = tdp_mmu_alloc_sp(vcpu); > tdp_mmu_init_child_sp(sp, &iter); > > - sp->nx_huge_page_disallowed = fault->huge_page_disallowed; > + sp->arch.nx_huge_page_disallowed = fault->huge_page_disallowed; > > if (is_shadow_present_pte(iter.old_spte)) > r = tdp_mmu_split_huge_page(kvm, &iter, sp, true); > diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h > index 19d3153051a3..e6a929089715 100644 > --- a/arch/x86/kvm/mmu/tdp_mmu.h > +++ b/arch/x86/kvm/mmu/tdp_mmu.h > @@ -73,7 +73,7 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, > #ifdef CONFIG_X86_64 > static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) > { > - return !sp->shadow_mmu_page; > + return !sp->arch.shadow_mmu_page; > } > #else > static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } > diff --git a/include/kvm/mmu_types.h b/include/kvm/mmu_types.h > index 14099956fdac..a9da33d4baa8 100644 > --- a/include/kvm/mmu_types.h > +++ b/include/kvm/mmu_types.h > @@ -3,8 +3,11 @@ > #define __KVM_MMU_TYPES_H > > #include <linux/bug.h> > -#include <linux/types.h> > +#include <linux/kvm_types.h> > +#include <linux/refcount.h> > #include <linux/stddef.h> > +#include <linux/types.h> > +#include 
<linux/workqueue.h> > > #include <asm/kvm/mmu_types.h> > > @@ -36,4 +39,31 @@ static_assert(sizeof(union kvm_mmu_page_role) == sizeof_field(union kvm_mmu_page > > typedef u64 __rcu *tdp_ptep_t; > > +struct kvm_mmu_page { > + struct list_head link; > + > + union kvm_mmu_page_role role; > + > + /* The start of the GFN region mapped by this shadow page. */ > + gfn_t gfn; I'd like to put in a vote for getting rid of the "shadow page" / "shadow page table entry" terminology through this refactor. I can't count the number of times folks trying to understand the MMU have gotten confused by the overloaded terminology. > + > + /* The page table page. */ > + u64 *spt; > + > + /* The PTE that points to this shadow page. */ > + tdp_ptep_t ptep; Totally fine to not change this, but something like parent_ptep would be clearer. The Shadow MMU uses parent_pteps, which I think is much clearer. Could also change the comment to just: /* The PTE that points to this kvm_mmu_page. */ > + > + /* Used for freeing TDP MMU pages asynchronously. */ > + struct rcu_head rcu_head; > + > + /* The number of references to this shadow page as a root. */ > + refcount_t root_refcount; > + > + /* Used for tearing down an entire page table tree. */ > + struct work_struct tdp_mmu_async_work; > + void *tdp_mmu_async_data; > + > + struct kvm_mmu_page_arch arch; > +}; > + > #endif /* !__KVM_MMU_TYPES_H */ > -- > 2.39.0.rc1.256.g54fd8350bd-goog >
On 12/8/22 20:38, David Matlack wrote:
> This commit increases the size of struct kvm_mmu_page by 64 bytes on
> x86_64 (184 bytes -> 248 bytes). The size of this struct can be reduced
> in future commits by moving TDP MMU root fields into a separate struct
> and by dynamically allocating fields only used by the Shadow MMU.

I think it's already possible to use a union like

	union {
		struct kvm_mmu_page_arch arch;
		struct {
			struct work_struct work;
			void *data;
		};
	};

Paolo
On Mon, Dec 12, 2022 at 2:32 PM Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> On 12/8/22 20:38, David Matlack wrote:
> > This commit increases the size of struct kvm_mmu_page by 64 bytes on
> > x86_64 (184 bytes -> 248 bytes). The size of this struct can be reduced
> > in future commits by moving TDP MMU root fields into a separate struct
> > and by dynamically allocating fields only used by the Shadow MMU.
>
> I think it's already possible to use a union like
>
> 	union {
> 		struct kvm_mmu_page_arch arch;
> 		struct {
> 			struct work_struct work;
> 			void *data;
> 		};
> 	};

That would potentially corrupt kvm_mmu_page_arch.nx_huge_page_disallowed
and possible_nx_huge_page_link, both of which still need to be used by
the TDP MMU on x86.
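[Editor's note] To make the aliasing hazard concrete, here is a minimal standalone sketch, not KVM code: fake_arch, fake_mmu_page and the placeholder value 0x100 are illustrative stand-ins. Because union members share storage, a TDP MMU write to the async fields would land on top of the arch fields it still needs.

	#include <stdio.h>

	struct fake_arch {
		unsigned char nx_huge_page_disallowed;	/* stand-in for the bool */
		void *possible_nx_huge_page_link[2];	/* stand-in for the list_head */
	};

	struct fake_mmu_page {
		union {
			struct fake_arch arch;
			struct {
				void *tdp_mmu_async_data;
				/* struct work_struct tdp_mmu_async_work; ... */
			};
		};
	};

	int main(void)
	{
		struct fake_mmu_page sp = { .arch.nx_huge_page_disallowed = 1 };

		/* Tearing down a paging tree writes the async fields... */
		sp.tdp_mmu_async_data = (void *)0x100;

		/* ...and, because the members alias, clobbers the arch state. */
		printf("nx_huge_page_disallowed = %u\n",
		       (unsigned)sp.arch.nx_huge_page_disallowed);
		return 0;
	}

In the patch as posted, arch stays a plain member, so the arch fields the TDP MMU touches remain valid for the page's whole lifetime; overlapping them with the async fields would first require moving nx_huge_page_disallowed and possible_nx_huge_page_link out of the overlapped region.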
diff --git a/arch/x86/include/asm/kvm/mmu_types.h b/arch/x86/include/asm/kvm/mmu_types.h index 35f893ebab5a..affcb520b482 100644 --- a/arch/x86/include/asm/kvm/mmu_types.h +++ b/arch/x86/include/asm/kvm/mmu_types.h @@ -2,6 +2,8 @@ #ifndef __ASM_KVM_MMU_TYPES_H #define __ASM_KVM_MMU_TYPES_H +#include <linux/bitmap.h> +#include <linux/list.h> #include <linux/types.h> /* @@ -53,4 +55,64 @@ struct kvm_mmu_page_role_arch { u16 passthrough:1; }; +struct kvm_rmap_head { + unsigned long val; +}; + +struct kvm_mmu_page_arch { + struct hlist_node hash_link; + + bool shadow_mmu_page; + bool unsync; + u8 mmu_valid_gen; + + /* + * The shadow page can't be replaced by an equivalent huge page + * because it is being used to map an executable page in the guest + * and the NX huge page mitigation is enabled. + */ + bool nx_huge_page_disallowed; + + /* + * Stores the result of the guest translation being shadowed by each + * SPTE. KVM shadows two types of guest translations: nGPA -> GPA + * (shadow EPT/NPT) and GVA -> GPA (traditional shadow paging). In both + * cases the result of the translation is a GPA and a set of access + * constraints. + * + * The GFN is stored in the upper bits (PAGE_SHIFT) and the shadowed + * access permissions are stored in the lower bits. Note, for + * convenience and uniformity across guests, the access permissions are + * stored in KVM format (e.g. ACC_EXEC_MASK) not the raw guest format. + */ + u64 *shadowed_translation; + + unsigned int unsync_children; + + /* Rmap pointers to all parent sptes. */ + struct kvm_rmap_head parent_ptes; + + DECLARE_BITMAP(unsync_child_bitmap, 512); + + /* + * Tracks shadow pages that, if zapped, would allow KVM to create an NX + * huge page. A shadow page will have nx_huge_page_disallowed set but + * not be on the list if a huge page is disallowed for other reasons, + * e.g. because KVM is shadowing a PTE at the same gfn, the memslot + * isn't properly aligned, etc... + */ + struct list_head possible_nx_huge_page_link; + +#ifdef CONFIG_X86_32 + /* + * Used out of the mmu-lock to avoid reading spte values while an + * update is in progress; see the comments in __get_spte_lockless(). + */ + int clear_spte_count; +#endif + + /* Number of writes since the last time traversal visited this page. 
*/ + atomic_t write_flooding_count; +}; + #endif /* !__ASM_KVM_MMU_TYPES_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ebcd7a0dabef..f5743a652e10 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -329,10 +329,6 @@ union kvm_cpu_role { }; }; -struct kvm_rmap_head { - unsigned long val; -}; - struct kvm_pio_request { unsigned long linear_rip; unsigned long count; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 11cef930d5ed..e47f35878ab5 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -350,7 +350,7 @@ static void count_spte_clear(u64 *sptep, u64 spte) /* Ensure the spte is completely set before we increase the count */ smp_wmb(); - sp->clear_spte_count++; + sp->arch.clear_spte_count++; } static void __set_spte(u64 *sptep, u64 spte) @@ -432,7 +432,7 @@ static u64 __get_spte_lockless(u64 *sptep) int count; retry: - count = sp->clear_spte_count; + count = sp->arch.clear_spte_count; smp_rmb(); spte.spte_low = orig->spte_low; @@ -442,7 +442,7 @@ static u64 __get_spte_lockless(u64 *sptep) smp_rmb(); if (unlikely(spte.spte_low != orig->spte_low || - count != sp->clear_spte_count)) + count != sp->arch.clear_spte_count)) goto retry; return spte.spte; @@ -699,7 +699,7 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) return sp->gfn; if (!sp->role.arch.direct) - return sp->shadowed_translation[index] >> PAGE_SHIFT; + return sp->arch.shadowed_translation[index] >> PAGE_SHIFT; return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS)); } @@ -713,7 +713,7 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index) { if (sp_has_gptes(sp)) - return sp->shadowed_translation[index] & ACC_ALL; + return sp->arch.shadowed_translation[index] & ACC_ALL; /* * For direct MMUs (e.g. TDP or non-paging guests) or passthrough SPs, @@ -734,7 +734,7 @@ static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index, gfn_t gfn, unsigned int access) { if (sp_has_gptes(sp)) { - sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access; + sp->arch.shadowed_translation[index] = (gfn << PAGE_SHIFT) | access; return; } @@ -825,18 +825,18 @@ void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) * on the list if KVM is reusing an existing shadow page, i.e. if KVM * links a shadow page at multiple points. 
*/ - if (!list_empty(&sp->possible_nx_huge_page_link)) + if (!list_empty(&sp->arch.possible_nx_huge_page_link)) return; ++kvm->stat.nx_lpage_splits; - list_add_tail(&sp->possible_nx_huge_page_link, + list_add_tail(&sp->arch.possible_nx_huge_page_link, &kvm->arch.possible_nx_huge_pages); } static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, bool nx_huge_page_possible) { - sp->nx_huge_page_disallowed = true; + sp->arch.nx_huge_page_disallowed = true; if (nx_huge_page_possible) track_possible_nx_huge_page(kvm, sp); @@ -861,16 +861,16 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - if (list_empty(&sp->possible_nx_huge_page_link)) + if (list_empty(&sp->arch.possible_nx_huge_page_link)) return; --kvm->stat.nx_lpage_splits; - list_del_init(&sp->possible_nx_huge_page_link); + list_del_init(&sp->arch.possible_nx_huge_page_link); } static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - sp->nx_huge_page_disallowed = false; + sp->arch.nx_huge_page_disallowed = false; untrack_possible_nx_huge_page(kvm, sp); } @@ -1720,11 +1720,11 @@ static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp) { MMU_WARN_ON(!is_empty_shadow_page(sp->spt)); - hlist_del(&sp->hash_link); + hlist_del(&sp->arch.hash_link); list_del(&sp->link); free_page((unsigned long)sp->spt); if (!sp->role.arch.direct) - free_page((unsigned long)sp->shadowed_translation); + free_page((unsigned long)sp->arch.shadowed_translation); kmem_cache_free(mmu_page_header_cache, sp); } @@ -1739,13 +1739,13 @@ static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache, if (!parent_pte) return; - pte_list_add(cache, parent_pte, &sp->parent_ptes); + pte_list_add(cache, parent_pte, &sp->arch.parent_ptes); } static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, u64 *parent_pte) { - pte_list_remove(parent_pte, &sp->parent_ptes); + pte_list_remove(parent_pte, &sp->arch.parent_ptes); } static void drop_parent_pte(struct kvm_mmu_page *sp, @@ -1761,7 +1761,7 @@ static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) u64 *sptep; struct rmap_iterator iter; - for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) { + for_each_rmap_spte(&sp->arch.parent_ptes, &iter, sptep) { mark_unsync(sptep); } } @@ -1771,9 +1771,9 @@ static void mark_unsync(u64 *spte) struct kvm_mmu_page *sp; sp = sptep_to_sp(spte); - if (__test_and_set_bit(spte_index(spte), sp->unsync_child_bitmap)) + if (__test_and_set_bit(spte_index(spte), sp->arch.unsync_child_bitmap)) return; - if (sp->unsync_children++) + if (sp->arch.unsync_children++) return; kvm_mmu_mark_parents_unsync(sp); } @@ -1799,7 +1799,7 @@ static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, { int i; - if (sp->unsync) + if (sp->arch.unsync) for (i=0; i < pvec->nr; i++) if (pvec->page[i].sp == sp) return 0; @@ -1812,9 +1812,9 @@ static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx) { - --sp->unsync_children; - WARN_ON((int)sp->unsync_children < 0); - __clear_bit(idx, sp->unsync_child_bitmap); + --sp->arch.unsync_children; + WARN_ON((int)sp->arch.unsync_children < 0); + __clear_bit(idx, sp->arch.unsync_child_bitmap); } static int __mmu_unsync_walk(struct kvm_mmu_page *sp, @@ -1822,7 +1822,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, { 
int i, ret, nr_unsync_leaf = 0; - for_each_set_bit(i, sp->unsync_child_bitmap, 512) { + for_each_set_bit(i, sp->arch.unsync_child_bitmap, 512) { struct kvm_mmu_page *child; u64 ent = sp->spt[i]; @@ -1833,7 +1833,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, child = spte_to_child_sp(ent); - if (child->unsync_children) { + if (child->arch.unsync_children) { if (mmu_pages_add(pvec, child, i)) return -ENOSPC; @@ -1845,7 +1845,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, nr_unsync_leaf += ret; } else return ret; - } else if (child->unsync) { + } else if (child->arch.unsync) { nr_unsync_leaf++; if (mmu_pages_add(pvec, child, i)) return -ENOSPC; @@ -1862,7 +1862,7 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp, struct kvm_mmu_pages *pvec) { pvec->nr = 0; - if (!sp->unsync_children) + if (!sp->arch.unsync_children) return 0; mmu_pages_add(pvec, sp, INVALID_INDEX); @@ -1871,9 +1871,9 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp, static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - WARN_ON(!sp->unsync); + WARN_ON(!sp->arch.unsync); trace_kvm_mmu_sync_page(sp); - sp->unsync = 0; + sp->arch.unsync = 0; --kvm->stat.mmu_unsync; } @@ -1894,7 +1894,7 @@ static bool sp_has_gptes(struct kvm_mmu_page *sp) } #define for_each_valid_sp(_kvm, _sp, _list) \ - hlist_for_each_entry(_sp, _list, hash_link) \ + hlist_for_each_entry(_sp, _list, arch.hash_link) \ if (is_obsolete_sp((_kvm), (_sp))) { \ } else @@ -1934,7 +1934,7 @@ static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) /* TDP MMU pages do not use the MMU generation. */ return !is_tdp_mmu_page(sp) && - unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); + unlikely(sp->arch.mmu_valid_gen != kvm->arch.mmu_valid_gen); } struct mmu_page_path { @@ -2006,7 +2006,7 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents) WARN_ON(idx == INVALID_INDEX); clear_unsync_child_bit(sp, idx); level++; - } while (!sp->unsync_children); + } while (!sp->arch.unsync_children); } static int mmu_sync_children(struct kvm_vcpu *vcpu, @@ -2053,7 +2053,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu, static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) { - atomic_set(&sp->write_flooding_count, 0); + atomic_set(&sp->arch.write_flooding_count, 0); } static void clear_sp_write_flooding_count(u64 *spte) @@ -2094,7 +2094,7 @@ static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm, * Unsync pages must not be left as is, because the new * upper-level page will be write-protected. 
*/ - if (role.level > PG_LEVEL_4K && sp->unsync) + if (role.level > PG_LEVEL_4K && sp->arch.unsync) kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); continue; @@ -2104,7 +2104,7 @@ static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm, if (sp->role.arch.direct) goto out; - if (sp->unsync) { + if (sp->arch.unsync) { if (KVM_BUG_ON(!vcpu, kvm)) break; @@ -2163,25 +2163,26 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm, sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache); sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache); if (!role.arch.direct) - sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache); + sp->arch.shadowed_translation = + kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); - INIT_LIST_HEAD(&sp->possible_nx_huge_page_link); + INIT_LIST_HEAD(&sp->arch.possible_nx_huge_page_link); /* * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() * depends on valid pages being added to the head of the list. See * comments in kvm_zap_obsolete_pages(). */ - sp->mmu_valid_gen = kvm->arch.mmu_valid_gen; + sp->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen; list_add(&sp->link, &kvm->arch.active_mmu_pages); kvm_account_mmu_page(kvm, sp); sp->gfn = gfn; sp->role = role; - sp->shadow_mmu_page = true; - hlist_add_head(&sp->hash_link, sp_list); + sp->arch.shadow_mmu_page = true; + hlist_add_head(&sp->arch.hash_link, sp_list); if (sp_has_gptes(sp)) account_shadowed(kvm, sp); @@ -2368,7 +2369,7 @@ static void __link_shadow_page(struct kvm *kvm, mmu_page_add_parent_pte(cache, sp, sptep); - if (sp->unsync_children || sp->unsync) + if (sp->arch.unsync_children || sp->arch.unsync) mark_unsync(sptep); } @@ -2421,7 +2422,8 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, * avoids retaining a large number of stale nested SPs. */ if (tdp_enabled && invalid_list && - child->role.arch.guest_mode && !child->parent_ptes.val) + child->role.arch.guest_mode && + !child->arch.parent_ptes.val) return kvm_mmu_prepare_zap_page(kvm, child, invalid_list); } @@ -2449,7 +2451,7 @@ static void kvm_mmu_unlink_parents(struct kvm_mmu_page *sp) u64 *sptep; struct rmap_iterator iter; - while ((sptep = rmap_get_first(&sp->parent_ptes, &iter))) + while ((sptep = rmap_get_first(&sp->arch.parent_ptes, &iter))) drop_parent_pte(sp, sptep); } @@ -2496,7 +2498,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, if (!sp->role.invalid && sp_has_gptes(sp)) unaccount_shadowed(kvm, sp); - if (sp->unsync) + if (sp->arch.unsync) kvm_unlink_unsync_page(kvm, sp); if (!refcount_read(&sp->root_refcount)) { /* Count self */ @@ -2527,7 +2529,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, zapped_root = !is_obsolete_sp(kvm, sp); } - if (sp->nx_huge_page_disallowed) + if (sp->arch.nx_huge_page_disallowed) unaccount_nx_huge_page(kvm, sp); sp->role.invalid = 1; @@ -2704,7 +2706,7 @@ static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { trace_kvm_mmu_unsync_page(sp); ++kvm->stat.mmu_unsync; - sp->unsync = 1; + sp->arch.unsync = 1; kvm_mmu_mark_parents_unsync(sp); } @@ -2739,7 +2741,7 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, if (!can_unsync) return -EPERM; - if (sp->unsync) + if (sp->arch.unsync) continue; if (prefetch) @@ -2764,7 +2766,7 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, * for write, i.e. 
unsync cannot transition from 0->1 * while this CPU holds mmu_lock for read (or write). */ - if (READ_ONCE(sp->unsync)) + if (READ_ONCE(sp->arch.unsync)) continue; } @@ -2796,8 +2798,8 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, * 2.2 Guest issues TLB flush. * That causes a VM Exit. * - * 2.3 Walking of unsync pages sees sp->unsync is - * false and skips the page. + * 2.3 Walking of unsync pages sees sp->arch.unsync + * is false and skips the page. * * 2.4 Guest accesses GVA X. * Since the mapping in the SP was not updated, @@ -2805,7 +2807,7 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, * gets used. * 1.1 Host marks SP * as unsync - * (sp->unsync = true) + * (sp->arch.unsync = true) * * The write barrier below ensures that 1.1 happens before 1.2 and thus * the situation in 2.4 does not arise. It pairs with the read barrier @@ -3126,7 +3128,7 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_ cur_level == fault->goal_level && is_shadow_present_pte(spte) && !is_large_pte(spte) && - spte_to_child_sp(spte)->nx_huge_page_disallowed) { + spte_to_child_sp(spte)->arch.nx_huge_page_disallowed) { /* * A small SPTE exists for this pfn, but FNAME(fetch), * direct_map(), or kvm_tdp_mmu_map() would like to create a @@ -3902,7 +3904,7 @@ static bool is_unsync_root(hpa_t root) /* * The read barrier orders the CPU's read of SPTE.W during the page table - * walk before the reads of sp->unsync/sp->unsync_children here. + * walk before the reads of sp->arch.{unsync,unsync_children} here. * * Even if another CPU was marking the SP as unsync-ed simultaneously, * any guest page table changes are not guaranteed to be visible anyway @@ -3922,7 +3924,7 @@ static bool is_unsync_root(hpa_t root) if (WARN_ON_ONCE(!sp)) return false; - if (sp->unsync || sp->unsync_children) + if (sp->arch.unsync || sp->arch.unsync_children) return true; return false; @@ -5510,8 +5512,8 @@ static bool detect_write_flooding(struct kvm_mmu_page *sp) if (sp->role.level == PG_LEVEL_4K) return false; - atomic_inc(&sp->write_flooding_count); - return atomic_read(&sp->write_flooding_count) >= 3; + atomic_inc(&sp->arch.write_flooding_count); + return atomic_read(&sp->arch.write_flooding_count) >= 3; } /* @@ -6389,7 +6391,7 @@ static bool shadow_mmu_try_split_huge_pages(struct kvm *kvm, continue; /* SPs with level >PG_LEVEL_4K should never by unsync. */ - if (WARN_ON_ONCE(sp->unsync)) + if (WARN_ON_ONCE(sp->arch.unsync)) continue; /* Don't bother splitting huge pages on invalid SPs. 
*/ @@ -6941,8 +6943,8 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm) */ sp = list_first_entry(&kvm->arch.possible_nx_huge_pages, struct kvm_mmu_page, - possible_nx_huge_page_link); - WARN_ON_ONCE(!sp->nx_huge_page_disallowed); + arch.possible_nx_huge_page_link); + WARN_ON_ONCE(!sp->arch.nx_huge_page_disallowed); WARN_ON_ONCE(!sp->role.arch.direct); /* @@ -6977,7 +6979,7 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm) flush |= kvm_tdp_mmu_zap_sp(kvm, sp); else kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - WARN_ON_ONCE(sp->nx_huge_page_disallowed); + WARN_ON_ONCE(sp->arch.nx_huge_page_disallowed); if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index fd4990c8b0e9..af2ae4887e35 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -44,89 +44,6 @@ extern bool dbg; #define INVALID_PAE_ROOT 0 #define IS_VALID_PAE_ROOT(x) (!!(x)) -struct kvm_mmu_page { - /* - * Note, "link" through "spt" fit in a single 64 byte cache line on - * 64-bit kernels, keep it that way unless there's a reason not to. - */ - struct list_head link; - struct hlist_node hash_link; - - bool shadow_mmu_page; - bool unsync; - u8 mmu_valid_gen; - - /* - * The shadow page can't be replaced by an equivalent huge page - * because it is being used to map an executable page in the guest - * and the NX huge page mitigation is enabled. - */ - bool nx_huge_page_disallowed; - - /* - * The following two entries are used to key the shadow page in the - * hash table. - */ - union kvm_mmu_page_role role; - gfn_t gfn; - - u64 *spt; - - /* - * Stores the result of the guest translation being shadowed by each - * SPTE. KVM shadows two types of guest translations: nGPA -> GPA - * (shadow EPT/NPT) and GVA -> GPA (traditional shadow paging). In both - * cases the result of the translation is a GPA and a set of access - * constraints. - * - * The GFN is stored in the upper bits (PAGE_SHIFT) and the shadowed - * access permissions are stored in the lower bits. Note, for - * convenience and uniformity across guests, the access permissions are - * stored in KVM format (e.g. ACC_EXEC_MASK) not the raw guest format. - */ - u64 *shadowed_translation; - - /* Currently serving as active root */ - refcount_t root_refcount; - - unsigned int unsync_children; - union { - struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ - tdp_ptep_t ptep; - }; - union { - DECLARE_BITMAP(unsync_child_bitmap, 512); - struct { - struct work_struct tdp_mmu_async_work; - void *tdp_mmu_async_data; - }; - }; - - /* - * Tracks shadow pages that, if zapped, would allow KVM to create an NX - * huge page. A shadow page will have nx_huge_page_disallowed set but - * not be on the list if a huge page is disallowed for other reasons, - * e.g. because KVM is shadowing a PTE at the same gfn, the memslot - * isn't properly aligned, etc... - */ - struct list_head possible_nx_huge_page_link; -#ifdef CONFIG_X86_32 - /* - * Used out of the mmu-lock to avoid reading spte values while an - * update is in progress; see the comments in __get_spte_lockless(). - */ - int clear_spte_count; -#endif - - /* Number of writes since the last time traversal visited this page. */ - atomic_t write_flooding_count; - -#ifdef CONFIG_X86_64 - /* Used for freeing the page asynchronously if it is a TDP MMU page. 
*/ - struct rcu_head rcu_head; -#endif -}; - extern struct kmem_cache *mmu_page_header_cache; static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp) diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index ffd10ce3eae3..335f26dabdf3 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -16,11 +16,11 @@ __field(bool, unsync) #define KVM_MMU_PAGE_ASSIGN(sp) \ - __entry->mmu_valid_gen = sp->mmu_valid_gen; \ + __entry->mmu_valid_gen = sp->arch.mmu_valid_gen; \ __entry->gfn = sp->gfn; \ __entry->role = sp->role.word; \ __entry->root_count = refcount_read(&sp->root_refcount); \ - __entry->unsync = sp->unsync; + __entry->unsync = sp->arch.unsync; #define KVM_MMU_PAGE_PRINTK() ({ \ const char *saved_ptr = trace_seq_buffer_ptr(p); \ diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index e15ec1c473da..18bb92b70a01 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -671,7 +671,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, * KVM_REQ_MMU_SYNC is not necessary but it * expedites the process. */ - if (sp->unsync_children && + if (sp->arch.unsync_children && mmu_sync_children(vcpu, sp, false)) return RET_PF_RETRY; } @@ -921,7 +921,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) pt_element_t gpte; gpa_t pte_gpa; - if (!sp->unsync) + if (!sp->arch.unsync) break; pte_gpa = FNAME(get_level1_sp_gpa)(sp); @@ -942,7 +942,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) FNAME(prefetch_gpte)(vcpu, sp, sptep, gpte, false); } - if (!sp->unsync_children) + if (!sp->arch.unsync_children) break; } write_unlock(&vcpu->kvm->mmu_lock); @@ -974,8 +974,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } /* - * Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()) is - * safe because: + * Using the information in sp->arch.shadowed_translation + * (kvm_mmu_page_get_gfn()) is safe because: * - The spte has a reference to the struct page, so the pfn for a given gfn * can't change unless all sptes pointing to it are nuked first. 
* diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 34d674080170..66231c7ed31e 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -270,7 +270,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu) static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep, gfn_t gfn, union kvm_mmu_page_role role) { - INIT_LIST_HEAD(&sp->possible_nx_huge_page_link); + INIT_LIST_HEAD(&sp->arch.possible_nx_huge_page_link); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); @@ -385,7 +385,7 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp, { tdp_unaccount_mmu_page(kvm, sp); - if (!sp->nx_huge_page_disallowed) + if (!sp->arch.nx_huge_page_disallowed) return; if (shared) @@ -393,7 +393,7 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp, else lockdep_assert_held_write(&kvm->mmu_lock); - sp->nx_huge_page_disallowed = false; + sp->arch.nx_huge_page_disallowed = false; untrack_possible_nx_huge_page(kvm, sp); if (shared) @@ -1181,7 +1181,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) sp = tdp_mmu_alloc_sp(vcpu); tdp_mmu_init_child_sp(sp, &iter); - sp->nx_huge_page_disallowed = fault->huge_page_disallowed; + sp->arch.nx_huge_page_disallowed = fault->huge_page_disallowed; if (is_shadow_present_pte(iter.old_spte)) r = tdp_mmu_split_huge_page(kvm, &iter, sp, true); diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 19d3153051a3..e6a929089715 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -73,7 +73,7 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, #ifdef CONFIG_X86_64 static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { - return !sp->shadow_mmu_page; + return !sp->arch.shadow_mmu_page; } #else static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } diff --git a/include/kvm/mmu_types.h b/include/kvm/mmu_types.h index 14099956fdac..a9da33d4baa8 100644 --- a/include/kvm/mmu_types.h +++ b/include/kvm/mmu_types.h @@ -3,8 +3,11 @@ #define __KVM_MMU_TYPES_H #include <linux/bug.h> -#include <linux/types.h> +#include <linux/kvm_types.h> +#include <linux/refcount.h> #include <linux/stddef.h> +#include <linux/types.h> +#include <linux/workqueue.h> #include <asm/kvm/mmu_types.h> @@ -36,4 +39,31 @@ static_assert(sizeof(union kvm_mmu_page_role) == sizeof_field(union kvm_mmu_page typedef u64 __rcu *tdp_ptep_t; +struct kvm_mmu_page { + struct list_head link; + + union kvm_mmu_page_role role; + + /* The start of the GFN region mapped by this shadow page. */ + gfn_t gfn; + + /* The page table page. */ + u64 *spt; + + /* The PTE that points to this shadow page. */ + tdp_ptep_t ptep; + + /* Used for freeing TDP MMU pages asynchronously. */ + struct rcu_head rcu_head; + + /* The number of references to this shadow page as a root. */ + refcount_t root_refcount; + + /* Used for tearing down an entire page table tree. */ + struct work_struct tdp_mmu_async_work; + void *tdp_mmu_async_data; + + struct kvm_mmu_page_arch arch; +}; + #endif /* !__KVM_MMU_TYPES_H */
Move struct kvm_mmu_page to common code and all x86-specific fields into
kvm_mmu_page_arch.

This commit increases the size of struct kvm_mmu_page by 64 bytes on
x86_64 (184 bytes -> 248 bytes). The size of this struct can be reduced
in future commits by moving TDP MMU root fields into a separate struct
and by dynamically allocating fields only used by the Shadow MMU.

No functional change intended.

Signed-off-by: David Matlack <dmatlack@google.com>
---
 arch/x86/include/asm/kvm/mmu_types.h |  62 ++++++++++++++
 arch/x86/include/asm/kvm_host.h      |   4 -
 arch/x86/kvm/mmu/mmu.c               | 122 ++++++++++++++-------------
 arch/x86/kvm/mmu/mmu_internal.h      |  83 ------------------
 arch/x86/kvm/mmu/mmutrace.h          |   4 +-
 arch/x86/kvm/mmu/paging_tmpl.h       |  10 +--
 arch/x86/kvm/mmu/tdp_mmu.c           |   8 +-
 arch/x86/kvm/mmu/tdp_mmu.h           |   2 +-
 include/kvm/mmu_types.h              |  32 ++++++-
 9 files changed, 167 insertions(+), 160 deletions(-)
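[Editor's note] As a rough, hypothetical illustration of the size-reduction direction mentioned in the commit message, not code from this series (the 120-byte arch blob and the field layout are assumptions): embedding the arch state inline charges every page for it, whereas a pointer to on-demand-allocated arch state costs only 8 bytes on pages that never use it.

	#include <stdio.h>

	/* Rough stand-in for struct kvm_mmu_page_arch (x86-only fields). */
	struct arch_state {
		unsigned char bytes[120];
	};

	/* Common fields roughly as in this patch, with arch embedded inline. */
	struct page_inline {
		void *link[2];
		unsigned int role;
		unsigned long long gfn;
		void *spt;
		void *ptep;
		struct arch_state arch;
	};

	/* Hypothetical follow-up shape: arch allocated only for Shadow MMU pages. */
	struct page_lazy {
		void *link[2];
		unsigned int role;
		unsigned long long gfn;
		void *spt;
		void *ptep;
		struct arch_state *arch;
	};

	int main(void)
	{
		printf("inline arch: %zu bytes, lazy arch: %zu bytes\n",
		       sizeof(struct page_inline), sizeof(struct page_lazy));
		return 0;
	}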