[v2,10/20] kvm: x86/mmu: Add TDP MMU PF handler

Message ID	20201014182700.2888246-11-bgardon@google.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <SRS0=7LGy=DV=vger.kernel.org=kvm-owner@kernel.org> Sender: "bgardon via sendgmr" <bgardon@bgardon.sea.corp.google.com> Date: Wed, 14 Oct 2020 11:26:50 -0700 In-Reply-To: <20201014182700.2888246-1-bgardon@google.com> Message-Id: <20201014182700.2888246-11-bgardon@google.com> Mime-Version: 1.0 References: <20201014182700.2888246-1-bgardon@google.com> Subject: [PATCH v2 10/20] kvm: x86/mmu: Add TDP MMU PF handler From: Ben Gardon <bgardon@google.com> To: linux-kernel@vger.kernel.org, kvm@vger.kernel.org Cc: Cannon Matthews <cannonmatthews@google.com>, Paolo Bonzini <pbonzini@redhat.com>, Peter Xu <peterx@redhat.com>, Sean Christopherson <sean.j.christopherson@intel.com>, Peter Shier <pshier@google.com>, Peter Feiner <pfeiner@google.com>, Junaid Shahid <junaids@google.com>, Jim Mattson <jmattson@google.com>, Yulei Zhang <yulei.kernel@gmail.com>, Wanpeng Li <kernellwp@gmail.com>, Vitaly Kuznetsov <vkuznets@redhat.com>, Xiao Guangrong <xiaoguangrong.eric@gmail.com>, Ben Gardon <bgardon@google.com> Content-Type: text/plain; charset="UTF-8" Precedence: bulk
Series	Introduce the TDP MMU \| expand [v2,00/20] Introduce the TDP MMU [v2,01/20] kvm: x86/mmu: Separate making SPTEs from set_spte [v2,02/20] kvm: x86/mmu: Introduce tdp_iter [v2,03/20] kvm: x86/mmu: Init / Uninit the TDP MMU [v2,04/20] kvm: x86/mmu: Allocate and free TDP MMU roots [v2,05/20] kvm: x86/mmu: Add functions to handle changed TDP SPTEs [v2,06/20] KVM: Cache as_id in kvm_memory_slot [v2,07/20] kvm: x86/mmu: Support zapping SPTEs in the TDP MMU [v2,08/20] kvm: x86/mmu: Separate making non-leaf sptes from link_shadow_page [v2,09/20] kvm: x86/mmu: Remove disallowed_hugepage_adjust shadow_walk_iterator arg [v2,10/20] kvm: x86/mmu: Add TDP MMU PF handler [v2,11/20] kvm: x86/mmu: Allocate struct kvm_mmu_pages for all pages in TDP MMU [v2,12/20] kvm: x86/mmu: Support invalidate range MMU notifier for TDP MMU [v2,13/20] kvm: x86/mmu: Add access tracking for tdp_mmu [v2,14/20] kvm: x86/mmu: Support changed pte notifier in tdp MMU [v2,15/20] kvm: x86/mmu: Support dirty logging for the TDP MMU [v2,16/20] kvm: x86/mmu: Support disabling dirty logging for the tdp MMU [v2,17/20] kvm: x86/mmu: Support write protection for nesting in tdp MMU [v2,18/20] kvm: x86/mmu: Support MMIO in the TDP MMU [v2,19/20] kvm: x86/mmu: Don't clear write flooding count for direct roots [v2,20/20] kvm: x86/mmu: NX largepage recovery for TDP MMU

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 288b97e96202e..421a12a247b67 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -141,23 +141,6 @@ enum { /* make pte_list_desc fit well in cache line */ #define PTE_LIST_EXT 3 -/* - * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault(). - * - * RET_PF_RETRY: let CPU fault again on the address. - * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. - * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. - * RET_PF_FIXED: The faulting entry has been fixed. - * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. - */ -enum { - RET_PF_RETRY = 0, - RET_PF_EMULATE, - RET_PF_INVALID, - RET_PF_FIXED, - RET_PF_SPURIOUS, -}; - struct pte_list_desc { u64 *sptes[PTE_LIST_EXT]; struct pte_list_desc *more; @@ -195,19 +178,11 @@ static struct percpu_counter kvm_total_used_mmu_pages; static u64 __read_mostly shadow_nx_mask; static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ static u64 __read_mostly shadow_user_mask; -static u64 __read_mostly shadow_accessed_mask; static u64 __read_mostly shadow_mmio_value; static u64 __read_mostly shadow_mmio_access_mask; static u64 __read_mostly shadow_present_mask; static u64 __read_mostly shadow_me_mask; -/* - * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK; - * shadow_acc_track_mask is the set of bits to be cleared in non-accessed - * pages. - */ -static u64 __read_mostly shadow_acc_track_mask; - /* * The mask/shift to use for saving the original R/X bits when marking the PTE * as not-present for access tracking purposes. We do not save the W bit as the @@ -314,22 +289,11 @@ static inline bool spte_ad_need_write_protect(u64 spte) return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK; } -static bool is_nx_huge_page_enabled(void) +bool is_nx_huge_page_enabled(void) { return READ_ONCE(nx_huge_pages); } -static inline u64 spte_shadow_accessed_mask(u64 spte) -{ - MMU_WARN_ON(is_mmio_spte(spte)); - return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; -} - -static inline bool is_access_track_spte(u64 spte) -{ - return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; -} - /* * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of * the memslots generation and is derived as follows: @@ -377,7 +341,7 @@ static u64 get_mmio_spte_generation(u64 spte) return gen; } -static u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) +u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) { u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK; @@ -2468,7 +2432,7 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) __shadow_walk_next(iterator, *iterator->sptep); } -static u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) +u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) { u64 spte; @@ -2886,15 +2850,10 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) E820_TYPE_RAM); } -/* Bits which may be returned by set_spte() */ -#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) -#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) -#define SET_SPTE_SPURIOUS BIT(2) - -static int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, - gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative, - bool can_unsync, bool host_writable, bool ad_disabled, - u64 *new_spte) +int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, + gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative, + bool can_unsync, bool host_writable, bool ad_disabled, + u64 *new_spte) { u64 spte = 0; int ret = 0; @@ -3187,9 +3146,9 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, return level; } -static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, - int max_level, kvm_pfn_t *pfnp, - bool huge_page_disallowed, int *req_level) +int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, int max_level, + kvm_pfn_t *pfnp, bool huge_page_disallowed, + int *req_level) { struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; @@ -3243,8 +3202,8 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, return level; } -static void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, - kvm_pfn_t *pfnp, int *levelp) +void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, + kvm_pfn_t *pfnp, int *levelp) { int level = *levelp; @@ -4068,9 +4027,11 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; - r = fast_page_fault(vcpu, gpa, error_code); - if (r != RET_PF_INVALID) - return r; + if (!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) { + r = fast_page_fault(vcpu, gpa, error_code); + if (r != RET_PF_INVALID) + return r; + } r = mmu_topup_memory_caches(vcpu, false); if (r) @@ -4092,8 +4053,13 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; - r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn, - prefault, is_tdp); + + if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, + max_level, pfn, prefault, is_tdp); + else + r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, + pfn, prefault, is_tdp); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index c053a157e4d55..f7fe5616eff98 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -121,6 +121,14 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) static u64 __read_mostly shadow_dirty_mask; +static u64 __read_mostly shadow_accessed_mask; + +/* + * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK; + * shadow_acc_track_mask is the set of bits to be cleared in non-accessed + * pages. + */ +static u64 __read_mostly shadow_acc_track_mask; /* Functions for interpreting SPTEs */ static inline bool is_mmio_spte(u64 spte) @@ -186,6 +194,57 @@ static inline bool is_dirty_spte(u64 spte) return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK; } +static inline u64 spte_shadow_accessed_mask(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; +} + +static inline bool is_access_track_spte(u64 spte) +{ + return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; +} + void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, u64 start_gfn, u64 pages); + +/* + * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault(). + * + * RET_PF_RETRY: let CPU fault again on the address. + * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. + * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. + * RET_PF_FIXED: The faulting entry has been fixed. + * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. + */ +enum { + RET_PF_RETRY = 0, + RET_PF_EMULATE, + RET_PF_INVALID, + RET_PF_FIXED, + RET_PF_SPURIOUS, +}; + +/* Bits which may be returned by set_spte() */ +#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) +#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) +#define SET_SPTE_SPURIOUS BIT(2) + +int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, + gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative, + bool can_unsync, bool host_writable, bool ad_disabled, + u64 *new_spte); +u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); +u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); + +int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, int max_level, + kvm_pfn_t *pfnp, bool huge_page_disallowed, + int *req_level); +void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, + kvm_pfn_t *pfnp, int *levelp); + +bool is_nx_huge_page_enabled(void); + +void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); + #endif /* __KVM_X86_MMU_INTERNAL_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 9b5cd4a832f1a..f92c12c4ce31a 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -291,6 +291,10 @@ static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, #define tdp_root_for_each_pte(_iter, _root, _start, _end) \ for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end) +#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ + for_each_tdp_pte(_iter, __va(_mmu->root_hpa), \ + _mmu->shadow_root_level, _start, _end) + static bool tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter) { if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { @@ -371,3 +375,123 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm) if (flush) kvm_flush_remote_tlbs(kvm); } + +/* + * Installs a last-level SPTE to handle a TDP page fault. + * (NPT/EPT violation/misconfiguration) + */ +static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, + int map_writable, + struct tdp_iter *iter, + kvm_pfn_t pfn, bool prefault) +{ + u64 new_spte; + int ret = 0; + int make_spte_ret = 0; + + if (unlikely(is_noslot_pfn(pfn))) + new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); + else + make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn, + pfn, iter->old_spte, prefault, true, + map_writable, !shadow_accessed_mask, + &new_spte); + + tdp_mmu_set_spte(vcpu->kvm, iter, new_spte); + + /* + * If the page fault was caused by a write but the page is write + * protected, emulation is needed. If the emulation was skipped, + * the vCPU would have the same fault again. + */ + if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) { + if (write) + ret = RET_PF_EMULATE; + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } + + /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ + if (unlikely(is_mmio_spte(new_spte))) + ret = RET_PF_EMULATE; + + if (!prefault) + vcpu->stat.pf_fixed++; + + return ret; +} + +/* + * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing + * page tables and SPTEs to translate the faulting guest physical address. + */ +int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + int map_writable, int max_level, kvm_pfn_t pfn, + bool prefault, bool is_tdp) +{ + bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); + bool write = error_code & PFERR_WRITE_MASK; + bool exec = error_code & PFERR_FETCH_MASK; + bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; + struct kvm_mmu *mmu = vcpu->arch.mmu; + struct tdp_iter iter; + struct kvm_mmu_memory_cache *pf_pt_cache = + &vcpu->arch.mmu_shadow_page_cache; + u64 *child_pt; + u64 new_spte; + int ret; + gfn_t gfn = gpa >> PAGE_SHIFT; + int level; + int req_level; + + BUG_ON(!is_tdp); + BUG_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); + BUG_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)); + + level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, + huge_page_disallowed, &req_level); + + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { + if (nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(iter.old_spte, gfn, + iter.level, &pfn, &level); + + if (iter.level == level) + break; + + /* + * If there is an SPTE mapping a large page at a higher level + * than the target, that SPTE must be cleared and replaced + * with a non-leaf SPTE. + */ + if (is_shadow_present_pte(iter.old_spte) && + is_large_pte(iter.old_spte)) { + tdp_mmu_set_spte(vcpu->kvm, &iter, 0); + + kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn, + KVM_PAGES_PER_HPAGE(iter.level)); + + /* + * The iter must explicitly re-read the spte here + * because the new value informs the !present + * path below. + */ + iter.old_spte = READ_ONCE(*iter.sptep); + } + + if (!is_shadow_present_pte(iter.old_spte)) { + child_pt = kvm_mmu_memory_cache_alloc(pf_pt_cache); + clear_page(child_pt); + new_spte = make_nonleaf_spte(child_pt, + !shadow_accessed_mask); + + tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte); + } + } + + BUG_ON(iter.level != level); + + ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter, + pfn, prefault); + + return ret; +} diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 6de2d007fc03c..4d111a4dd332f 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -14,4 +14,9 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root); bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end); void kvm_tdp_mmu_zap_all(struct kvm *kvm); + +int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + int map_writable, int max_level, kvm_pfn_t pfn, + bool prefault, bool is_tdp); + #endif /* __KVM_X86_MMU_TDP_MMU_H */

[v2,10/20] kvm: x86/mmu: Add TDP MMU PF handler

Commit Message

Comments

Patch