[1/2] KVM: x86/mmu: Split NX hugepage recovery flow into TDP and non-TDP flow

Message ID	20240812171341.1763297-2-vipinsh@google.com (mailing list archive)
State	New, archived
Headers	show Received: from mail-pg1-f202.google.com (mail-pg1-f202.google.com [209.85.215.202]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 97EC7187874 for <kvm@vger.kernel.org>; Mon, 12 Aug 2024 17:13:58 +0000 (UTC) Date: Mon, 12 Aug 2024 10:13:40 -0700 In-Reply-To: <20240812171341.1763297-1-vipinsh@google.com> Precedence: bulk Mime-Version: 1.0 References: <20240812171341.1763297-1-vipinsh@google.com> Message-ID: <20240812171341.1763297-2-vipinsh@google.com> Subject: [PATCH 1/2] KVM: x86/mmu: Split NX hugepage recovery flow into TDP and non-TDP flow From: Vipin Sharma <vipinsh@google.com> To: seanjc@google.com, pbonzini@redhat.com Cc: dmatlack@google.com, kvm@vger.kernel.org, linux-kernel@vger.kernel.org, Vipin Sharma <vipinsh@google.com> Content-Type: text/plain; charset="UTF-8"
Series	KVM: x86/mmu: Run NX huge page recovery under MMU read lock \| expand [0/2] KVM: x86/mmu: Run NX huge page recovery under MMU read lock [1/2] KVM: x86/mmu: Split NX hugepage recovery flow into TDP and non-TDP flow [2/2] KVM: x86/mmu: Recover NX Huge pages belonging to TDP MMU under MMU read lock

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 901be9e420a4..5534fcc9d1b5 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -909,7 +909,7 @@ void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) list_del_init(&sp->possible_nx_huge_page_link); } -static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) { sp->nx_huge_page_disallowed = false; @@ -7311,98 +7311,128 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel return err; } -static void kvm_recover_nx_huge_pages(struct kvm *kvm) +/* + * Get the first shadow mmu page of desired type from the NX huge pages list. + * Return NULL if list doesn't have the needed page within the first max pages. + */ +struct kvm_mmu_page *kvm_mmu_possible_nx_huge_page(struct kvm *kvm, bool tdp_mmu, + ulong max) { - unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits; - struct kvm_memory_slot *slot; - int rcu_idx; - struct kvm_mmu_page *sp; - unsigned int ratio; - LIST_HEAD(invalid_list); - bool flush = false; - ulong to_zap; + struct kvm_mmu_page *sp = NULL; + ulong i = 0; - rcu_idx = srcu_read_lock(&kvm->srcu); - write_lock(&kvm->mmu_lock); + /* + * We use a separate list instead of just using active_mmu_pages because + * the number of shadow pages that can be replaced with an NX huge page is + * expected to be relatively small compared to the total number of shadow + * pages. And because the TDP MMU doesn't use active_mmu_pages. 
+ */ + list_for_each_entry(sp, &kvm->arch.possible_nx_huge_pages, possible_nx_huge_page_link) { + if (i++ >= max) + break; + if (is_tdp_mmu_page(sp) == tdp_mmu) + return sp; + } + + return NULL; +} + +static struct kvm_mmu_page *shadow_mmu_nx_huge_page_to_zap(struct kvm *kvm, ulong max) +{ + return kvm_mmu_possible_nx_huge_page(kvm, /*tdp_mmu=*/false, max); +} + +bool kvm_mmu_sp_dirty_logging_enabled(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + struct kvm_memory_slot *slot = NULL; /* - * Zapping TDP MMU shadow pages, including the remote TLB flush, must - * be done under RCU protection, because the pages are freed via RCU - * callback. + * Since gfn_to_memslot() is relatively expensive, it helps to skip it if + * the test cannot possibly return true. On the other hand, if any + * memslot has logging enabled, chances are good that all of them do, in + * which case unaccount_nx_huge_page() is much cheaper than zapping the + * page. + * + * If a memslot update is in progress, reading an incorrect value of + * kvm->nr_memslots_dirty_logging is not a problem: if it is becoming + * zero, gfn_to_memslot() will be done unnecessarily; if it is becoming + * nonzero, the page will be zapped unnecessarily. Either way, this only + * affects efficiency in racy situations, and not correctness. */ - rcu_read_lock(); + if (atomic_read(&kvm->nr_memslots_dirty_logging)) { + struct kvm_memslots *slots; - ratio = READ_ONCE(nx_huge_pages_recovery_ratio); - to_zap = ratio ? 
DIV_ROUND_UP(nx_lpage_splits, ratio) : 0; - for ( ; to_zap; --to_zap) { - if (list_empty(&kvm->arch.possible_nx_huge_pages)) + slots = kvm_memslots_for_spte_role(kvm, sp->role); + slot = __gfn_to_memslot(slots, sp->gfn); + WARN_ON_ONCE(!slot); + } + + return slot && kvm_slot_dirty_track_enabled(slot); +} + +static void shadow_mmu_recover_nx_huge_pages(struct kvm *kvm, ulong to_zap) +{ + struct kvm_mmu_page *sp; + LIST_HEAD(invalid_list); + + lockdep_assert_held_write(&kvm->mmu_lock); + + while (to_zap) { + sp = shadow_mmu_nx_huge_page_to_zap(kvm, to_zap); + if (!sp) break; - /* - * We use a separate list instead of just using active_mmu_pages - * because the number of shadow pages that be replaced with an - * NX huge page is expected to be relatively small compared to - * the total number of shadow pages. And because the TDP MMU - * doesn't use active_mmu_pages. - */ - sp = list_first_entry(&kvm->arch.possible_nx_huge_pages, - struct kvm_mmu_page, - possible_nx_huge_page_link); WARN_ON_ONCE(!sp->nx_huge_page_disallowed); WARN_ON_ONCE(!sp->role.direct); /* - * Unaccount and do not attempt to recover any NX Huge Pages - * that are being dirty tracked, as they would just be faulted - * back in as 4KiB pages. The NX Huge Pages in this slot will be + * Unaccount and do not attempt to recover any NX Huge Pages that + * are being dirty tracked, as they would just be faulted back in + * as 4KiB pages. The NX Huge Pages in this slot will be * recovered, along with all the other huge pages in the slot, * when dirty logging is disabled. - * - * Since gfn_to_memslot() is relatively expensive, it helps to - * skip it if it the test cannot possibly return true. On the - * other hand, if any memslot has logging enabled, chances are - * good that all of them do, in which case unaccount_nx_huge_page() - * is much cheaper than zapping the page. 
- * - * If a memslot update is in progress, reading an incorrect value - * of kvm->nr_memslots_dirty_logging is not a problem: if it is - * becoming zero, gfn_to_memslot() will be done unnecessarily; if - * it is becoming nonzero, the page will be zapped unnecessarily. - * Either way, this only affects efficiency in racy situations, - * and not correctness. */ - slot = NULL; - if (atomic_read(&kvm->nr_memslots_dirty_logging)) { - struct kvm_memslots *slots; - - slots = kvm_memslots_for_spte_role(kvm, sp->role); - slot = __gfn_to_memslot(slots, sp->gfn); - WARN_ON_ONCE(!slot); - } - - if (slot && kvm_slot_dirty_track_enabled(slot)) + if (kvm_mmu_sp_dirty_logging_enabled(kvm, sp)) unaccount_nx_huge_page(kvm, sp); - else if (is_tdp_mmu_page(sp)) - flush |= kvm_tdp_mmu_zap_sp(kvm, sp); else kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + WARN_ON_ONCE(sp->nx_huge_page_disallowed); if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { - kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); - rcu_read_unlock(); - + kvm_mmu_commit_zap_page(kvm, &invalid_list); cond_resched_rwlock_write(&kvm->mmu_lock); - flush = false; - - rcu_read_lock(); } + to_zap--; } - kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); - rcu_read_unlock(); + kvm_mmu_commit_zap_page(kvm, &invalid_list); +} + +static void kvm_recover_nx_huge_pages(struct kvm *kvm) +{ + unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits; + unsigned int ratio; + ulong to_zap; + int rcu_idx; + + ratio = READ_ONCE(nx_huge_pages_recovery_ratio); + to_zap = ratio ? 
DIV_ROUND_UP(nx_lpage_splits, ratio) : 0; + + rcu_idx = srcu_read_lock(&kvm->srcu); + + if (to_zap && tdp_mmu_enabled) { + write_lock(&kvm->mmu_lock); + to_zap = kvm_tdp_mmu_recover_nx_huge_pages(kvm, to_zap); + write_unlock(&kvm->mmu_lock); + } + + if (to_zap && kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + shadow_mmu_recover_nx_huge_pages(kvm, to_zap); + write_unlock(&kvm->mmu_lock); + } - write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, rcu_idx); } diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 1721d97743e9..246b1bc0319b 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -354,4 +354,10 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); +struct kvm_mmu_page *kvm_mmu_possible_nx_huge_page(struct kvm *kvm, bool tdp_mmu, + ulong max); + +bool kvm_mmu_sp_dirty_logging_enabled(struct kvm *kvm, struct kvm_mmu_page *sp); +void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); + #endif /* __KVM_X86_MMU_INTERNAL_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index c7dc49ee7388..933bb8b11c9f 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1796,3 +1796,62 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn, */ return rcu_dereference(sptep); } + +static struct kvm_mmu_page *tdp_mmu_nx_huge_page_to_zap(struct kvm *kvm, ulong max) +{ + return kvm_mmu_possible_nx_huge_page(kvm, /*tdp_mmu=*/true, max); +} + +ulong kvm_tdp_mmu_recover_nx_huge_pages(struct kvm *kvm, ulong to_zap) +{ + struct kvm_mmu_page *sp; + bool flush = false; + + lockdep_assert_held_write(&kvm->mmu_lock); + /* + * Zapping TDP MMU shadow pages, including the remote TLB flush, must + * be done under RCU protection, because the pages are freed via RCU + 
* callback. + */ + rcu_read_lock(); + + while (to_zap) { + sp = tdp_mmu_nx_huge_page_to_zap(kvm, to_zap); + + if (!sp) + break; + + WARN_ON_ONCE(!sp->nx_huge_page_disallowed); + WARN_ON_ONCE(!sp->role.direct); + + /* + * Unaccount and do not attempt to recover any NX Huge Pages that + * are being dirty tracked, as they would just be faulted back in + * as 4KiB pages. The NX Huge Pages in this slot will be + * recovered, along with all the other huge pages in the slot, + * when dirty logging is disabled. + */ + if (kvm_mmu_sp_dirty_logging_enabled(kvm, sp)) + unaccount_nx_huge_page(kvm, sp); + else + flush |= kvm_tdp_mmu_zap_sp(kvm, sp); + + WARN_ON_ONCE(sp->nx_huge_page_disallowed); + + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { + if (flush) { + kvm_flush_remote_tlbs(kvm); + flush = false; + } + rcu_read_unlock(); + cond_resched_rwlock_write(&kvm->mmu_lock); + rcu_read_lock(); + } + to_zap--; + } + + if (flush) + kvm_flush_remote_tlbs(kvm); + rcu_read_unlock(); + return to_zap; +} diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 1b74e058a81c..7d68c2ddf78c 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -52,6 +52,8 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, gfn_t start, gfn_t end, int target_level, bool shared); +ulong kvm_tdp_mmu_recover_nx_huge_pages(struct kvm *kvm, ulong to_zap); + static inline void kvm_tdp_mmu_walk_lockless_begin(void) { rcu_read_lock();

[1/2] KVM: x86/mmu: Split NX hugepage recovery flow into TDP and non-TDP flow

Commit Message

Comments

Patch