[v2,2/4] KVM: x86/mmu: Extract out TDP MMU NX huge page recovery code

Message ID	20240829191135.2041489-3-vipinsh@google.com (mailing list archive)
State	New, archived
Headers	show Received: from mail-pg1-f201.google.com (mail-pg1-f201.google.com [209.85.215.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7C7661B9B28 for <kvm@vger.kernel.org>; Thu, 29 Aug 2024 19:11:44 +0000 (UTC) Date: Thu, 29 Aug 2024 12:11:33 -0700 In-Reply-To: <20240829191135.2041489-1-vipinsh@google.com> Precedence: bulk Mime-Version: 1.0 References: <20240829191135.2041489-1-vipinsh@google.com> Message-ID: <20240829191135.2041489-3-vipinsh@google.com> Subject: [PATCH v2 2/4] KVM: x86/mmu: Extract out TDP MMU NX huge page recovery code From: Vipin Sharma <vipinsh@google.com> To: seanjc@google.com, pbonzini@redhat.com, dmatlack@google.com Cc: kvm@vger.kernel.org, linux-kernel@vger.kernel.org, Vipin Sharma <vipinsh@google.com> Content-Type: text/plain; charset="UTF-8"
Series	KVM: x86/mmu: Run NX huge page recovery under MMU read lock \| expand [v2,0/4] KVM: x86/mmu: Run NX huge page recovery under MMU read lock [v2,1/4] KVM: x86/mmu: Track TDP MMU NX huge pages separately [v2,2/4] KVM: x86/mmu: Extract out TDP MMU NX huge page recovery code [v2,3/4] KVM: x86/mmu: Rearrange locks and to_zap count for NX huge page recovery [v2,4/4] KVM: x86/mmu: Recover TDP MMU NX huge pages using MMU read lock

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0bda372b13a5..c8c64df979e3 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -925,7 +925,7 @@ void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) list_del_init(&sp->possible_nx_huge_page_link); } -static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) { sp->nx_huge_page_disallowed = false; @@ -7327,26 +7327,44 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel return err; } -static void kvm_recover_nx_huge_pages(struct kvm *kvm, - struct list_head *nx_huge_pages, - unsigned long to_zap) +bool kvm_mmu_sp_dirty_logging_enabled(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + struct kvm_memory_slot *slot = NULL; + + /* + * Since gfn_to_memslot() is relatively expensive, it helps to skip it if + * the test cannot possibly return true. On the other hand, if any + * memslot has logging enabled, chances are good that all of them do, in + * which case unaccount_nx_huge_page() is much cheaper than zapping the + * page. + * + * If a memslot update is in progress, reading an incorrect value of + * kvm->nr_memslots_dirty_logging is not a problem: if it is becoming + * zero, gfn_to_memslot() will be done unnecessarily; if it is becoming + * nonzero, the page will be zapped unnecessarily. Either way, this only + * affects efficiency in racy situations, and not correctness. 
+ */ + if (atomic_read(&kvm->nr_memslots_dirty_logging)) { + struct kvm_memslots *slots; + + slots = kvm_memslots_for_spte_role(kvm, sp->role); + slot = __gfn_to_memslot(slots, sp->gfn); + WARN_ON_ONCE(!slot); + } + return slot && kvm_slot_dirty_track_enabled(slot); +} + +static void kvm_mmu_recover_nx_huge_pages(struct kvm *kvm, + struct list_head *nx_huge_pages, + unsigned long to_zap) { - struct kvm_memory_slot *slot; int rcu_idx; struct kvm_mmu_page *sp; LIST_HEAD(invalid_list); - bool flush = false; rcu_idx = srcu_read_lock(&kvm->srcu); write_lock(&kvm->mmu_lock); - /* - * Zapping TDP MMU shadow pages, including the remote TLB flush, must - * be done under RCU protection, because the pages are freed via RCU - * callback. - */ - rcu_read_lock(); - for ( ; to_zap; --to_zap) { if (list_empty(nx_huge_pages)) break; @@ -7370,50 +7388,19 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm, * back in as 4KiB pages. The NX Huge Pages in this slot will be * recovered, along with all the other huge pages in the slot, * when dirty logging is disabled. - * - * Since gfn_to_memslot() is relatively expensive, it helps to - * skip it if it the test cannot possibly return true. On the - * other hand, if any memslot has logging enabled, chances are - * good that all of them do, in which case unaccount_nx_huge_page() - * is much cheaper than zapping the page. - * - * If a memslot update is in progress, reading an incorrect value - * of kvm->nr_memslots_dirty_logging is not a problem: if it is - * becoming zero, gfn_to_memslot() will be done unnecessarily; if - * it is becoming nonzero, the page will be zapped unnecessarily. - * Either way, this only affects efficiency in racy situations, - * and not correctness. 
*/ - slot = NULL; - if (atomic_read(&kvm->nr_memslots_dirty_logging)) { - struct kvm_memslots *slots; - - slots = kvm_memslots_for_spte_role(kvm, sp->role); - slot = __gfn_to_memslot(slots, sp->gfn); - WARN_ON_ONCE(!slot); - } - - if (slot && kvm_slot_dirty_track_enabled(slot)) + if (kvm_mmu_sp_dirty_logging_enabled(kvm, sp)) unaccount_nx_huge_page(kvm, sp); - else if (is_tdp_mmu_page(sp)) - flush |= kvm_tdp_mmu_zap_sp(kvm, sp); else kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); WARN_ON_ONCE(sp->nx_huge_page_disallowed); if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { - kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); - rcu_read_unlock(); - + kvm_mmu_commit_zap_page(kvm, &invalid_list); cond_resched_rwlock_write(&kvm->mmu_lock); - flush = false; - - rcu_read_lock(); } } - kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); - - rcu_read_unlock(); + kvm_mmu_commit_zap_page(kvm, &invalid_list); write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, rcu_idx); @@ -7461,16 +7448,16 @@ static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data) return 0; to_zap = nx_huge_pages_to_zap(kvm); - kvm_recover_nx_huge_pages(kvm, - &kvm->arch.possible_nx_huge_pages, - to_zap); + kvm_mmu_recover_nx_huge_pages(kvm, + &kvm->arch.possible_nx_huge_pages, + to_zap); if (tdp_mmu_enabled) { #ifdef CONFIG_X86_64 to_zap = kvm_tdp_mmu_nx_huge_pages_to_zap(kvm); - kvm_recover_nx_huge_pages(kvm, - &kvm->arch.tdp_mmu_possible_nx_huge_pages, - to_zap); + kvm_tdp_mmu_recover_nx_huge_pages(kvm, + &kvm->arch.tdp_mmu_possible_nx_huge_pages, + to_zap); #endif } } diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 8deed808592b..83b165077d97 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -353,6 +353,8 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); void untrack_possible_nx_huge_page(struct 
kvm *kvm, struct kvm_mmu_page *sp); +void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); +bool kvm_mmu_sp_dirty_logging_enabled(struct kvm *kvm, struct kvm_mmu_page *sp); extern unsigned int nx_huge_pages_recovery_ratio; #endif /* __KVM_X86_MMU_INTERNAL_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 6415c2c7e936..f0b4341264fd 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1805,3 +1805,71 @@ unsigned long kvm_tdp_mmu_nx_huge_pages_to_zap(struct kvm *kvm) return ratio ? DIV_ROUND_UP(pages, ratio) : 0; } + +void kvm_tdp_mmu_recover_nx_huge_pages(struct kvm *kvm, + struct list_head *nx_huge_pages, + unsigned long to_zap) +{ + int rcu_idx; + struct kvm_mmu_page *sp; + bool flush = false; + + rcu_idx = srcu_read_lock(&kvm->srcu); + write_lock(&kvm->mmu_lock); + + /* + * Zapping TDP MMU shadow pages, including the remote TLB flush, must + * be done under RCU protection, because the pages are freed via RCU + * callback. + */ + rcu_read_lock(); + + for ( ; to_zap; --to_zap) { + if (list_empty(nx_huge_pages)) + break; + + /* + * We use a separate list instead of just using active_mmu_pages + * because the number of shadow pages that can be replaced with an + * NX huge page is expected to be relatively small compared to + * the total number of shadow pages. And because the TDP MMU + * doesn't use active_mmu_pages. + */ + sp = list_first_entry(nx_huge_pages, + struct kvm_mmu_page, + possible_nx_huge_page_link); + WARN_ON_ONCE(!sp->nx_huge_page_disallowed); + WARN_ON_ONCE(!sp->role.direct); + + /* + * Unaccount and do not attempt to recover any NX Huge Pages + * that are being dirty tracked, as they would just be faulted + * back in as 4KiB pages. The NX Huge Pages in this slot will be + * recovered, along with all the other huge pages in the slot, + * when dirty logging is disabled. 
+ */ + if (kvm_mmu_sp_dirty_logging_enabled(kvm, sp)) + unaccount_nx_huge_page(kvm, sp); + else + flush |= kvm_tdp_mmu_zap_sp(kvm, sp); + WARN_ON_ONCE(sp->nx_huge_page_disallowed); + + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { + if (flush) + kvm_flush_remote_tlbs(kvm); + rcu_read_unlock(); + + cond_resched_rwlock_write(&kvm->mmu_lock); + flush = false; + + rcu_read_lock(); + } + } + + if (flush) + kvm_flush_remote_tlbs(kvm); + rcu_read_unlock(); + + write_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, rcu_idx); +} diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 95290fd6154e..4036552f40cd 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -68,6 +68,9 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn, u64 *spte); unsigned long kvm_tdp_mmu_nx_huge_pages_to_zap(struct kvm *kvm); +void kvm_tdp_mmu_recover_nx_huge_pages(struct kvm *kvm, + struct list_head *nx_huge_pages, + unsigned long to_zap); #ifdef CONFIG_X86_64 static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; }

[v2,2/4] KVM: x86/mmu: Extract out TDP MMU NX huge page recovery code

Commit Message

Patch