[4/7] KVM: x86/mmu: Recover TDP MMU huge page mappings in-place instead of zapping

Message ID	20240805233114.4060019-5-dmatlack@google.com (mailing list archive)
State	New, archived
Headers	show Received: from mail-yb1-f202.google.com (mail-yb1-f202.google.com [209.85.219.202]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 5E90B16F84C for <kvm@vger.kernel.org>; Mon, 5 Aug 2024 23:31:25 +0000 (UTC) Date: Mon, 5 Aug 2024 16:31:11 -0700 In-Reply-To: <20240805233114.4060019-1-dmatlack@google.com> Precedence: bulk Mime-Version: 1.0 References: <20240805233114.4060019-1-dmatlack@google.com> Message-ID: <20240805233114.4060019-5-dmatlack@google.com> Subject: [PATCH 4/7] KVM: x86/mmu: Recover TDP MMU huge page mappings in-place instead of zapping From: David Matlack <dmatlack@google.com> To: Paolo Bonzini <pbonzini@redhat.com>, Sean Christopherson <seanjc@google.com> Cc: kvm@vger.kernel.org, David Matlack <dmatlack@google.com> Content-Type: text/plain; charset="UTF-8"
Series	KVM: x86/mmu: Optimize TDP MMU huge page recovery during disable-dirty-log \| expand [0/7] KVM: x86/mmu: Optimize TDP MMU huge page recovery during disable-dirty-log [1/7] Revert "KVM: x86/mmu: Don't bottom out on leafs when zapping collapsible SPTEs" [2/7] KVM: x86/mmu: Drop @max_level from kvm_mmu_max_mapping_level() [3/7] KVM: x86/mmu: Batch TLB flushes when zapping collapsible TDP MMU SPTEs [4/7] KVM: x86/mmu: Recover TDP MMU huge page mappings in-place instead of zapping [5/7] KVM: x86/mmu: Rename make_huge_page_split_spte() to make_small_spte() [6/7] KVM: x86/mmu: WARN if huge page recovery triggered during dirty logging [7/7] KVM: x86/mmu: Recheck SPTE points to a PT during huge page recovery

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 950a03e0181e..ed3b724db4d7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1952,8 +1952,8 @@ void kvm_mmu_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *memslot, u64 start, u64 end, int target_level); -void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *memslot); +void kvm_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 1b4e14ac512b..34e59210d94e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6917,8 +6917,8 @@ static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, kvm_flush_remote_tlbs_memslot(kvm, slot); } -void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot) +void kvm_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot) { if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); @@ -6928,7 +6928,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); - kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot); + kvm_tdp_mmu_recover_huge_pages(kvm, slot); read_unlock(&kvm->mmu_lock); } } diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index d4527965e48c..979387d4ebfa 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -268,15 +268,14 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return wrprot; } -static u64 make_spte_executable(u64 spte) +static u64 modify_spte_protections(u64 spte, u64 set, u64 clear) { bool is_access_track = is_access_track_spte(spte); if (is_access_track) spte = restore_acc_track_spte(spte); - spte &= ~shadow_nx_mask; - spte |= shadow_x_mask; + spte = (spte | set) & ~clear; if (is_access_track) spte = mark_spte_for_access_track(spte); @@ -284,6 +283,16 @@ static u64 make_spte_executable(u64 spte) return spte; } +static u64 make_spte_executable(u64 spte) +{ + return modify_spte_protections(spte, shadow_x_mask, shadow_nx_mask); +} + +static u64 make_spte_nonexecutable(u64 spte) +{ + return modify_spte_protections(spte, shadow_nx_mask, shadow_x_mask); +} + /* * Construct an SPTE that maps a sub-page of the given huge page SPTE where * `index` identifies which sub-page. @@ -320,6 +329,27 @@ u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, return child_spte; } +u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level) +{ + u64 huge_spte; + + KVM_BUG_ON(!is_shadow_present_pte(small_spte), kvm); + KVM_BUG_ON(level == PG_LEVEL_4K, kvm); + + huge_spte = small_spte | PT_PAGE_SIZE_MASK; + + /* + * huge_spte already has the address of the sub-page being collapsed + * from small_spte, so just clear the lower address bits to create the + * huge page address. + */ + huge_spte &= KVM_HPAGE_MASK(level) | ~PAGE_MASK; + + if (is_nx_huge_page_enabled(kvm)) + huge_spte = make_spte_nonexecutable(huge_spte); + + return huge_spte; +} u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) { diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index ef793c459b05..498c30b6ba71 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -503,6 +503,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, bool host_writable, u64 *new_spte); u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, union kvm_mmu_page_role role, int index); +u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); u64 mark_spte_for_access_track(u64 spte); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index fad2912d3d4c..3f2d7343194e 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1575,15 +1575,16 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); } -static void zap_collapsible_spte_range(struct kvm *kvm, - struct kvm_mmu_page *root, - const struct kvm_memory_slot *slot) +static void recover_huge_pages_range(struct kvm *kvm, + struct kvm_mmu_page *root, + const struct kvm_memory_slot *slot) { gfn_t start = slot->base_gfn; gfn_t end = start + slot->npages; struct tdp_iter iter; int max_mapping_level; bool flush = false; + u64 huge_spte; rcu_read_lock(); @@ -1608,18 +1609,19 @@ static void zap_collapsible_spte_range(struct kvm *kvm, continue; /* - * The page can be remapped at a higher level, so step - * up to zap the parent SPTE. + * Construct the huge SPTE based on the small SPTE and then step + * back up to install it. */ + huge_spte = make_huge_spte(kvm, iter.old_spte, max_mapping_level); while (max_mapping_level > iter.level) tdp_iter_step_up(&iter); - if (!tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE)) + if (!tdp_mmu_set_spte_atomic(kvm, &iter, huge_spte)) flush = true; /* - * If the atomic zap fails, the iter will recurse back into - * the same subtree to retry. + * If the cmpxchg fails, the iter will recurse back into the + * same subtree to retry. */ } @@ -1630,17 +1632,21 @@ static void zap_collapsible_spte_range(struct kvm *kvm, } /* - * Zap non-leaf SPTEs (and free their associated page tables) which could be - * replaced by huge pages, for GFNs within the slot. + * Recover huge page mappings within the slot by replacing non-leaf SPTEs with + * huge SPTEs where possible. + * + * Note that all huge page mappings are recovered, including NX huge pages that + * were split by guest instruction fetches and huge pages that were split for + * dirty tracking. */ -void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot) +void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; lockdep_assert_held_read(&kvm->mmu_lock); for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) - zap_collapsible_spte_range(kvm, root, slot); + recover_huge_pages_range(kvm, root, slot); } /* diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 1b74e058a81c..ddea2827d1ad 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -40,8 +40,8 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, unsigned long mask, bool wrprot); -void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot); +void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot); bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index af6c8cf6a37a..b83bebe53840 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13056,19 +13056,15 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, if (!log_dirty_pages) { /* - * Dirty logging tracks sptes in 4k granularity, meaning that - * large sptes have to be split. If live migration succeeds, - * the guest in the source machine will be destroyed and large - * sptes will be created in the destination. However, if the - * guest continues to run in the source machine (for example if - * live migration fails), small sptes will remain around and - * cause bad performance. + * Recover huge page mappings in the slot now that dirty logging + * is disabled, i.e. now that KVM does not have to track guest + * writes at 4KiB granularity. * - * Scan sptes if dirty logging has been stopped, dropping those - * which can be collapsed into a single large-page spte. Later - * page faults will create the large-page sptes. + * Dirty logging might be disabled by userspace if an ongoing VM + * live migration is cancelled and the VM must continue running + * on the source. */ - kvm_mmu_zap_collapsible_sptes(kvm, new); + kvm_mmu_recover_huge_pages(kvm, new); } else { /* * Initially-all-set does not require write protecting any page,

[4/7] KVM: x86/mmu: Recover TDP MMU huge page mappings in-place instead of zapping

Commit Message

Patch