[v2,4/6] KVM: x86/mmu: Recover TDP MMU huge page mappings in-place instead of zapping

Message ID	20240823235648.3236880-5-dmatlack@google.com (mailing list archive)
State	New, archived
Headers	show Received: from mail-yb1-f201.google.com (mail-yb1-f201.google.com [209.85.219.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 3F86A1CC8B2 for <kvm@vger.kernel.org>; Fri, 23 Aug 2024 23:56:59 +0000 (UTC) Date: Fri, 23 Aug 2024 16:56:46 -0700 In-Reply-To: <20240823235648.3236880-1-dmatlack@google.com> Precedence: bulk Mime-Version: 1.0 References: <20240823235648.3236880-1-dmatlack@google.com> Message-ID: <20240823235648.3236880-5-dmatlack@google.com> Subject: [PATCH v2 4/6] KVM: x86/mmu: Recover TDP MMU huge page mappings in-place instead of zapping From: David Matlack <dmatlack@google.com> To: Paolo Bonzini <pbonzini@redhat.com>, Sean Christopherson <seanjc@google.com> Cc: kvm@vger.kernel.org, David Matlack <dmatlack@google.com> Content-Type: text/plain; charset="UTF-8"
Series	KVM: x86/mmu: Optimize TDP MMU huge page recovery during disable-dirty-log \| expand [v2,0/6] KVM: x86/mmu: Optimize TDP MMU huge page recovery during disable-dirty-log [v2,1/6] KVM: x86/mmu: Drop @max_level from kvm_mmu_max_mapping_level() [v2,2/6] KVM: x86/mmu: Batch TLB flushes when zapping collapsible TDP MMU SPTEs [v2,3/6] KVM: x86/mmu: Refactor TDP MMU iter need resched check [v2,4/6] KVM: x86/mmu: Recover TDP MMU huge page mappings in-place instead of zapping [v2,5/6] KVM: x86/mmu: Rename make_huge_page_split_spte() to make_small_spte() [v2,6/6] KVM: x86/mmu: WARN if huge page recovery triggered during dirty logging

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1811a42fa093..8f9bd7c0e139 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1953,8 +1953,8 @@ void kvm_mmu_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *memslot, u64 start, u64 end, int target_level); -void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *memslot); +void kvm_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2e92d9e9b311..2f8b1ebcbe9c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6904,8 +6904,8 @@ static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, kvm_flush_remote_tlbs_memslot(kvm, slot); } -void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot) +void kvm_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot) { if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); @@ -6915,7 +6915,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); - kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot); + kvm_tdp_mmu_recover_huge_pages(kvm, slot); read_unlock(&kvm->mmu_lock); } } diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 8f7eb3ad88fc..a12437bf6e0c 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -268,15 +268,14 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return wrprot; } -static u64 make_spte_executable(u64 spte) +static u64 modify_spte_protections(u64 spte, u64 set, u64 clear) { bool is_access_track = is_access_track_spte(spte); if (is_access_track) spte = restore_acc_track_spte(spte); - spte &= ~shadow_nx_mask; - spte |= shadow_x_mask; + spte = (spte | set) & ~clear; if (is_access_track) spte = mark_spte_for_access_track(spte); @@ -284,6 +283,16 @@ static u64 make_spte_executable(u64 spte) return spte; } +static u64 make_spte_executable(u64 spte) +{ + return modify_spte_protections(spte, shadow_x_mask, shadow_nx_mask); +} + +static u64 make_spte_nonexecutable(u64 spte) +{ + return modify_spte_protections(spte, shadow_nx_mask, shadow_x_mask); +} + /* * Construct an SPTE that maps a sub-page of the given huge page SPTE where * `index` identifies which sub-page. @@ -320,6 +329,30 @@ u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, return child_spte; } +u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level) +{ + u64 huge_spte; + + if (KVM_BUG_ON(!is_shadow_present_pte(small_spte), kvm)) + return SHADOW_NONPRESENT_VALUE; + + if (KVM_BUG_ON(level == PG_LEVEL_4K, kvm)) + return SHADOW_NONPRESENT_VALUE; + + huge_spte = small_spte | PT_PAGE_SIZE_MASK; + + /* + * huge_spte already has the address of the sub-page being collapsed + * from small_spte, so just clear the lower address bits to create the + * huge page address. + */ + huge_spte &= KVM_HPAGE_MASK(level) | ~PAGE_MASK; + + if (is_nx_huge_page_enabled(kvm)) + huge_spte = make_spte_nonexecutable(huge_spte); + + return huge_spte; +} u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) { diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 2cb816ea2430..990d599eb827 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -503,6 +503,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, bool host_writable, u64 *new_spte); u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, union kvm_mmu_page_role role, int index); +u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); u64 mark_spte_for_access_track(u64 spte); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 9b8299ee4abb..be70f0f22550 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1581,15 +1581,43 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); } -static void zap_collapsible_spte_range(struct kvm *kvm, - struct kvm_mmu_page *root, - const struct kvm_memory_slot *slot) +static int tdp_mmu_make_huge_spte(struct kvm *kvm, + struct tdp_iter *parent, + u64 *huge_spte) +{ + struct kvm_mmu_page *root = spte_to_child_sp(parent->old_spte); + gfn_t start = parent->gfn; + gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level); + struct tdp_iter iter; + + tdp_root_for_each_leaf_pte(iter, root, start, end) { + /* + * Use the parent iterator when checking for forward progress so + * that KVM doesn't get stuck continuously trying to yield (i.e. + * returning -EAGAIN here and then failing the forward progress + * check in the caller ad nauseam). + */ + if (tdp_mmu_iter_need_resched(kvm, parent)) + return -EAGAIN; + + *huge_spte = make_huge_spte(kvm, iter.old_spte, parent->level); + return 0; + } + + return -ENOENT; +} + +static void recover_huge_pages_range(struct kvm *kvm, + struct kvm_mmu_page *root, + const struct kvm_memory_slot *slot) { gfn_t start = slot->base_gfn; gfn_t end = start + slot->npages; struct tdp_iter iter; int max_mapping_level; bool flush = false; + u64 huge_spte; + int r; rcu_read_lock(); @@ -1626,7 +1654,13 @@ static void zap_collapsible_spte_range(struct kvm *kvm, if (max_mapping_level < iter.level) continue; - if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE)) + r = tdp_mmu_make_huge_spte(kvm, &iter, &huge_spte); + if (r == -EAGAIN) + goto retry; + else if (r) + continue; + + if (tdp_mmu_set_spte_atomic(kvm, &iter, huge_spte)) goto retry; flush = true; @@ -1639,17 +1673,17 @@ static void zap_collapsible_spte_range(struct kvm *kvm, } /* - * Zap non-leaf SPTEs (and free their associated page tables) which could - * be replaced by huge pages, for GFNs within the slot. + * Recover huge page mappings within the slot by replacing non-leaf SPTEs with + * huge SPTEs where possible. */ -void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot) +void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; lockdep_assert_held_read(&kvm->mmu_lock); for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) - zap_collapsible_spte_range(kvm, root, slot); + recover_huge_pages_range(kvm, root, slot); } /* diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 1b74e058a81c..ddea2827d1ad 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -40,8 +40,8 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, unsigned long mask, bool wrprot); -void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot); +void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot); bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 966fb301d44b..3d09c12847d5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13053,19 +13053,15 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, if (!log_dirty_pages) { /* - * Dirty logging tracks sptes in 4k granularity, meaning that - * large sptes have to be split. If live migration succeeds, - * the guest in the source machine will be destroyed and large - * sptes will be created in the destination. However, if the - * guest continues to run in the source machine (for example if - * live migration fails), small sptes will remain around and - * cause bad performance. + * Recover huge page mappings in the slot now that dirty logging + * is disabled, i.e. now that KVM does not have to track guest + * writes at 4KiB granularity. * - * Scan sptes if dirty logging has been stopped, dropping those - * which can be collapsed into a single large-page spte. Later - * page faults will create the large-page sptes. + * Dirty logging might be disabled by userspace if an ongoing VM + * live migration is cancelled and the VM must continue running + * on the source. */ - kvm_mmu_zap_collapsible_sptes(kvm, new); + kvm_mmu_recover_huge_pages(kvm, new); } else { /* * Initially-all-set does not require write protecting any page,

[v2,4/6] KVM: x86/mmu: Recover TDP MMU huge page mappings in-place instead of zapping

Commit Message

Comments

Patch