[RFC,12/15] KVM: x86/mmu: Split large pages when dirty logging is enabled

Message ID 20211119235759.1304274-13-dmatlack@google.com (mailing list archive)
State New, archived
Series KVM: x86/mmu: Eager Page Splitting for the TDP MMU

Commit Message

David Matlack Nov. 19, 2021, 11:57 p.m. UTC
When dirty logging is enabled without initially-all-set, attempt to
split all large pages in the memslot down to 4KB pages so that vCPUs
do not have to take expensive write-protection faults to split large
pages.

Large page splitting is best-effort only. This commit only adds support
for the TDP MMU, and even there splitting may fail due to out-of-memory
conditions. Failure to split a large page is fine from a correctness
standpoint because we still always follow it up by write-protecting any
remaining large pages.

Signed-off-by: David Matlack <dmatlack@google.com>
---
 arch/x86/include/asm/kvm_host.h |   6 ++
 arch/x86/kvm/mmu/mmu.c          |  83 +++++++++++++++++++++
 arch/x86/kvm/mmu/mmu_internal.h |   3 +
 arch/x86/kvm/mmu/spte.c         |  46 ++++++++++++
 arch/x86/kvm/mmu/spte.h         |   1 +
 arch/x86/kvm/mmu/tdp_mmu.c      | 123 ++++++++++++++++++++++++++++++++
 arch/x86/kvm/mmu/tdp_mmu.h      |   5 ++
 arch/x86/kvm/x86.c              |   6 ++
 8 files changed, 273 insertions(+)

Comments

Nikunj A. Dadhania Nov. 22, 2021, 5:05 a.m. UTC | #1
On 11/20/2021 5:27 AM, David Matlack wrote:
> When dirty logging is enabled without initially-all-set, attempt to
> split all large pages in the memslot down to 4KB pages so that vCPUs
> do not have to take expensive write-protection faults to split large
> pages.
> 
> Large page splitting is best-effort only. This commit only adds the
> support for the TDP MMU, and even there splitting may fail due to out
> of memory conditions. Failures to split a large page is fine from a
> correctness standpoint because we still always follow it up by write-
> protecting any remaining large pages.
> 
> Signed-off-by: David Matlack <dmatlack@google.com>

> +int mmu_topup_split_caches(struct kvm *kvm)
> +{
> +	struct kvm_mmu_memory_caches *split_caches = &kvm->arch.split_caches;
> +	int r;
> +
> +	assert_split_caches_invariants(kvm);
> +
> +	r = kvm_mmu_topup_memory_cache(&split_caches->page_header_cache, 1);
> +	if (r)
> +		goto out;
> +
> +	r = kvm_mmu_topup_memory_cache(&split_caches->shadow_page_cache, 1);
> +	if (r)
> +		goto out;
> +
> +	return 0;
> +
> +out:
> +	pr_warn("Failed to top-up split caches. Will not split large pages.\n");
> +	return r;
> +}
> +
> +static void mmu_free_split_caches(struct kvm *kvm)
> +{
> +	assert_split_caches_invariants(kvm);
> +
> +	kvm_mmu_free_memory_cache(&kvm->arch.split_caches.pte_list_desc_cache);
                                                              ^^^^^^^^^^^^^^
I believe this should be page_header_cache.

> +	kvm_mmu_free_memory_cache(&kvm->arch.split_caches.shadow_page_cache);
> +}

Regards
Nikunj
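
With Nikunj's fix, which David confirms in #4 below, the free path would
presumably read as follows (a sketch, not the posted code):

static void mmu_free_split_caches(struct kvm *kvm)
{
	assert_split_caches_invariants(kvm);

	/* Free the two caches the TDP MMU split path actually tops up. */
	kvm_mmu_free_memory_cache(&kvm->arch.split_caches.page_header_cache);
	kvm_mmu_free_memory_cache(&kvm->arch.split_caches.shadow_page_cache);
}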
Ben Gardon Nov. 22, 2021, 7:30 p.m. UTC | #2
On Fri, Nov 19, 2021 at 3:58 PM David Matlack <dmatlack@google.com> wrote:
>
> When dirty logging is enabled without initially-all-set, attempt to
> split all large pages in the memslot down to 4KB pages so that vCPUs
> do not have to take expensive write-protection faults to split large
> pages.
>
> Large page splitting is best-effort only. This commit only adds the
> support for the TDP MMU, and even there splitting may fail due to out
> of memory conditions. Failures to split a large page is fine from a
> correctness standpoint because we still always follow it up by write-
> protecting any remaining large pages.
>
> Signed-off-by: David Matlack <dmatlack@google.com>
> ---
>  arch/x86/include/asm/kvm_host.h |   6 ++
>  arch/x86/kvm/mmu/mmu.c          |  83 +++++++++++++++++++++
>  arch/x86/kvm/mmu/mmu_internal.h |   3 +
>  arch/x86/kvm/mmu/spte.c         |  46 ++++++++++++
>  arch/x86/kvm/mmu/spte.h         |   1 +
>  arch/x86/kvm/mmu/tdp_mmu.c      | 123 ++++++++++++++++++++++++++++++++
>  arch/x86/kvm/mmu/tdp_mmu.h      |   5 ++
>  arch/x86/kvm/x86.c              |   6 ++
>  8 files changed, 273 insertions(+)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 2a7564703ea6..432a4df817ec 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1232,6 +1232,9 @@ struct kvm_arch {
>         hpa_t   hv_root_tdp;
>         spinlock_t hv_root_tdp_lock;
>  #endif
> +
> +       /* MMU caches used when splitting large pages during VM-ioctls. */
> +       struct kvm_mmu_memory_caches split_caches;
>  };
>
>  struct kvm_vm_stat {
> @@ -1588,6 +1591,9 @@ void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
>  void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
>                                       const struct kvm_memory_slot *memslot,
>                                       int start_level);
> +void kvm_mmu_slot_try_split_large_pages(struct kvm *kvm,
> +                                       const struct kvm_memory_slot *memslot,
> +                                       int target_level);
>  void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
>                                    const struct kvm_memory_slot *memslot);
>  void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 54f0d2228135..6768ef9c0891 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -738,6 +738,66 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
>                                           PT64_ROOT_MAX_LEVEL);
>  }
>
> +static inline void assert_split_caches_invariants(struct kvm *kvm)
> +{
> +       /*
> +        * The split caches must only be modified while holding the slots_lock,
> +        * since it is only used during memslot VM-ioctls.
> +        */
> +       lockdep_assert_held(&kvm->slots_lock);
> +
> +       /*
> +        * Only the TDP MMU supports large page splitting using
> +        * kvm->arch.split_caches, which is why we only have to allocate
> +        * page_header_cache and shadow_page_cache. Assert that the TDP
> +        * MMU is at least enabled when the split cache is allocated.
> +        */
> +       BUG_ON(!is_tdp_mmu_enabled(kvm));
> +}
> +
> +int mmu_topup_split_caches(struct kvm *kvm)
> +{
> +       struct kvm_mmu_memory_caches *split_caches = &kvm->arch.split_caches;
> +       int r;
> +
> +       assert_split_caches_invariants(kvm);
> +
> +       r = kvm_mmu_topup_memory_cache(&split_caches->page_header_cache, 1);
> +       if (r)
> +               goto out;
> +
> +       r = kvm_mmu_topup_memory_cache(&split_caches->shadow_page_cache, 1);
> +       if (r)
> +               goto out;
> +
> +       return 0;
> +
> +out:
> +       pr_warn("Failed to top-up split caches. Will not split large pages.\n");
> +       return r;
> +}
> +
> +static void mmu_free_split_caches(struct kvm *kvm)
> +{
> +       assert_split_caches_invariants(kvm);
> +
> +       kvm_mmu_free_memory_cache(&kvm->arch.split_caches.pte_list_desc_cache);
> +       kvm_mmu_free_memory_cache(&kvm->arch.split_caches.shadow_page_cache);
> +}
> +
> +bool mmu_split_caches_need_topup(struct kvm *kvm)
> +{
> +       assert_split_caches_invariants(kvm);
> +
> +       if (kvm->arch.split_caches.page_header_cache.nobjs == 0)
> +               return true;
> +
> +       if (kvm->arch.split_caches.shadow_page_cache.nobjs == 0)
> +               return true;
> +
> +       return false;
> +}
> +
>  static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
>  {
>         struct kvm_mmu_memory_caches *mmu_caches;
> @@ -5696,6 +5756,7 @@ void kvm_mmu_init_vm(struct kvm *kvm)
>
>         spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
>
> +       mmu_init_memory_caches(&kvm->arch.split_caches);
>         kvm_mmu_init_tdp_mmu(kvm);
>
>         node->track_write = kvm_mmu_pte_write;
> @@ -5819,6 +5880,28 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
>                 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
>  }
>
> +void kvm_mmu_slot_try_split_large_pages(struct kvm *kvm,
> +                                       const struct kvm_memory_slot *memslot,
> +                                       int target_level)
> +{
> +       u64 start, end;
> +
> +       if (!is_tdp_mmu_enabled(kvm))
> +               return;
> +
> +       if (mmu_topup_split_caches(kvm))
> +               return;
> +
> +       start = memslot->base_gfn;
> +       end = start + memslot->npages;
> +
> +       read_lock(&kvm->mmu_lock);
> +       kvm_tdp_mmu_try_split_large_pages(kvm, memslot, start, end, target_level);
> +       read_unlock(&kvm->mmu_lock);
> +
> +       mmu_free_split_caches(kvm);
> +}
> +
>  static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
>                                          struct kvm_rmap_head *rmap_head,
>                                          const struct kvm_memory_slot *slot)
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index 52c6527b1a06..89b9b907c567 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -161,4 +161,7 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
>  void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
>  void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
>
> +int mmu_topup_split_caches(struct kvm *kvm);
> +bool mmu_split_caches_need_topup(struct kvm *kvm);
> +
>  #endif /* __KVM_X86_MMU_INTERNAL_H */
> diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
> index df2cdb8bcf77..6bb9b597a854 100644
> --- a/arch/x86/kvm/mmu/spte.c
> +++ b/arch/x86/kvm/mmu/spte.c
> @@ -191,6 +191,52 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
>         return wrprot;
>  }
>
> +static u64 mark_spte_executable(u64 spte)
> +{
> +       bool is_access_track = is_access_track_spte(spte);
> +
> +       if (is_access_track)
> +               spte = restore_acc_track_spte(spte);
> +
> +       spte &= ~shadow_nx_mask;
> +       spte |= shadow_x_mask;
> +
> +       if (is_access_track)
> +               spte = mark_spte_for_access_track(spte);
> +
> +       return spte;
> +}
> +
> +/*
> + * Construct an SPTE that maps a sub-page of the given large SPTE. This is
> + * used during large page splitting, to build the SPTEs that make up the new
> + * page table.
> + */
> +u64 make_large_page_split_spte(u64 large_spte, int level, int index, unsigned int access)

Just because this always trips me up reading code, I'd suggest naming
the argument large_spte_level or something.
Avoiding a variable called "level" in this function makes it much more explicit.

> +{
> +       u64 child_spte;
> +       int child_level;
> +
> +       BUG_ON(is_mmio_spte(large_spte));
> +       BUG_ON(!is_large_present_pte(large_spte));

In the interest of not crashing the host, I think it would be safe to
WARN and return 0 here.
BUG is fine too if that's preferred.
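
A minimal sketch of that alternative (not from the patch), reusing the same
helpers the patch already relies on:

	/* Sketch: warn instead of crashing the host on a bogus input SPTE. */
	if (WARN_ON_ONCE(is_mmio_spte(large_spte) ||
			 !is_large_present_pte(large_spte)))
		return 0;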

> +
> +       child_spte = large_spte;
> +       child_level = level - 1;
> +
> +       child_spte += (index * KVM_PAGES_PER_HPAGE(child_level)) << PAGE_SHIFT;

This += makes me nervous. It at least merits a comment explaining
what's going on.
I'd find a |= more readable to make it more explicit and since sptes
aren't numbers.
You could probably also be really explicit about extracting the PFN
and adding to it, clearing the PFN bits and then putting it back in
and I bet the compiler would optimize out the extra bit fiddling.
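
For illustration only (not part of the patch), the more explicit variant Ben
describes, extracting the PFN, offsetting it, and writing it back, might look
roughly like this, assuming the spte_to_pfn() and PT64_BASE_ADDR_MASK helpers
from spte.h:

	u64 child_spte = large_spte;
	int child_level = level - 1;
	kvm_pfn_t child_pfn = spte_to_pfn(large_spte) +
			      index * KVM_PAGES_PER_HPAGE(child_level);

	child_spte &= ~PT64_BASE_ADDR_MASK;		/* clear the huge page's address bits */
	child_spte |= (u64)child_pfn << PAGE_SHIFT;	/* install the sub-page's address */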

> +
> +       if (child_level == PG_LEVEL_4K) {
> +               child_spte &= ~PT_PAGE_SIZE_MASK;
> +
> +               /* Allow execution for 4K pages if it was disabled for NX HugePages. */
> +               if (is_nx_huge_page_enabled() && access & ACC_EXEC_MASK)
> +                       child_spte = mark_spte_executable(child_spte);
> +       }
> +
> +       return child_spte;
> +}
> +
> +
>  u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
>  {
>         u64 spte = SPTE_MMU_PRESENT_MASK;
> diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
> index 3e4943ee5a01..4efb4837e38d 100644
> --- a/arch/x86/kvm/mmu/spte.h
> +++ b/arch/x86/kvm/mmu/spte.h
> @@ -339,6 +339,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
>                unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
>                u64 old_spte, bool prefetch, bool can_unsync,
>                bool host_writable, u64 *new_spte);
> +u64 make_large_page_split_spte(u64 large_spte, int level, int index, unsigned int access);
>  u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
>  u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access);
>  u64 mark_spte_for_access_track(u64 spte);
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 5ca0fa659245..366857b9fb3b 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -695,6 +695,39 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
>         return false;
>  }
>
> +static inline bool
> +tdp_mmu_need_split_caches_topup_or_resched(struct kvm *kvm, struct tdp_iter *iter)
> +{
> +       if (mmu_split_caches_need_topup(kvm))
> +               return true;
> +
> +       return tdp_mmu_iter_need_resched(kvm, iter);
> +}
> +
> +static inline int
> +tdp_mmu_topup_split_caches_resched(struct kvm *kvm, struct tdp_iter *iter, bool flush)

This functionality could be shoe-horned into
tdp_mmu_iter_cond_resched, reducing code duplication.
I don't know if the extra parameters / complexity on that function
would be worth it, but I'm slightly inclined in that direction.

> +{
> +       int r;
> +
> +       rcu_read_unlock();
> +
> +       if (flush)
> +               kvm_flush_remote_tlbs(kvm);
> +
> +       read_unlock(&kvm->mmu_lock);
> +
> +       cond_resched();
> +       r = mmu_topup_split_caches(kvm);

Ah, right. I was confused by this for a second, but it's safe because
the caches are protected by the slots lock.

> +
> +       read_lock(&kvm->mmu_lock);
> +
> +       rcu_read_lock();
> +       WARN_ON(iter->gfn > iter->next_last_level_gfn);
> +       tdp_iter_restart(iter);
> +
> +       return r;
> +}
> +
>  /*
>   * Tears down the mappings for the range of gfns, [start, end), and frees the
>   * non-root pages mapping GFNs strictly within that range. Returns true if
> @@ -1241,6 +1274,96 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
>         return spte_set;
>  }
>
> +static bool tdp_mmu_split_large_page_atomic(struct kvm *kvm, struct tdp_iter *iter)
> +{
> +       const u64 large_spte = iter->old_spte;
> +       const int level = iter->level;
> +       struct kvm_mmu_page *child_sp;
> +       u64 child_spte;
> +       int i;
> +
> +       BUG_ON(mmu_split_caches_need_topup(kvm));

I think it would be safe to just WARN and return here as well.

> +
> +       child_sp = alloc_child_tdp_mmu_page(&kvm->arch.split_caches, iter);
> +
> +       for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
> +               child_spte = make_large_page_split_spte(large_spte, level, i, ACC_ALL);

Relating to my other comment above on make_large_page_split_spte, you
could also iterate through the range of PFNs here and pass that as an
argument to the helper function.

> +
> +               /*
> +                * No need for atomics since child_sp has not been installed
> +                * in the table yet and thus is not reachable by any other
> +                * thread.
> +                */
> +               child_sp->spt[i] = child_spte;
> +       }
> +
> +       return tdp_mmu_install_sp_atomic(kvm, iter, child_sp, false);
> +}
> +
> +static void tdp_mmu_split_large_pages_root(struct kvm *kvm, struct kvm_mmu_page *root,
> +                                          gfn_t start, gfn_t end, int target_level)
> +{
> +       struct tdp_iter iter;
> +       bool flush = false;
> +       int r;
> +
> +       rcu_read_lock();
> +
> +       /*
> +        * Traverse the page table splitting all large pages above the target
> +        * level into one lower level. For example, if we encounter a 1GB page
> +        * we split it into 512 2MB pages.
> +        *
> +        * Since the TDP iterator uses a pre-order traversal, we are guaranteed
> +        * to visit an SPTE before ever visiting its children, which means we
> +        * will correctly recursively split large pages that are more than one
> +        * level above the target level (e.g. splitting 1GB to 2MB to 4KB).
> +        */
> +       for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
> +retry:
> +               if (tdp_mmu_need_split_caches_topup_or_resched(kvm, &iter)) {
> +                       r = tdp_mmu_topup_split_caches_resched(kvm, &iter, flush);
> +                       flush = false;
> +
> +                       /*
> +                        * If topping up the split caches failed, we can't split
> +                        * any more pages. Bail out of the loop.
> +                        */
> +                       if (r)
> +                               break;
> +
> +                       continue;
> +               }
> +
> +               if (!is_large_present_pte(iter.old_spte))
> +                       continue;
> +
> +               if (!tdp_mmu_split_large_page_atomic(kvm, &iter))
> +                       goto retry;
> +
> +               flush = true;
> +       }
> +
> +       rcu_read_unlock();
> +
> +       if (flush)
> +               kvm_flush_remote_tlbs(kvm);
> +}
> +
> +void kvm_tdp_mmu_try_split_large_pages(struct kvm *kvm,
> +                                      const struct kvm_memory_slot *slot,
> +                                      gfn_t start, gfn_t end,
> +                                      int target_level)
> +{
> +       struct kvm_mmu_page *root;
> +
> +       lockdep_assert_held_read(&kvm->mmu_lock);
> +
> +       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
> +               tdp_mmu_split_large_pages_root(kvm, root, start, end, target_level);
> +
> +}
> +
>  /*
>   * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
>   * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
> index 476b133544dd..7812087836b2 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.h
> +++ b/arch/x86/kvm/mmu/tdp_mmu.h
> @@ -72,6 +72,11 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
>                                    struct kvm_memory_slot *slot, gfn_t gfn,
>                                    int min_level);
>
> +void kvm_tdp_mmu_try_split_large_pages(struct kvm *kvm,
> +                                      const struct kvm_memory_slot *slot,
> +                                      gfn_t start, gfn_t end,
> +                                      int target_level);
> +
>  static inline void kvm_tdp_mmu_walk_lockless_begin(void)
>  {
>         rcu_read_lock();
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 04e8dabc187d..4702ebfd394b 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -11735,6 +11735,12 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
>                 if (kvm_dirty_log_manual_protect_and_init_set(kvm))
>                         return;
>
> +               /*
> +                * Attempt to split all large pages into 4K pages so that vCPUs
> +                * do not have to take write-protection faults.
> +                */
> +               kvm_mmu_slot_try_split_large_pages(kvm, new, PG_LEVEL_4K);

Thank you for parameterizing the target level here. I'm working on a
proof of concept for 2M dirty tracking right now (still in exploratory
phase) and this parameter will help future-proof the splitting
algorithm if we ever decide we don't want to split everything to 4k
for dirty logging.
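
For example, a hypothetical 2M dirty-tracking mode could reuse the same entry
point and simply stop splitting at 2M (illustrative only, not from the patch):

	kvm_mmu_slot_try_split_large_pages(kvm, new, PG_LEVEL_2M);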

> +
>                 if (kvm_x86_ops.cpu_dirty_log_size) {
>                         kvm_mmu_slot_leaf_clear_dirty(kvm, new);
>                         kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M);
> --
> 2.34.0.rc2.393.gf8c9666880-goog
>
Peter Xu Nov. 26, 2021, 12:01 p.m. UTC | #3
Hi, David,

On Fri, Nov 19, 2021 at 11:57:56PM +0000, David Matlack wrote:
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 2a7564703ea6..432a4df817ec 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1232,6 +1232,9 @@ struct kvm_arch {
>  	hpa_t	hv_root_tdp;
>  	spinlock_t hv_root_tdp_lock;
>  #endif
> +
> +	/* MMU caches used when splitting large pages during VM-ioctls. */
> +	struct kvm_mmu_memory_caches split_caches;

Are mmu_gfn_array_cache and mmu_pte_list_desc_cache wasted here?  I saw that
"struct kvm_mmu_memory_cache" still takes up quite a few hundreds of bytes,
just want to make sure we won't waste them in vain.

[...]

> +int mmu_topup_split_caches(struct kvm *kvm)
> +{
> +	struct kvm_mmu_memory_caches *split_caches = &kvm->arch.split_caches;
> +	int r;
> +
> +	assert_split_caches_invariants(kvm);
> +
> +	r = kvm_mmu_topup_memory_cache(&split_caches->page_header_cache, 1);
> +	if (r)
> +		goto out;
> +
> +	r = kvm_mmu_topup_memory_cache(&split_caches->shadow_page_cache, 1);
> +	if (r)
> +		goto out;

Is it intended to only top-up with one cache object?  IIUC this means we'll try
to proactively yield the cpu for each of the huge page split right after the
object is consumed.

Wondering whether it be more efficient to make it a slightly larger number, so
we don't overload the memory but also make the loop a bit more efficient.

> +
> +	return 0;
> +
> +out:
> +	pr_warn("Failed to top-up split caches. Will not split large pages.\n");
> +	return r;
> +}

Thanks,
David Matlack Nov. 30, 2021, 11:33 p.m. UTC | #4
On Sun, Nov 21, 2021 at 9:05 PM Nikunj A. Dadhania <nikunj@amd.com> wrote:
>
>
>
> On 11/20/2021 5:27 AM, David Matlack wrote:
> > When dirty logging is enabled without initially-all-set, attempt to
> > split all large pages in the memslot down to 4KB pages so that vCPUs
> > do not have to take expensive write-protection faults to split large
> > pages.
> >
> > Large page splitting is best-effort only. This commit only adds the
> > support for the TDP MMU, and even there splitting may fail due to out
> > of memory conditions. Failures to split a large page is fine from a
> > correctness standpoint because we still always follow it up by write-
> > protecting any remaining large pages.
> >
> > Signed-off-by: David Matlack <dmatlack@google.com>
>
> > +int mmu_topup_split_caches(struct kvm *kvm)
> > +{
> > +     struct kvm_mmu_memory_caches *split_caches = &kvm->arch.split_caches;
> > +     int r;
> > +
> > +     assert_split_caches_invariants(kvm);
> > +
> > +     r = kvm_mmu_topup_memory_cache(&split_caches->page_header_cache, 1);
> > +     if (r)
> > +             goto out;
> > +
> > +     r = kvm_mmu_topup_memory_cache(&split_caches->shadow_page_cache, 1);
> > +     if (r)
> > +             goto out;
> > +
> > +     return 0;
> > +
> > +out:
> > +     pr_warn("Failed to top-up split caches. Will not split large pages.\n");
> > +     return r;
> > +}
> > +
> > +static void mmu_free_split_caches(struct kvm *kvm)
> > +{
> > +     assert_split_caches_invariants(kvm);
> > +
> > +     kvm_mmu_free_memory_cache(&kvm->arch.split_caches.pte_list_desc_cache);
>                                                               ^^^^^^^^^^^^^^
> I believe this should be page_header_cache.

Oh wow, thanks for catching that. You are correct. Will fix in v1.

>
> > +     kvm_mmu_free_memory_cache(&kvm->arch.split_caches.shadow_page_cache);
> > +}
>
> Regards
> Nikunj
>
David Matlack Nov. 30, 2021, 11:44 p.m. UTC | #5
On Mon, Nov 22, 2021 at 11:31 AM Ben Gardon <bgardon@google.com> wrote:
>
> On Fri, Nov 19, 2021 at 3:58 PM David Matlack <dmatlack@google.com> wrote:
> >
> > When dirty logging is enabled without initially-all-set, attempt to
> > split all large pages in the memslot down to 4KB pages so that vCPUs
> > do not have to take expensive write-protection faults to split large
> > pages.
> >
> > Large page splitting is best-effort only. This commit only adds the
> > support for the TDP MMU, and even there splitting may fail due to out
> > of memory conditions. Failures to split a large page is fine from a
> > correctness standpoint because we still always follow it up by write-
> > protecting any remaining large pages.
> >
> > Signed-off-by: David Matlack <dmatlack@google.com>
> > ---
> >  arch/x86/include/asm/kvm_host.h |   6 ++
> >  arch/x86/kvm/mmu/mmu.c          |  83 +++++++++++++++++++++
> >  arch/x86/kvm/mmu/mmu_internal.h |   3 +
> >  arch/x86/kvm/mmu/spte.c         |  46 ++++++++++++
> >  arch/x86/kvm/mmu/spte.h         |   1 +
> >  arch/x86/kvm/mmu/tdp_mmu.c      | 123 ++++++++++++++++++++++++++++++++
> >  arch/x86/kvm/mmu/tdp_mmu.h      |   5 ++
> >  arch/x86/kvm/x86.c              |   6 ++
> >  8 files changed, 273 insertions(+)
> >
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 2a7564703ea6..432a4df817ec 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -1232,6 +1232,9 @@ struct kvm_arch {
> >         hpa_t   hv_root_tdp;
> >         spinlock_t hv_root_tdp_lock;
> >  #endif
> > +
> > +       /* MMU caches used when splitting large pages during VM-ioctls. */
> > +       struct kvm_mmu_memory_caches split_caches;
> >  };
> >
> >  struct kvm_vm_stat {
> > @@ -1588,6 +1591,9 @@ void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
> >  void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
> >                                       const struct kvm_memory_slot *memslot,
> >                                       int start_level);
> > +void kvm_mmu_slot_try_split_large_pages(struct kvm *kvm,
> > +                                       const struct kvm_memory_slot *memslot,
> > +                                       int target_level);
> >  void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
> >                                    const struct kvm_memory_slot *memslot);
> >  void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index 54f0d2228135..6768ef9c0891 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -738,6 +738,66 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
> >                                           PT64_ROOT_MAX_LEVEL);
> >  }
> >
> > +static inline void assert_split_caches_invariants(struct kvm *kvm)
> > +{
> > +       /*
> > +        * The split caches must only be modified while holding the slots_lock,
> > +        * since it is only used during memslot VM-ioctls.
> > +        */
> > +       lockdep_assert_held(&kvm->slots_lock);
> > +
> > +       /*
> > +        * Only the TDP MMU supports large page splitting using
> > +        * kvm->arch.split_caches, which is why we only have to allocate
> > +        * page_header_cache and shadow_page_cache. Assert that the TDP
> > +        * MMU is at least enabled when the split cache is allocated.
> > +        */
> > +       BUG_ON(!is_tdp_mmu_enabled(kvm));
> > +}
> > +
> > +int mmu_topup_split_caches(struct kvm *kvm)
> > +{
> > +       struct kvm_mmu_memory_caches *split_caches = &kvm->arch.split_caches;
> > +       int r;
> > +
> > +       assert_split_caches_invariants(kvm);
> > +
> > +       r = kvm_mmu_topup_memory_cache(&split_caches->page_header_cache, 1);
> > +       if (r)
> > +               goto out;
> > +
> > +       r = kvm_mmu_topup_memory_cache(&split_caches->shadow_page_cache, 1);
> > +       if (r)
> > +               goto out;
> > +
> > +       return 0;
> > +
> > +out:
> > +       pr_warn("Failed to top-up split caches. Will not split large pages.\n");
> > +       return r;
> > +}
> > +
> > +static void mmu_free_split_caches(struct kvm *kvm)
> > +{
> > +       assert_split_caches_invariants(kvm);
> > +
> > +       kvm_mmu_free_memory_cache(&kvm->arch.split_caches.pte_list_desc_cache);
> > +       kvm_mmu_free_memory_cache(&kvm->arch.split_caches.shadow_page_cache);
> > +}
> > +
> > +bool mmu_split_caches_need_topup(struct kvm *kvm)
> > +{
> > +       assert_split_caches_invariants(kvm);
> > +
> > +       if (kvm->arch.split_caches.page_header_cache.nobjs == 0)
> > +               return true;
> > +
> > +       if (kvm->arch.split_caches.shadow_page_cache.nobjs == 0)
> > +               return true;
> > +
> > +       return false;
> > +}
> > +
> >  static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
> >  {
> >         struct kvm_mmu_memory_caches *mmu_caches;
> > @@ -5696,6 +5756,7 @@ void kvm_mmu_init_vm(struct kvm *kvm)
> >
> >         spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
> >
> > +       mmu_init_memory_caches(&kvm->arch.split_caches);
> >         kvm_mmu_init_tdp_mmu(kvm);
> >
> >         node->track_write = kvm_mmu_pte_write;
> > @@ -5819,6 +5880,28 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
> >                 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
> >  }
> >
> > +void kvm_mmu_slot_try_split_large_pages(struct kvm *kvm,
> > +                                       const struct kvm_memory_slot *memslot,
> > +                                       int target_level)
> > +{
> > +       u64 start, end;
> > +
> > +       if (!is_tdp_mmu_enabled(kvm))
> > +               return;
> > +
> > +       if (mmu_topup_split_caches(kvm))
> > +               return;
> > +
> > +       start = memslot->base_gfn;
> > +       end = start + memslot->npages;
> > +
> > +       read_lock(&kvm->mmu_lock);
> > +       kvm_tdp_mmu_try_split_large_pages(kvm, memslot, start, end, target_level);
> > +       read_unlock(&kvm->mmu_lock);
> > +
> > +       mmu_free_split_caches(kvm);
> > +}
> > +
> >  static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
> >                                          struct kvm_rmap_head *rmap_head,
> >                                          const struct kvm_memory_slot *slot)
> > diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> > index 52c6527b1a06..89b9b907c567 100644
> > --- a/arch/x86/kvm/mmu/mmu_internal.h
> > +++ b/arch/x86/kvm/mmu/mmu_internal.h
> > @@ -161,4 +161,7 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
> >  void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
> >  void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
> >
> > +int mmu_topup_split_caches(struct kvm *kvm);
> > +bool mmu_split_caches_need_topup(struct kvm *kvm);
> > +
> >  #endif /* __KVM_X86_MMU_INTERNAL_H */
> > diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
> > index df2cdb8bcf77..6bb9b597a854 100644
> > --- a/arch/x86/kvm/mmu/spte.c
> > +++ b/arch/x86/kvm/mmu/spte.c
> > @@ -191,6 +191,52 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> >         return wrprot;
> >  }
> >
> > +static u64 mark_spte_executable(u64 spte)
> > +{
> > +       bool is_access_track = is_access_track_spte(spte);
> > +
> > +       if (is_access_track)
> > +               spte = restore_acc_track_spte(spte);
> > +
> > +       spte &= ~shadow_nx_mask;
> > +       spte |= shadow_x_mask;
> > +
> > +       if (is_access_track)
> > +               spte = mark_spte_for_access_track(spte);
> > +
> > +       return spte;
> > +}
> > +
> > +/*
> > + * Construct an SPTE that maps a sub-page of the given large SPTE. This is
> > + * used during large page splitting, to build the SPTEs that make up the new
> > + * page table.
> > + */
> > +u64 make_large_page_split_spte(u64 large_spte, int level, int index, unsigned int access)
>
> Just because this always trips me up reading code, I'd suggest naming
> the argument large_spte_level or something.
> Avoiding a variable called "level" in this function makes it much more explicit.

Will do.

>
> > +{
> > +       u64 child_spte;
> > +       int child_level;
> > +
> > +       BUG_ON(is_mmio_spte(large_spte));
> > +       BUG_ON(!is_large_present_pte(large_spte));
>
> In the interest of not crashing the host, I think it would be safe to
> WARN and return 0 here.
> BUG is fine too if that's preferred.

Ack. I'll take a look and see if I can avoid the BUG_ONs. They're
optional sanity checks anyway.

>
> > +
> > +       child_spte = large_spte;
> > +       child_level = level - 1;
> > +
> > +       child_spte += (index * KVM_PAGES_PER_HPAGE(child_level)) << PAGE_SHIFT;
>
> This += makes me nervous. It at least merits a comment explaining
> what's going on.
> I'd find a |= more readable to make it more explicit and since sptes
> aren't numbers.
> You could probably also be really explicit about extracting the PFN
> and adding to it, clearing the PFN bits and then putting it back in
> and I bet the compiler would optimize out the extra bit fiddling.

I can change it to |= and add a comment. I prefer not to extract the
PFN and replace it since there's really no reason to. One of the nice
things about this function is that we don't have to construct the child
SPTE from scratch; we just have to slightly adjust the parent SPTE. For
the address, the base address in the large SPTE is already there, and we
just need to add in the offset to the lower-level page.
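
As a rough sketch (not the actual v1 code), the |= plus comment that David
describes would presumably end up looking something like:

	child_spte = large_spte;
	child_level = level - 1;

	/*
	 * The large SPTE already contains the base address of the huge page,
	 * and the address bits below the huge page size are zero, so OR-ing
	 * in the offset of the @index-th child page yields its address.
	 */
	child_spte |= (index * KVM_PAGES_PER_HPAGE(child_level)) << PAGE_SHIFT;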

>
> > +
> > +       if (child_level == PG_LEVEL_4K) {
> > +               child_spte &= ~PT_PAGE_SIZE_MASK;
> > +
> > +               /* Allow execution for 4K pages if it was disabled for NX HugePages. */
> > +               if (is_nx_huge_page_enabled() && access & ACC_EXEC_MASK)
> > +                       child_spte = mark_spte_executable(child_spte);
> > +       }
> > +
> > +       return child_spte;
> > +}
> > +
> > +
> >  u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
> >  {
> >         u64 spte = SPTE_MMU_PRESENT_MASK;
> > diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
> > index 3e4943ee5a01..4efb4837e38d 100644
> > --- a/arch/x86/kvm/mmu/spte.h
> > +++ b/arch/x86/kvm/mmu/spte.h
> > @@ -339,6 +339,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> >                unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
> >                u64 old_spte, bool prefetch, bool can_unsync,
> >                bool host_writable, u64 *new_spte);
> > +u64 make_large_page_split_spte(u64 large_spte, int level, int index, unsigned int access);
> >  u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
> >  u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access);
> >  u64 mark_spte_for_access_track(u64 spte);
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index 5ca0fa659245..366857b9fb3b 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -695,6 +695,39 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
> >         return false;
> >  }
> >
> > +static inline bool
> > +tdp_mmu_need_split_caches_topup_or_resched(struct kvm *kvm, struct tdp_iter *iter)
> > +{
> > +       if (mmu_split_caches_need_topup(kvm))
> > +               return true;
> > +
> > +       return tdp_mmu_iter_need_resched(kvm, iter);
> > +}
> > +
> > +static inline int
> > +tdp_mmu_topup_split_caches_resched(struct kvm *kvm, struct tdp_iter *iter, bool flush)
>
> This functionality could be shoe-horned into
> tdp_mmu_iter_cond_resched, reducing code duplication.
> I don't know if the extra parameters / complexity on that function
> would be worth it, but I'm slightly inclined in that direction.

Ok I'll take a look and see if I can combine them in a nice way.

>
> > +{
> > +       int r;
> > +
> > +       rcu_read_unlock();
> > +
> > +       if (flush)
> > +               kvm_flush_remote_tlbs(kvm);
> > +
> > +       read_unlock(&kvm->mmu_lock);
> > +
> > +       cond_resched();
> > +       r = mmu_topup_split_caches(kvm);
>
> Ah, right. I was confused by this for a second, but it's safe because
> the caches are protected by the slots lock.
>
> > +
> > +       read_lock(&kvm->mmu_lock);
> > +
> > +       rcu_read_lock();
> > +       WARN_ON(iter->gfn > iter->next_last_level_gfn);
> > +       tdp_iter_restart(iter);
> > +
> > +       return r;
> > +}
> > +
> >  /*
> >   * Tears down the mappings for the range of gfns, [start, end), and frees the
> >   * non-root pages mapping GFNs strictly within that range. Returns true if
> > @@ -1241,6 +1274,96 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
> >         return spte_set;
> >  }
> >
> > +static bool tdp_mmu_split_large_page_atomic(struct kvm *kvm, struct tdp_iter *iter)
> > +{
> > +       const u64 large_spte = iter->old_spte;
> > +       const int level = iter->level;
> > +       struct kvm_mmu_page *child_sp;
> > +       u64 child_spte;
> > +       int i;
> > +
> > +       BUG_ON(mmu_split_caches_need_topup(kvm));
>
> I think it would be safe to just WARN and return here as well.
>
> > +
> > +       child_sp = alloc_child_tdp_mmu_page(&kvm->arch.split_caches, iter);
> > +
> > +       for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
> > +               child_spte = make_large_page_split_spte(large_spte, level, i, ACC_ALL);
>
> Relating to my other comment above on make_large_page_split_spte, you
> could also iterate through the range of PFNs here and pass that as an
> argument to the helper function.
>
> > +
> > +               /*
> > +                * No need for atomics since child_sp has not been installed
> > +                * in the table yet and thus is not reachable by any other
> > +                * thread.
> > +                */
> > +               child_sp->spt[i] = child_spte;
> > +       }
> > +
> > +       return tdp_mmu_install_sp_atomic(kvm, iter, child_sp, false);
> > +}
> > +
> > +static void tdp_mmu_split_large_pages_root(struct kvm *kvm, struct kvm_mmu_page *root,
> > +                                          gfn_t start, gfn_t end, int target_level)
> > +{
> > +       struct tdp_iter iter;
> > +       bool flush = false;
> > +       int r;
> > +
> > +       rcu_read_lock();
> > +
> > +       /*
> > +        * Traverse the page table splitting all large pages above the target
> > +        * level into one lower level. For example, if we encounter a 1GB page
> > +        * we split it into 512 2MB pages.
> > +        *
> > +        * Since the TDP iterator uses a pre-order traversal, we are guaranteed
> > +        * to visit an SPTE before ever visiting its children, which means we
> > +        * will correctly recursively split large pages that are more than one
> > +        * level above the target level (e.g. splitting 1GB to 2MB to 4KB).
> > +        */
> > +       for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
> > +retry:
> > +               if (tdp_mmu_need_split_caches_topup_or_resched(kvm, &iter)) {
> > +                       r = tdp_mmu_topup_split_caches_resched(kvm, &iter, flush);
> > +                       flush = false;
> > +
> > +                       /*
> > +                        * If topping up the split caches failed, we can't split
> > +                        * any more pages. Bail out of the loop.
> > +                        */
> > +                       if (r)
> > +                               break;
> > +
> > +                       continue;
> > +               }
> > +
> > +               if (!is_large_present_pte(iter.old_spte))
> > +                       continue;
> > +
> > +               if (!tdp_mmu_split_large_page_atomic(kvm, &iter))
> > +                       goto retry;
> > +
> > +               flush = true;
> > +       }
> > +
> > +       rcu_read_unlock();
> > +
> > +       if (flush)
> > +               kvm_flush_remote_tlbs(kvm);
> > +}
> > +
> > +void kvm_tdp_mmu_try_split_large_pages(struct kvm *kvm,
> > +                                      const struct kvm_memory_slot *slot,
> > +                                      gfn_t start, gfn_t end,
> > +                                      int target_level)
> > +{
> > +       struct kvm_mmu_page *root;
> > +
> > +       lockdep_assert_held_read(&kvm->mmu_lock);
> > +
> > +       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
> > +               tdp_mmu_split_large_pages_root(kvm, root, start, end, target_level);
> > +
> > +}
> > +
> >  /*
> >   * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
> >   * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
> > index 476b133544dd..7812087836b2 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.h
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.h
> > @@ -72,6 +72,11 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
> >                                    struct kvm_memory_slot *slot, gfn_t gfn,
> >                                    int min_level);
> >
> > +void kvm_tdp_mmu_try_split_large_pages(struct kvm *kvm,
> > +                                      const struct kvm_memory_slot *slot,
> > +                                      gfn_t start, gfn_t end,
> > +                                      int target_level);
> > +
> >  static inline void kvm_tdp_mmu_walk_lockless_begin(void)
> >  {
> >         rcu_read_lock();
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 04e8dabc187d..4702ebfd394b 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -11735,6 +11735,12 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
> >                 if (kvm_dirty_log_manual_protect_and_init_set(kvm))
> >                         return;
> >
> > +               /*
> > +                * Attempt to split all large pages into 4K pages so that vCPUs
> > +                * do not have to take write-protection faults.
> > +                */
> > +               kvm_mmu_slot_try_split_large_pages(kvm, new, PG_LEVEL_4K);
>
> Thank you for parameterizing the target level here. I'm working on a
> proof of concept for 2M dirty tracking right now (still in exploratory
> phase) and this parameter will help future-proof the splitting
> algorithm if we ever decide we don't want to split everything to 4k
> for dirty logging.

Exactly my thinking as well! :)

>
> > +
> >                 if (kvm_x86_ops.cpu_dirty_log_size) {
> >                         kvm_mmu_slot_leaf_clear_dirty(kvm, new);
> >                         kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M);
> > --
> > 2.34.0.rc2.393.gf8c9666880-goog
> >
David Matlack Nov. 30, 2021, 11:56 p.m. UTC | #6
On Fri, Nov 26, 2021 at 4:01 AM Peter Xu <peterx@redhat.com> wrote:
>
> Hi, David,
>
> On Fri, Nov 19, 2021 at 11:57:56PM +0000, David Matlack wrote:
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 2a7564703ea6..432a4df817ec 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -1232,6 +1232,9 @@ struct kvm_arch {
> >       hpa_t   hv_root_tdp;
> >       spinlock_t hv_root_tdp_lock;
> >  #endif
> > +
> > +     /* MMU caches used when splitting large pages during VM-ioctls. */
> > +     struct kvm_mmu_memory_caches split_caches;
>
> Are mmu_gfn_array_cache and mmu_pte_list_desc_cache wasted here?  I saw that
> "struct kvm_mmu_memory_cache" still takes up quite a few hundreds of bytes,
> just want to make sure we won't waste them in vain.

Yes they are wasted right now. But there's a couple of things to keep in mind:

1. They are also wasted in every vCPU (in the per-vCPU caches) that
does not use the shadow MMU.
2. They will (I think) be used eventually when I add Eager Page
Splitting support to the shadow MMU.
3. split_caches is per-VM so it's only a few hundred bytes per VM.

If we really want to save the memory the right way forward might be to
make each kvm_mmu_memory_cache a pointer instead of an embedded
struct. Then we can allocate each dynamically only as needed. I can
add that to my TODO list but I don't think it'd be worth blocking this
on it given the points above.
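
Purely as an illustration of that direction (field names here are guesses,
not from the series), the aggregate could hold pointers that are allocated
on first use:

	struct kvm_mmu_memory_caches {
		struct kvm_mmu_memory_cache *pte_list_desc_cache;
		struct kvm_mmu_memory_cache *shadow_page_cache;
		struct kvm_mmu_memory_cache *gfn_array_cache;
		struct kvm_mmu_memory_cache *page_header_cache;
	};

The TDP MMU split path would then allocate only the two caches it uses.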

>
> [...]
>
> > +int mmu_topup_split_caches(struct kvm *kvm)
> > +{
> > +     struct kvm_mmu_memory_caches *split_caches = &kvm->arch.split_caches;
> > +     int r;
> > +
> > +     assert_split_caches_invariants(kvm);
> > +
> > +     r = kvm_mmu_topup_memory_cache(&split_caches->page_header_cache, 1);
> > +     if (r)
> > +             goto out;
> > +
> > +     r = kvm_mmu_topup_memory_cache(&split_caches->shadow_page_cache, 1);
> > +     if (r)
> > +             goto out;
>
> Is it intended to only top-up with one cache object?  IIUC this means we'll try
> to proactively yield the cpu for each of the huge page split right after the
> object is consumed.
>
> Wondering whether it be more efficient to make it a slightly larger number, so
> we don't overload the memory but also make the loop a bit more efficient.

IIUC, 1 here is just the min needed for kvm_mmu_topup_memory_cache to
return success. I chose 1 for each because it's the minimum necessary
to make forward progress (split one large page).

No matter what you pass for min kvm_mmu_topup_memory_cache() will
still always try to allocate KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
objects.


>
> > +
> > +     return 0;
> > +
> > +out:
> > +     pr_warn("Failed to top-up split caches. Will not split large pages.\n");
> > +     return r;
> > +}
>
> Thanks,
>
> --
> Peter Xu
>
Sean Christopherson Dec. 1, 2021, 1 a.m. UTC | #7
On Tue, Nov 30, 2021, David Matlack wrote:
> On Fri, Nov 26, 2021 at 4:01 AM Peter Xu <peterx@redhat.com> wrote:
> >
> > Hi, David,
> >
> > On Fri, Nov 19, 2021 at 11:57:56PM +0000, David Matlack wrote:
> > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > > index 2a7564703ea6..432a4df817ec 100644
> > > --- a/arch/x86/include/asm/kvm_host.h
> > > +++ b/arch/x86/include/asm/kvm_host.h
> > > @@ -1232,6 +1232,9 @@ struct kvm_arch {
> > >       hpa_t   hv_root_tdp;
> > >       spinlock_t hv_root_tdp_lock;
> > >  #endif
> > > +
> > > +     /* MMU caches used when splitting large pages during VM-ioctls. */
> > > +     struct kvm_mmu_memory_caches split_caches;
> >
> > Are mmu_gfn_array_cache and mmu_pte_list_desc_cache wasted here?  I saw that
> > "struct kvm_mmu_memory_cache" still takes up quite a few hundreds of bytes,
> > just want to make sure we won't waste them in vain.
> 
> Yes they are wasted right now. But there's a couple of things to keep in mind:
> 
> 1. They are also wasted in every vCPU (in the per-vCPU caches) that
> does not use the shadow MMU.
> 2. They will (I think) be used eventually when I add Eager Page
> Splitting support to the shadow MMU.
> 3. split_caches is per-VM so it's only a few hundred bytes per VM.
> 
> If we really want to save the memory the right way forward might be to
> make each kvm_mmu_memory_cache a pointer instead of an embedded
> struct. Then we can allocate each dynamically only as needed. I can
> add that to my TODO list but I don't think it'd be worth blocking this
> on it given the points above.
> 
> >
> > [...]
> >
> > > +int mmu_topup_split_caches(struct kvm *kvm)
> > > +{
> > > +     struct kvm_mmu_memory_caches *split_caches = &kvm->arch.split_caches;
> > > +     int r;
> > > +
> > > +     assert_split_caches_invariants(kvm);
> > > +
> > > +     r = kvm_mmu_topup_memory_cache(&split_caches->page_header_cache, 1);
> > > +     if (r)
> > > +             goto out;
> > > +
> > > +     r = kvm_mmu_topup_memory_cache(&split_caches->shadow_page_cache, 1);
> > > +     if (r)
> > > +             goto out;
> >
> > Is it intended to only top-up with one cache object?  IIUC this means we'll try
> > to proactively yield the cpu for each of the huge page split right after the
> > object is consumed.
> >
> > Wondering whether it be more efficient to make it a slightly larger number, so
> > we don't overload the memory but also make the loop a bit more efficient.
> 
> IIUC, 1 here is just the min needed for kvm_mmu_topup_memory_cache to
> return success. I chose 1 for each because it's the minimum necessary
> to make forward progress (split one large page).

The @min parameter is minimum number of pages that _must_ be available in the
cache, i.e. it's the maximum number of pages that can theoretically be used by
whatever upcoming operation is going to be consuming pages from the cache.

So '1' is technically correct, but I think it's the wrong choice given the behavior
of this code.  E.g. if there's 1 object in the cache, the initial top-up will do
nothing, and then tdp_mmu_split_large_pages_root() will almost immediately drop
mmu_lock to topup the cache.  Since the in-loop usage explicitly checks for an
empty cache, i.e. any non-zero @min will have identical behavior, I think it makes
sense to use KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE _and_ add a comment explaining why.
 
> No matter what you pass for min kvm_mmu_topup_memory_cache() will
> still always try to allocate KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
> objects.

No, it will try to allocate KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE if and only if there
are fewer than @min objects in the cache.
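
For reference, the generic top-up helper behaves roughly as follows
(paraphrased from virt/kvm/kvm_main.c of this era; details may differ
slightly). @min is the threshold below which a top-up happens at all, not
the number of objects allocated:

int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
{
	void *obj;

	/* Already have at least @min objects: the top-up is a no-op. */
	if (mc->nobjs >= min)
		return 0;

	/* Otherwise fill the cache to capacity (KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE). */
	while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
		obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
		if (!obj)
			return mc->nobjs >= min ? 0 : -ENOMEM;
		mc->objects[mc->nobjs++] = obj;
	}
	return 0;
}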
David Matlack Dec. 1, 2021, 1:29 a.m. UTC | #8
On Tue, Nov 30, 2021 at 5:01 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Tue, Nov 30, 2021, David Matlack wrote:
> > On Fri, Nov 26, 2021 at 4:01 AM Peter Xu <peterx@redhat.com> wrote:
> > >
> > > Hi, David,
> > >
> > > On Fri, Nov 19, 2021 at 11:57:56PM +0000, David Matlack wrote:
> > > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > > > index 2a7564703ea6..432a4df817ec 100644
> > > > --- a/arch/x86/include/asm/kvm_host.h
> > > > +++ b/arch/x86/include/asm/kvm_host.h
> > > > @@ -1232,6 +1232,9 @@ struct kvm_arch {
> > > >       hpa_t   hv_root_tdp;
> > > >       spinlock_t hv_root_tdp_lock;
> > > >  #endif
> > > > +
> > > > +     /* MMU caches used when splitting large pages during VM-ioctls. */
> > > > +     struct kvm_mmu_memory_caches split_caches;
> > >
> > > Are mmu_gfn_array_cache and mmu_pte_list_desc_cache wasted here?  I saw that
> > > "struct kvm_mmu_memory_cache" still takes up quite a few hundreds of bytes,
> > > just want to make sure we won't waste them in vain.
> >
> > Yes they are wasted right now. But there's a couple of things to keep in mind:
> >
> > 1. They are also wasted in every vCPU (in the per-vCPU caches) that
> > does not use the shadow MMU.
> > 2. They will (I think) be used eventually when I add Eager Page
> > Splitting support to the shadow MMU.
> > 3. split_caches is per-VM so it's only a few hundred bytes per VM.
> >
> > If we really want to save the memory the right way forward might be to
> > make each kvm_mmu_memory_cache a pointer instead of an embedded
> > struct. Then we can allocate each dynamically only as needed. I can
> > add that to my TODO list but I don't think it'd be worth blocking this
> > on it given the points above.
> >
> > >
> > > [...]
> > >
> > > > +int mmu_topup_split_caches(struct kvm *kvm)
> > > > +{
> > > > +     struct kvm_mmu_memory_caches *split_caches = &kvm->arch.split_caches;
> > > > +     int r;
> > > > +
> > > > +     assert_split_caches_invariants(kvm);
> > > > +
> > > > +     r = kvm_mmu_topup_memory_cache(&split_caches->page_header_cache, 1);
> > > > +     if (r)
> > > > +             goto out;
> > > > +
> > > > +     r = kvm_mmu_topup_memory_cache(&split_caches->shadow_page_cache, 1);
> > > > +     if (r)
> > > > +             goto out;
> > >
> > > Is it intended to only top-up with one cache object?  IIUC this means we'll try
> > > to proactively yield the cpu for each of the huge page split right after the
> > > object is consumed.
> > >
> > > Wondering whether it be more efficient to make it a slightly larger number, so
> > > we don't overload the memory but also make the loop a bit more efficient.
> >
> > IIUC, 1 here is just the min needed for kvm_mmu_topup_memory_cache to
> > return success. I chose 1 for each because it's the minimum necessary
> > to make forward progress (split one large page).
>
> The @min parameter is minimum number of pages that _must_ be available in the
> cache, i.e. it's the maximum number of pages that can theoretically be used by
> whatever upcoming operation is going to be consuming pages from the cache.
>
> So '1' is technically correct, but I think it's the wrong choice given the behavior
> of this code.  E.g. if there's 1 object in the cache, the initial top-up will do
> nothing,

This scenario will not happen though, since we free the caches after
splitting. So, the next time userspace enables dirty logging on a
memslot and we go to do the initial top-up the caches will have 0
objects.

> and then tdp_mmu_split_large_pages_root() will almost immediately drop
> mmu_lock to topup the cache.  Since the in-loop usage explicitly checks for an
> empty cache, i.e. any non-zero @min will have identical behavior, I think it makes
> sense to use KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE _and_ add a comment explaining why.

If we set the min to KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE,
kvm_mmu_topup_memory_cache will return ENOMEM if it can't allocate at
least KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE objects, even though we really
only need 1 to make forward progress.

It's a total edge case but there could be a scenario where userspace
sets the cgroup memory limits so tight that we can't allocate
KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE objects when splitting the last few
pages and in the end we only needed 1 or 2 objects to finish
splitting. In this case we'd end up with a spurious pr_warn and may
not split the last few pages depending on which cache failed to get
topped up.


>
> > No matter what you pass for min kvm_mmu_topup_memory_cache() will
> > still always try to allocate KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
> > objects.
>
> No, it will try to allocate KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE if and only if there
> are fewer than @min objects in the cache.
Peter Xu Dec. 1, 2021, 2:29 a.m. UTC | #9
On Tue, Nov 30, 2021 at 05:29:10PM -0800, David Matlack wrote:
> On Tue, Nov 30, 2021 at 5:01 PM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Tue, Nov 30, 2021, David Matlack wrote:
> > > On Fri, Nov 26, 2021 at 4:01 AM Peter Xu <peterx@redhat.com> wrote:
> > > >
> > > > Hi, David,
> > > >
> > > > On Fri, Nov 19, 2021 at 11:57:56PM +0000, David Matlack wrote:
> > > > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > > > > index 2a7564703ea6..432a4df817ec 100644
> > > > > --- a/arch/x86/include/asm/kvm_host.h
> > > > > +++ b/arch/x86/include/asm/kvm_host.h
> > > > > @@ -1232,6 +1232,9 @@ struct kvm_arch {
> > > > >       hpa_t   hv_root_tdp;
> > > > >       spinlock_t hv_root_tdp_lock;
> > > > >  #endif
> > > > > +
> > > > > +     /* MMU caches used when splitting large pages during VM-ioctls. */
> > > > > +     struct kvm_mmu_memory_caches split_caches;
> > > >
> > > > Are mmu_gfn_array_cache and mmu_pte_list_desc_cache wasted here?  I saw that
> > > > "struct kvm_mmu_memory_cache" still takes up quite a few hundreds of bytes,
> > > > just want to make sure we won't waste them in vain.
> > >
> > > Yes they are wasted right now. But there's a couple of things to keep in mind:
> > >
> > > 1. They are also wasted in every vCPU (in the per-vCPU caches) that
> > > does not use the shadow MMU.
> > > 2. They will (I think) be used eventually when I add Eager Page
> > > Splitting support to the shadow MMU.
> > > 3. split_caches is per-VM so it's only a few hundred bytes per VM.
> > >
> > > If we really want to save the memory the right way forward might be to
> > > make each kvm_mmu_memory_cache a pointer instead of an embedded
> > > struct. Then we can allocate each dynamically only as needed. I can
> > > add that to my TODO list but I don't think it'd be worth blocking this
> > > on it given the points above.

Yeah, I never meant to block this series just for this. :)

If there's a plan to move forward with shadow MMU support and the caches
will eventually be needed, then I'm fine with keeping it as is.  Maybe we
could add a comment above the structure until the shadow MMU support lands?
It depends on whether that support is actually on the schedule, I think.

> > >
> > > >
> > > > [...]
> > > >
> > > > > +int mmu_topup_split_caches(struct kvm *kvm)
> > > > > +{
> > > > > +     struct kvm_mmu_memory_caches *split_caches = &kvm->arch.split_caches;
> > > > > +     int r;
> > > > > +
> > > > > +     assert_split_caches_invariants(kvm);
> > > > > +
> > > > > +     r = kvm_mmu_topup_memory_cache(&split_caches->page_header_cache, 1);
> > > > > +     if (r)
> > > > > +             goto out;
> > > > > +
> > > > > +     r = kvm_mmu_topup_memory_cache(&split_caches->shadow_page_cache, 1);
> > > > > +     if (r)
> > > > > +             goto out;
> > > >
> > > > Is it intended to only top-up with one cache object?  IIUC this means we'll try
> > > > to proactively yield the cpu for each of the huge page split right after the
> > > > object is consumed.
> > > >
> > > > Wondering whether it be more efficient to make it a slightly larger number, so
> > > > we don't overload the memory but also make the loop a bit more efficient.
> > >
> > > IIUC, 1 here is just the min needed for kvm_mmu_topup_memory_cache to
> > > return success. I chose 1 for each because it's the minimum necessary
> > > to make forward progress (split one large page).
> >
> > The @min parameter is minimum number of pages that _must_ be available in the
> > cache, i.e. it's the maximum number of pages that can theoretically be used by
> > whatever upcoming operation is going to be consuming pages from the cache.
> >
> > So '1' is technically correct, but I think it's the wrong choice given the behavior
> > of this code.  E.g. if there's 1 object in the cache, the initial top-up will do
> > nothing,
> 
> This scenario will not happen though, since we free the caches after
> splitting. So, the next time userspace enables dirty logging on a
> memslot and we go to do the initial top-up the caches will have 0
> objects.
> 
> > and then tdp_mmu_split_large_pages_root() will almost immediately drop
> > mmu_lock to topup the cache.  Since the in-loop usage explicitly checks for an
> > empty cache, i.e. any non-zero @min will have identical behavior, I think it makes
> > sense to use KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE _and_ add a comment explaining why.
> 
> If we set the min to KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE,
> kvm_mmu_topup_memory_cache will return ENOMEM if it can't allocate at
> least KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE objects, even though we really
> only need 1 to make forward progress.
> 
> It's a total edge case but there could be a scenario where userspace
> sets the cgroup memory limits so tight that we can't allocate
> KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE objects when splitting the last few
> pages and in the end we only needed 1 or 2 objects to finish
> splitting. In this case we'd end up with a spurious pr_warn and may
> not split the last few pages depending on which cache failed to get
> topped up.

IMHO when -ENOMEM happens, instead of keep trying with 1 shadow sp we should
just bail out even earlier.

Say, if we only have 10 (<40) pages left for shadow sp's use, we'd better make
good use of them lazily to be consumed in follow up page faults when the guest
accessed any of the huge pages, rather than we take them all over to split the
next continuous huge pages assuming it'll be helpful..

From that POV I have a slight preference over Sean's suggestion because that'll
make us fail earlier.  But I agree it shouldn't be a big deal.

Thanks,
Sean Christopherson Dec. 1, 2021, 6:29 p.m. UTC | #10
On Wed, Dec 01, 2021, Peter Xu wrote:
> On Tue, Nov 30, 2021 at 05:29:10PM -0800, David Matlack wrote:
> > On Tue, Nov 30, 2021 at 5:01 PM Sean Christopherson <seanjc@google.com> wrote:
> > > So '1' is technically correct, but I think it's the wrong choice given the behavior
> > > of this code.  E.g. if there's 1 object in the cache, the initial top-up will do
> > > nothing,
> > 
> > This scenario will not happen though, since we free the caches after
> > splitting. So, the next time userspace enables dirty logging on a
> > memslot and we go to do the initial top-up the caches will have 0
> > objects.

Ah.

> > > and then tdp_mmu_split_large_pages_root() will almost immediately drop
> > > mmu_lock to topup the cache.  Since the in-loop usage explicitly checks for an
> > > empty cache, i.e. any non-zero @min will have identical behavior, I think it makes
> > > sense to use KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE _and_ add a comment explaining why.
> > 
> > If we set the min to KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE,
> > kvm_mmu_topup_memory_cache will return ENOMEM if it can't allocate at
> > least KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE objects, even though we really
> > only need 1 to make forward progress.
> > 
> > It's a total edge case but there could be a scenario where userspace
> > sets the cgroup memory limits so tight that we can't allocate
> > KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE objects when splitting the last few
> > pages and in the end we only needed 1 or 2 objects to finish
> > splitting. In this case we'd end up with a spurious pr_warn and may
> > not split the last few pages depending on which cache failed to get
> > topped up.
> 
> IMHO when -ENOMEM happens, instead of keep trying with 1 shadow sp we should
> just bail out even earlier.
> 
> Say, if we only have 10 (<40) pages left for shadow sp's use, we'd better make
> good use of them lazily to be consumed in follow up page faults when the guest
> accessed any of the huge pages, rather than we take them all over to split the
> next continuous huge pages assuming it'll be helpful..
> 
> From that POV I have a slight preference over Sean's suggestion because that'll
> make us fail earlier.  But I agree it shouldn't be a big deal.

Hmm, in this particular case, I think using the caches is the wrong approach.  The
behavior of pre-filling the caches makes sense for vCPUs because faults may need
multiple objects and filling the cache ensures the entire fault can be handled
without dropping mmu_lock.  And any extra/unused objects can be used by future
faults.  For page splitting, neither of those really holds true.  If there are a
lot of pages to split, KVM will have to drop mmu_lock to refill the cache.  And if
there are few pages to split, or the caches are refilled toward the end of the walk,
KVM may end up with a pile of unused objects it needs to free.

Since this code already needs to handle failure, and more importantly, it's a
best-effort optimization, I think trying to use the caches is a square peg, round
hole scenario.

Rather than use the caches, we could do allocation 100% on-demand and never drop
mmu_lock to do allocation.  The one caveat is that direct reclaim would need to be
disallowed so that the allocation won't sleep.  That would mean that eager splitting
would fail under heavy memory pressure when it otherwise might succeed by reclaiming.
That would mean vCPUs get penalized as they'd need to do the splitting on fault and
potentially do direct reclaim as well.  It's not obvious that that would be a problem
in practice, e.g. the vCPU is probably already seeing a fair amount of disruption due
to memory pressure, and slowing down vCPUs might alleviate some of that pressure.

Not using the cache would also reduce the extra complexity, e.g. no need for
special mmu_cache handling or a variant of tdp_mmu_iter_cond_resched().

I'm thinking something like this (very incomplete):

static void init_tdp_mmu_page(struct kvm_mmu_page *sp, u64 *spt, gfn_t gfn,
			      union kvm_mmu_page_role role)
{
	sp->spt = spt;
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role = role;
	sp->gfn = gfn;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);
}

static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
					       union kvm_mmu_page_role role)
{
	struct kvm_mmu_page *sp;
	u64 *spt;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);

	init_tdp_mmu_page(sp, spt, gfn, role);
}

static union kvm_mmu_page_role get_child_page_role(struct tdp_iter *iter)
{
	struct kvm_mmu_page *parent = sptep_to_sp(rcu_dereference(iter->sptep));
	union kvm_mmu_page_role role = parent->role;

	role.level--;
	return role;
}

static bool tdp_mmu_install_sp_atomic(struct kvm *kvm,
				      struct tdp_iter *iter,
				      struct kvm_mmu_page *sp,
				      bool account_nx)
{
	u64 spte;

	spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);

	if (tdp_mmu_set_spte_atomic(kvm, iter, spte)) {
		tdp_mmu_link_page(kvm, sp, account_nx);
		return true;
	}
	return false;
}

static void tdp_mmu_split_large_pages_root(struct kvm *kvm, struct kvm_mmu_page *root,
					   gfn_t start, gfn_t end, int target_level)
{
	/*
	 * Disallow direct reclaim, allocations will be made while holding
	 * mmu_lock and must not sleep.
	 */
	gfp_t gfp = (GFP_KERNEL_ACCOUNT | __GFP_ZERO) & ~__GFP_DIRECT_RECLAIM;
	struct kvm_mmu_page *sp = NULL;
	struct tdp_iter iter;
	bool flush = false;
	u64 *spt = NULL;
	int r;

	rcu_read_lock();

	/*
	 * Traverse the page table splitting all large pages above the target
	 * level into one lower level. For example, if we encounter a 1GB page
	 * we split it into 512 2MB pages.
	 *
	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
	 * to visit an SPTE before ever visiting its children, which means we
	 * will correctly recursively split large pages that are more than one
	 * level above the target level (e.g. splitting 1GB to 2MB to 4KB).
	 */
	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
			continue;

		if (!sp) {
			sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
			if (!sp)
				break;
			spt = (void *)__get_free_page(gfp);
			if (!spt)
				break;
		}

		init_tdp_mmu_page(sp, spt, iter->gfn,
				  get_child_page_role(&iter));

		if (!tdp_mmu_split_large_page(kvm, &iter, sp))
			goto retry;

		sp = NULL;
		spt = NULL;
	}

	free_page((unsigned long)spt);
	kmem_cache_free(mmu_page_header_cache, sp);

	rcu_read_unlock();

	if (flush)
		kvm_flush_remote_tlbs(kvm);
}
David Matlack Dec. 1, 2021, 9:36 p.m. UTC | #11
On Wed, Dec 1, 2021 at 10:29 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Wed, Dec 01, 2021, Peter Xu wrote:
> > On Tue, Nov 30, 2021 at 05:29:10PM -0800, David Matlack wrote:
> > > On Tue, Nov 30, 2021 at 5:01 PM Sean Christopherson <seanjc@google.com> wrote:
> > > > So '1' is technically correct, but I think it's the wrong choice given the behavior
> > > > of this code.  E.g. if there's 1 object in the cache, the initial top-up will do
> > > > nothing,
> > >
> > > This scenario will not happen though, since we free the caches after
> > > splitting. So, the next time userspace enables dirty logging on a
> > > memslot and we go to do the initial top-up the caches will have 0
> > > objects.
>
> Ah.
>
> > > > and then tdp_mmu_split_large_pages_root() will almost immediately drop
> > > > mmu_lock to topup the cache.  Since the in-loop usage explicitly checks for an
> > > > empty cache, i.e. any non-zero @min will have identical behavior, I think it makes
> > > > sense to use KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE _and_ add a comment explaining why.
> > >
> > > If we set the min to KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE,
> > > kvm_mmu_topup_memory_cache will return ENOMEM if it can't allocate at
> > > least KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE objects, even though we really
> > > only need 1 to make forward progress.
> > >
> > > It's a total edge case but there could be a scenario where userspace
> > > sets the cgroup memory limits so tight that we can't allocate
> > > KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE objects when splitting the last few
> > > pages and in the end we only needed 1 or 2 objects to finish
> > > splitting. In this case we'd end up with a spurious pr_warn and may
> > > not split the last few pages depending on which cache failed to get
> > > topped up.
> >
> > IMHO when -ENOMEM happens, instead of keep trying with 1 shadow sp we should
> > just bail out even earlier.
> >
> > Say, if we only have 10 (<40) pages left for shadow sp's use, we'd better make
> > good use of them lazily to be consumed in follow up page faults when the guest
> > accessed any of the huge pages, rather than we take them all over to split the
> > next continuous huge pages assuming it'll be helpful..
> >
> > From that POV I have a slight preference over Sean's suggestion because that'll
> > make us fail earlier.  But I agree it shouldn't be a big deal.
>
> Hmm, in this particular case, I think using the caches is the wrong approach.  The
> behavior of pre-filling the caches makes sense for vCPUs because faults may need
> multiple objects and filling the cache ensures the entire fault can be handled
> without dropping mmu_lock.  And any extra/unused objects can be used by future
> faults.  For page splitting, neither of those really holds true.  If there are a
> lot of pages to split, KVM will have to drop mmu_lock to refill the cache.  And if
> there are few pages to split, or the caches are refilled toward the end of the walk,
> KVM may end up with a pile of unused objects it needs to free.
>
> Since this code already needs to handle failure, and more importantly, it's a
> best-effort optimization, I think trying to use the caches is a square peg, round
> hole scenario.
>
> Rather than use the caches, we could do allocation 100% on-demand and never drop
> mmu_lock to do allocation.  The one caveat is that direct reclaim would need to be
> disallowed so that the allocation won't sleep.  That would mean that eager splitting
> would fail under heavy memory pressure when it otherwise might succeed by reclaiming.
> That would mean vCPUs get penalized as they'd need to do the splitting on fault and
> potentially do direct reclaim as well.  It's not obvious that that would be a problem
> in practice, e.g. the vCPU is probably already seeing a fair amount of disruption due
> to memory pressure, and slowing down vCPUs might alleviate some of that pressure.

Not necessarily. The vCPUs might be running just fine in the VM being
split because they are in their steady state and not faulting in any
new memory. (Memory pressure might be coming from another VM landing
on the host.)

IMO, if we have an opportunity to avoid doing direct reclaim in the
critical path of customer execution we should take it.

The on-demand approach will also increase the amount of time we have
to hold the MMU lock for page splitting. This is not too terrible for
the TDP MMU since we are holding the MMU lock in read mode, but is
going to become a problem when we add page splitting support for the
shadow MMU.

I do agree that the caches approach, as implemented, will inevitably
end up with a pile of unused objects at the end that need to be freed.
I'd be happy to take a look and see if there's any way to reduce the
amount of unused objects at the end with a bit smarter top-up logic.

>
> Not using the cache would also reduce the extra complexity, e.g. no need for
> special mmu_cache handling or a variant of tdp_mmu_iter_cond_resched().
>
> I'm thinking something like this (very incomplete):
>
> static void init_tdp_mmu_page(struct kvm_mmu_page *sp, u64 *spt, gfn_t gfn,
>                               union kvm_mmu_page_role role)
> {
>         sp->spt = spt;
>         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
>
>         sp->role = role;
>         sp->gfn = gfn;
>         sp->tdp_mmu_page = true;
>
>         trace_kvm_mmu_get_page(sp, true);
> }
>
> static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
>                                                union kvm_mmu_page_role role)
> {
>         struct kvm_mmu_page *sp;
>         u64 *spt;
>
>         sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
>         spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
>
>         init_tdp_mmu_page(sp, spt, gfn, role);
> }
>
> static union kvm_mmu_page_role get_child_page_role(struct tdp_iter *iter)
> {
>         struct kvm_mmu_page *parent = sptep_to_sp(rcu_dereference(iter->sptep));
>         union kvm_mmu_page_role role = parent->role;
>
>         role.level--;
>         return role;
> }
>
> static bool tdp_mmu_install_sp_atomic(struct kvm *kvm,
>                                       struct tdp_iter *iter,
>                                       struct kvm_mmu_page *sp,
>                                       bool account_nx)
> {
>         u64 spte;
>
>         spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
>
>         if (tdp_mmu_set_spte_atomic(kvm, iter, spte)) {
>                 tdp_mmu_link_page(kvm, sp, account_nx);
>                 return true;
>         }
>         return false;
> }
>
> static void tdp_mmu_split_large_pages_root(struct kvm *kvm, struct kvm_mmu_page *root,
>                                            gfn_t start, gfn_t end, int target_level)
> {
>         /*
>          * Disallow direct reclaim, allocations will be made while holding
>          * mmu_lock and must not sleep.
>          */
>         gfp_t gfp = (GFP_KERNEL_ACCOUNT | __GFP_ZERO) & ~__GFP_DIRECT_RECLAIM;
>         struct kvm_mmu_page *sp = NULL;
>         struct tdp_iter iter;
>         bool flush = false;
>         u64 *spt = NULL;
>         int r;
>
>         rcu_read_lock();
>
>         /*
>          * Traverse the page table splitting all large pages above the target
>          * level into one lower level. For example, if we encounter a 1GB page
>          * we split it into 512 2MB pages.
>          *
>          * Since the TDP iterator uses a pre-order traversal, we are guaranteed
>          * to visit an SPTE before ever visiting its children, which means we
>          * will correctly recursively split large pages that are more than one
>          * level above the target level (e.g. splitting 1GB to 2MB to 4KB).
>          */
>         for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
> retry:
>                 if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true))
>                         continue;
>
>                 if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
>                         continue;
>
>                 if (!sp) {
>                         sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
>                         if (!sp)
>                                 break;
>                         spt = (void *)__get_free_page(gfp);
>                         if (!spt)
>                                 break;
>                 }
>
>                 init_tdp_mmu_page(sp, spt, iter->gfn,
>                                   get_child_page_role(&iter));
>
>                 if (!tdp_mmu_split_large_page(kvm, &iter, sp))
>                         goto retry;
>
>                 sp = NULL;
>                 spt = NULL;
>         }
>
>         free_page((unsigned long)spt);
>         kmem_cache_free(mmu_page_header_cache, sp);
>
>         rcu_read_unlock();
>
>         if (flush)
>                 kvm_flush_remote_tlbs(kvm);
> }
Sean Christopherson Dec. 1, 2021, 11:37 p.m. UTC | #12
On Wed, Dec 01, 2021, David Matlack wrote:
> On Wed, Dec 1, 2021 at 10:29 AM Sean Christopherson <seanjc@google.com> wrote:
> > Hmm, in this particular case, I think using the caches is the wrong approach.  The
> > behavior of pre-filling the caches makes sense for vCPUs because faults may need
> > multiple objects and filling the cache ensures the entire fault can be handled
> > without dropping mmu_lock.  And any extra/unused objects can be used by future
> > faults.  For page splitting, neither of those really holds true.  If there are a
> > lot of pages to split, KVM will have to drop mmu_lock to refill the cache.  And if
> > there are few pages to split, or the caches are refilled toward the end of the walk,
> > KVM may end up with a pile of unused objects it needs to free.
> >
> > Since this code already needs to handle failure, and more importantly, it's a
> > best-effort optimization, I think trying to use the caches is a square peg, round
> > hole scenario.
> >
> > Rather than use the caches, we could do allocation 100% on-demand and never drop
> > mmu_lock to do allocation.  The one caveat is that direct reclaim would need to be
> > disallowed so that the allocation won't sleep.  That would mean that eager splitting
> > would fail under heavy memory pressure when it otherwise might succeed by reclaiming.
> > That would mean vCPUs get penalized as they'd need to do the splitting on fault and
> > potentially do direct reclaim as well.  It's not obvious that that would be a problem
> > in practice, e.g. the vCPU is probably already seeing a fair amount of disruption due
> > to memory pressure, and slowing down vCPUs might alleviate some of that pressure.
> 
> Not necessarily. The vCPUs might be running just fine in the VM being
> split because they are in their steady state and not faulting in any
> new memory. (Memory pressure might be coming from another VM landing
> on the host.)

Hrm, true.

> IMO, if we have an opportunity to avoid doing direct reclaim in the
> critical path of customer execution we should take it.
>
> 
> The on-demand approach will also increase the amount of time we have
> to hold the MMU lock for page splitting. This is not too terrible for
> the TDP MMU since we are holding the MMU lock in read mode, but is
> going to become a problem when we add page splitting support for the
> shadow MMU.
> 
> I do agree that the caches approach, as implemented, will inevitably
> end up with a pile of unused objects at the end that need to be freed.
> I'd be happy to take a look and see if there's any way to reduce the
> amount of unused objects at the end with a bit smarter top-up logic.

It's not just the extra objects, it's the overall complexity that bothers me.
Complexity isn't really the correct word, it's more that as written, the logic
is spread over several files and is disingenuous from the perspective that the
split_cache is in kvm->arch, which implies persistence, but the caches are
completely torn down after every memslot split.

I suspect part of the problem is that the code is trying to plan for a future
where nested MMUs also support splitting large pages.  Usually I'm all for that
sort of thing, but in this case it creates a lot of APIs that should not exist,
either because the function is not needed at all, or because it's a helper buried
in tdp_mmu.c.  E.g. assert_split_caches_invariants() is overkill.

That's solvable by refactoring and shuffling code, but using kvm_mmu_memory_cache
still feels wrong.  The caches don't fully solve the might_sleep() problem since
the loop still has to drop mmu_lock purely because it needs to allocate memory,
and at the same time the caches are too aggressive because we can theoretically get
false positives on OOM scenarios, e.g. a topup could fail when trying to allocate
25 objects, when only 1 is needed.  We could enhance the cache code, which is
pretty rudimentary, but it still feels forced.

One thing we can take advantage of is that remote TLB flushes can be deferred
until after all roots are done, and don't need to be serviced if mmu_lock is
dropped.  Changes from a hugepage to a collection of smaller pages is atomic, no
memory is freed, and there are no changes in gfn=>pfn made by the split.  If
something else comes along and modifies the newly created sp or its children,
then it will flush accordingly.  Similar to write-protecting the page, the only
requirement is that all vCPUs see the small pages before the ioctl() returns,
i.e. before userspace can query the dirty log.  Never needing to flush is one
less reason to use a variant of tdp_mmu_iter_cond_resched(). 

So, what if we do something like this?  Try to allocate on-demand without dropping
mmu_lock.  In the happy case, it will succeed and there's no need to drop mmu_lock.
If allocation fails, drop RCU and mmu_lock and retry with direct reclaim allowed.

Some ugly gotos to reduce indentation, there's probably a better way to dress
this up.  Comments obviously needed.  This also doesn't track whether or not a
flush is needed, that will sadly need to be an in/out param, assuming we want to
return success/failure.

static struct kvm_mmu_page *tdp_mmu_alloc_sp(gfp_t allow_direct_reclaim)
{
	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | allow_direct_reclaim;
	struct kvm_mmu_page *sp;
	u64 *spt;

	spt = (void *)__get_free_page(gfp);
	if (!spt)
		return NULL;

	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
	if (!sp) {
		free_page((unsigned long)spt);
		return NULL;
	}

	sp->spt = spt;

	return sp;
}

static int tdp_mmu_split_large_pages(struct kvm *kvm, struct kvm_mmu_page *root,
				     gfn_t start, gfn_t end, int target_level)
{
	struct kvm_mmu_page *sp = NULL;
	struct tdp_iter iter;

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
			continue;

		if (likely(sp))
			goto do_split;

		sp = tdp_mmu_alloc_sp(0);
		if (!sp) {
			rcu_read_unlock();
			read_unlock(&kvm->mmu_lock);

			sp = tdp_mmu_alloc_sp(__GFP_DIRECT_RECLAIM);

			read_lock(&kvm->mmu_lock);

			if (!sp)
				return -ENOMEM;

			rcu_read_lock();
			tdp_iter_restart(iter);
			continue;
		}

do_split:
		init_tdp_mmu_page(sp, iter->gfn, get_child_page_role(&iter));

		if (!tdp_mmu_split_large_page(kvm, &iter, sp))
			goto retry;

		sp = NULL;
	}

	rcu_read_unlock();

	return 0;
}
David Matlack Dec. 2, 2021, 5:41 p.m. UTC | #13
On Wed, Dec 1, 2021 at 3:37 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Wed, Dec 01, 2021, David Matlack wrote:
> > On Wed, Dec 1, 2021 at 10:29 AM Sean Christopherson <seanjc@google.com> wrote:
> > > Hmm, in this particular case, I think using the caches is the wrong approach.  The
> > > behavior of pre-filling the caches makes sense for vCPUs because faults may need
> > > multiple objects and filling the cache ensures the entire fault can be handled
> > > without dropping mmu_lock.  And any extra/unused objects can be used by future
> > > faults.  For page splitting, neither of those really holds true.  If there are a
> > > lot of pages to split, KVM will have to drop mmu_lock to refill the cache.  And if
> > > there are few pages to split, or the caches are refilled toward the end of the walk,
> > > KVM may end up with a pile of unused objects it needs to free.
> > >
> > > Since this code already needs to handle failure, and more importantly, it's a
> > > best-effort optimization, I think trying to use the caches is a square peg, round
> > > hole scenario.
> > >
> > > Rather than use the caches, we could do allocation 100% on-demand and never drop
> > > mmu_lock to do allocation.  The one caveat is that direct reclaim would need to be
> > > disallowed so that the allocation won't sleep.  That would mean that eager splitting
> > > would fail under heavy memory pressure when it otherwise might succeed by reclaiming.
> > > That would mean vCPUs get penalized as they'd need to do the splitting on fault and
> > > potentially do direct reclaim as well.  It's not obvious that that would be a problem
> > > in practice, e.g. the vCPU is probably already seeing a fair amount of disruption due
> > > to memory pressure, and slowing down vCPUs might alleviate some of that pressure.
> >
> > Not necessarily. The vCPUs might be running just fine in the VM being
> > split because they are in their steady state and not faulting in any
> > new memory. (Memory pressure might be coming from another VM landing
> > on the host.)
>
> Hrm, true.
>
> > IMO, if we have an opportunity to avoid doing direct reclaim in the
> > critical path of customer execution we should take it.
> >
> >
> > The on-demand approach will also increase the amount of time we have
> > to hold the MMU lock for page splitting. This is not too terrible for
> > the TDP MMU since we are holding the MMU lock in read mode, but is
> > going to become a problem when we add page splitting support for the
> > shadow MMU.
> >
> > I do agree that the caches approach, as implemented, will inevitably
> > end up with a pile of unused objects at the end that need to be freed.
> > I'd be happy to take a look and see if there's any way to reduce the
> > amount of unused objects at the end with a bit smarter top-up logic.
>
> It's not just the extra objects, it's the overall complexity that bothers me.
> Complexity isn't really the correct word, it's more that as written, the logic
> is spread over several files and is disingenuous from the perspective that the
> split_cache is in kvm->arch, which implies persistence, but the caches are
> completely torn down after every memslot split.
>
> I suspect part of the problem is that the code is trying to plan for a future
> where nested MMUs also support splitting large pages.  Usually I'm all for that
> sort of thing, but in this case it creates a lot of APIs that should not exist,
> either because the function is not needed at all, or because it's a helper buried
> in tdp_mmu.c.  E.g. assert_split_caches_invariants() is overkill.
>
> That's solvable by refactoring and shuffling code, but using kvm_mmu_memory_cache
> still feels wrong.  The caches don't fully solve the might_sleep() problem since
> the loop still has to drop mmu_lock purely because it needs to allocate memory,

I thought dropping the lock to allocate memory was a good thing. It
reduces the length of time we hold the RCU read lock and mmu_lock in
read mode. Plus it avoids the retry-with-reclaim and lets us reuse the
existing sp allocation code.

Eager page splitting itself does not need to be that performant since
it's not on the critical path of vCPU execution. But holding the MMU
lock can negatively affect vCPU performance.

But your preference is to allocate without dropping the lock when possible. Why?

> and at the same time the caches are too aggressive because we can theoretically get
> false positives on OOM scenarios, e.g. a topup could fail when trying to allocate
> 25 objects, when only 1 is needed.

This is why I picked a min of 1 for the cache top-up. But this would
be true if we increased the min beyond 1.

> We could enhance the cache code, which is
> pretty rudimentary, but it still feels forced.
>
> One thing we can take advantage of is that remote TLB flushes can be deferred
> until after all roots are done, and don't need to be serviced if mmu_lock is
> dropped.

Good point. I'll revise the TLB flushing in v1 regardless.


> Changes from a hugepage to a collection of smaller pages is atomic, no
> memory is freed, and there are no changes in gfn=>pfn made by the split.  If
> something else comes along and modifies the newly created sp or its children,
> then it will flush accordingly.  Similar to write-protecting the page, the only
> requirement is that all vCPUs see the small pages before the ioctl() returns,
> i.e. before userspace can query the dirty log.  Never needing to flush is one
> less reason to use a variant of tdp_mmu_iter_cond_resched().
>
> So, what if we do something like this?  Try to allocate on-demand without dropping
> mmu_lock.  In the happy case, it will succeed and there's no need to drop mmu_lock.
> If allocation fails, drop RCU and mmu_lock and retry with direct reclaim allowed.
>
> Some ugly gotos to reduce indentation, there's probably a better way to dress
> this up.  Comments obviously needed.  This also doesn't track whether or not a
> flush is needed, that will sadly need to be an in/out param, assuming we want to
> return success/failure.
>
> static struct kvm_mmu_page *tdp_mmu_alloc_sp(gfp_t allow_direct_reclaim)
> {
>         gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | allow_direct_reclaim;
>         struct kvm_mmu_page *sp;
>         u64 *spt;
>
>         spt = (void *)__get_free_page(gfp);
>         if (!spt)
>                 return NULL;
>
>         sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
>         if (!sp) {
>                 free_page((unsigned long)spt);
>                 return NULL;
>         }
>
>         sp->spt = spt;
>
>         return sp;
> }
>
> static int tdp_mmu_split_large_pages(struct kvm *kvm, struct kvm_mmu_page *root,
>                                      gfn_t start, gfn_t end, int target_level)
> {
>         struct kvm_mmu_page *sp = NULL;
>         struct tdp_iter iter;
>
>         rcu_read_lock();
>
>         for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
> retry:
>                 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
>                         continue;
>
>                 if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
>                         continue;
>
>                 if (likely(sp))
>                         goto do_split;
>
>                 sp = tdp_mmu_alloc_sp(0);
>                 if (!sp) {
>                         rcu_read_unlock();
>                         read_unlock(&kvm->mmu_lock);
>
>                         sp = tdp_mmu_alloc_sp(__GFP_DIRECT_RECLAIM);
>
>                         read_lock(&kvm->mmu_lock);
>
>                         if (!sp)
>                                 return -ENOMEM;
>
>                         rcu_read_lock();
>                         tdp_iter_restart(iter);
>                         continue;
>                 }
>
> do_split:
>                 init_tdp_mmu_page(sp, iter->gfn, get_child_page_role(&iter));
>
>                 if (!tdp_mmu_split_large_page(kvm, &iter, sp))
>                         goto retry;
>
>                 sp = NULL;
>         }
>
>         rcu_read_unlock();
>
>         return 0;
> }
>
Sean Christopherson Dec. 2, 2021, 6:42 p.m. UTC | #14
On Thu, Dec 02, 2021, David Matlack wrote:
> On Wed, Dec 1, 2021 at 3:37 PM Sean Christopherson <seanjc@google.com> wrote:
> > It's not just the extra objects, it's the overall complexity that bothers me.
> > Complexity isn't really the correct word, it's more that as written, the logic
> > is spread over several files and is disingenuous from the perspective that the
> > split_cache is in kvm->arch, which implies persistence, but the caches are
> > completely torn down after every memslot split.
> >
> > I suspect part of the problem is that the code is trying to plan for a future
> > where nested MMUs also support splitting large pages.  Usually I'm all for that
> > sort of thing, but in this case it creates a lot of APIs that should not exist,
> > either because the function is not needed at all, or because it's a helper buried
> > in tdp_mmu.c.  E.g. assert_split_caches_invariants() is overkill.
> >
> > That's solvable by refactoring and shuffling code, but using kvm_mmu_memory_cache
> > still feels wrong.  The caches don't fully solve the might_sleep() problem since
> > the loop still has to drop mmu_lock purely because it needs to allocate memory,
> 
> I thought dropping the lock to allocate memory was a good thing. It
> reduces the length of time we hold the RCU read lock and mmu_lock in
> read mode. Plus it avoids the retry-with-reclaim and lets us reuse the
> existing sp allocation code.

It's not a simple reuse though, e.g. it needs new logic to detect when the caches
are empty, requires a variant of tdp_mmu_iter_cond_resched(), needs its own instance
of caches and thus initialization/destruction of the caches, etc...

> Eager page splitting itself does not need to be that performant since
> it's not on the critical path of vCPU execution. But holding the MMU
> lock can negatively affect vCPU performance.
> 
> But your preference is to allocate without dropping the lock when possible. Why?

Because they're two different things.  Lock contention is already handled by
tdp_mmu_iter_cond_resched().  If mmu_lock is not contended, holding it for a long
duration is a complete non-issue.

Dropping mmu_lock means restarting the walk at the root because a different task
may have zapped/changed upper level entries.  If every allocation is dropping
mmu_lock, that adds up to a lot of extra memory accesses, especially when using
5-level paging.

Batching allocations via mmu_caches mostly works around that problem, but IMO
it's more complex overall than the retry-on-failure approach because it bleeds
core details into several locations, e.g. the split logic needs to know intimate
details of kvm_mmu_memory_cache, and we end up with two (or one complex) versions
of tdp_mmu_iter_cond_resched().

In general, I also dislike relying on magic numbers (the capacity of the cache)
for performance.  At best, we have to justify the magic number, now and in the
future.  At worst, someone will have a use case that doesn't play nice with KVM's
chosen magic number and then we have to do more tuning, e.g. see the PTE prefetch
stuff where the magic number of '8' (well, 7) ran out of gas for modern usage.
I don't actually think tuning will be problematic for this case, but I'd rather
avoid the discussion entirely if possible.

I'm not completely opposed to using kvm_mmu_memory_cache to batch allocations,
but I think we should do so if and only if batching has measurably better
performance for things we care about.  E.g. if eager splitting takes n% longer
under heavy memory pressure, but vCPUs aren't impacted, do we care?
David Matlack Dec. 3, 2021, midnight UTC | #15
On Thu, Dec 2, 2021 at 10:43 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Thu, Dec 02, 2021, David Matlack wrote:
> > On Wed, Dec 1, 2021 at 3:37 PM Sean Christopherson <seanjc@google.com> wrote:
> > > It's not just the extra objects, it's the overall complexity that bothers me.
> > > Complexity isn't really the correct word, it's more that as written, the logic
> > > is spread over several files and is disingenuous from the perspective that the
> > > split_cache is in kvm->arch, which implies persistence, but the caches are
> > > completely torn down after every memslot split.
> > >kmem_cache_alloc
> > > I suspect part of the problem is that the code is trying to plan for a future
> > > where nested MMUs also support splitting large pages.  Usually I'm all for that
> > > sort of thing, but in this case it creates a lot of APIs that should not exist,
> > > either because the function is not needed at all, or because it's a helper buried
> > > in tdp_mmu.c.  E.g. assert_split_caches_invariants() is overkill.
> > >
> > > That's solvable by refactoring and shuffling code, but using kvm_mmu_memory_cache
> > > still feels wrong.  The caches don't fully solve the might_sleep() problem since
> > > the loop still has to drop mmu_lock purely because it needs to allocate memory,
> >
> > I thought dropping the lock to allocate memory was a good thing. It
> > reduces the length of time we hold the RCU read lock and mmu_lock in
> > read mode. Plus it avoids the retry-with-reclaim and lets us reuse the
> > existing sp allocation code.
>
> It's not a simple reuse though, e.g. it needs new logic to detect when the caches
> are empty, requires a variant of tdp_mmu_iter_cond_resched(), needs its own instance
> of caches and thus initialization/destruction of the caches, etc...
>
> > Eager page splitting itself does not need to be that performant since
> > it's not on the critical path of vCPU execution. But holding the MMU
> > lock can negatively affect vCPU performance.
> >
> > But your preference is to allocate without dropping the lock when possible. Why?
>
> Because they're two different things.  Lock contention is already handled by
> tdp_mmu_iter_cond_resched().  If mmu_lock is not contended, holding it for a long
> duration is a complete non-issue.

So I think you are positing that disabling reclaim will make the
allocations fast enough that the time between
tdp_mmu_iter_cond_resched checks will be acceptable. Is there really
no risk of long tail latency in kmem_cache_alloc() or
__get_free_page()? Even if it's rare, they will be common at scale.

This is why I'm being so hesitant, and prefer to avoid the problem
entirely by doing all allocations outside the lock. But I'm honestly
more than happy to be convinced otherwise and go with your approach.

>
> Dropping mmu_lock means restarting the walk at the root because a different task
> may have zapped/changed upper level entries.  If every allocation is dropping
> mmu_lock, that adds up to a lot of extra memory accesses, especially when using
> 5-level paging.
>
> Batching allocations via mmu_caches mostly works around that problem, but IMO
> it's more complex overall than the retry-on-failure approach because it bleeds
> core details into several locations, e.g. the split logic needs to know intimate
> details of kvm_mmu_memory_cache, and we end up with two (or one complex) versions
> of tdp_mmu_iter_cond_resched().
>
> In general, I also dislike relying on magic numbers (the capacity of the cache)
> for performance.  At best, we have to justify the magic number, now and in the
> future.  At worst, someone will have a use case that doesn't play nice with KVM's
> chosen magic number and then we have to do more tuning, e.g. see the PTE prefetch
> stuff where the magic number of '8' (well, 7) ran out of gas for modern usage.
> I don't actually think tuning will be problematic for this case, but I'd rather
> avoid the discussion entirely if possible.
>
> I'm not completely opposed to using kvm_mmu_memory_cache to batch allocations,
> but I think we should do so if and only if batching has measurably better
> performance for things we care about.  E.g. if eager splitting takes n% longer
> under heavy memory pressure, but vCPUs aren't impacted, do we care?
Sean Christopherson Dec. 3, 2021, 1:07 a.m. UTC | #16
On Thu, Dec 02, 2021, David Matlack wrote:
> On Thu, Dec 2, 2021 at 10:43 AM Sean Christopherson <seanjc@google.com> wrote:
> > Because they're two different things.  Lock contention is already handled by
> > tdp_mmu_iter_cond_resched().  If mmu_lock is not contended, holding it for a long
> > duration is a complete non-issue.
> 
> So I think you are positing that disabling reclaim will make the
> allocations fast enough that the time between
> tdp_mmu_iter_cond_resched checks will be acceptable.

Yep.

> Is there really no risk of long tail latency in kmem_cache_alloc() or
> __get_free_page()? Even if it's rare, they will be common at scale.

If there is a potentially long latency in __get_free_page(), then we're hosed no
matter what because per alloc_pages(), it's allowed in any context, including NMI,
IRQ, and Soft-IRQ.  I've no idea how often those contexts allocate, but I assume
it's not _that_ rare given the amount of stuff that networking does in Soft-IRQ
context, e.g. see the stack trace from commit 2620fe268e80, the use of PF_MEMALLOC,
the use of GFP_ATOMIC in napi_alloc_skb, etc...  And it's not just direct
allocations, e.g. anything that uses a radix tree or XArray will potentially
trigger allocation on insertion.

But I would be very, very surprised if alloc_pages() without GFP_DIRECT_RECLAIM
has a long tail latency, otherwise allocating from any atomic context would be
doomed.

> This is why I'm being so hesitant, and prefer to avoid the problem
> entirely by doing all allocations outside the lock. But I'm honestly
> more than happy to be convinced otherwise and go with your approach.
David Matlack Dec. 3, 2021, 5:22 p.m. UTC | #17
On Thu, Dec 2, 2021 at 5:07 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Thu, Dec 02, 2021, David Matlack wrote:
> > Is there really no risk of long tail latency in kmem_cache_alloc() or
> > __get_free_page()? Even if it's rare, they will be common at scale.
>
> If there is a potentially long latency in __get_free_page(), then we're hosed no
> matter what because per alloc_pages(), it's allowed in any context, including NMI,
> IRQ, and Soft-IRQ.  I've no idea how often those contexts allocate, but I assume
> it's not _that_ rare given the amount of stuff that networking does in Soft-IRQ
> context, e.g. see the stack trace from commit 2620fe268e80, the use of PF_MEMALLOC,
> the use of GFP_ATOMIC in napi_alloc_skb, etc...  And it's not just direct
> allocations, e.g. anything that uses a radix tree or XArray will potentially
> trigger allocation on insertion.
>
> But I would be very, very surprised if alloc_pages() without GFP_DIRECT_RECLAIM
> has a long tail latency, otherwise allocating from any atomic context would be
> doomed.

In that case I agree your approach should not introduce any more MMU
lock contention than the split_caches approach in practice, and will
require a lot less new code. I'll attempt to do some testing to
confirm, but assuming that goes fine I'll go with your approach in v1.

Thanks!
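
As background for the x86.c hunk at the end of the patch below
(kvm_mmu_slot_apply_flags()): the eager-split path runs when userspace enables
dirty logging on a memslot and manual-protect with KVM_DIRTY_LOG_INITIALLY_SET
is not in use. A minimal userspace sketch of the VM-ioctl that kicks this off,
with hypothetical fd/slot/address values:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/*
 * Hypothetical helper: enable dirty logging on an existing memslot. With this
 * series applied (and initially-all-set not enabled), this ioctl is what leads
 * to kvm_mmu_slot_try_split_large_pages() being called.
 */
static int enable_dirty_logging(int vm_fd, __u32 slot, __u64 gpa, __u64 size,
				void *hva)
{
	struct kvm_userspace_memory_region region = {
		.slot            = slot,
		.flags           = KVM_MEM_LOG_DIRTY_PAGES,
		.guest_phys_addr = gpa,
		.memory_size     = size,
		.userspace_addr  = (__u64)(unsigned long)hva,
	};

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}
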
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2a7564703ea6..432a4df817ec 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1232,6 +1232,9 @@  struct kvm_arch {
 	hpa_t	hv_root_tdp;
 	spinlock_t hv_root_tdp_lock;
 #endif
+
+	/* MMU caches used when splitting large pages during VM-ioctls. */
+	struct kvm_mmu_memory_caches split_caches;
 };
 
 struct kvm_vm_stat {
@@ -1588,6 +1591,9 @@  void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 				      const struct kvm_memory_slot *memslot,
 				      int start_level);
+void kvm_mmu_slot_try_split_large_pages(struct kvm *kvm,
+					const struct kvm_memory_slot *memslot,
+					int target_level);
 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 				   const struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 54f0d2228135..6768ef9c0891 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -738,6 +738,66 @@  static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
 					  PT64_ROOT_MAX_LEVEL);
 }
 
+static inline void assert_split_caches_invariants(struct kvm *kvm)
+{
+	/*
+	 * The split caches must only be modified while holding the slots_lock,
+	 * since it is only used during memslot VM-ioctls.
+	 */
+	lockdep_assert_held(&kvm->slots_lock);
+
+	/*
+	 * Only the TDP MMU supports large page splitting using
+	 * kvm->arch.split_caches, which is why we only have to allocate
+	 * page_header_cache and shadow_page_cache. Assert that the TDP
+	 * MMU is at least enabled when the split cache is allocated.
+	 */
+	BUG_ON(!is_tdp_mmu_enabled(kvm));
+}
+
+int mmu_topup_split_caches(struct kvm *kvm)
+{
+	struct kvm_mmu_memory_caches *split_caches = &kvm->arch.split_caches;
+	int r;
+
+	assert_split_caches_invariants(kvm);
+
+	r = kvm_mmu_topup_memory_cache(&split_caches->page_header_cache, 1);
+	if (r)
+		goto out;
+
+	r = kvm_mmu_topup_memory_cache(&split_caches->shadow_page_cache, 1);
+	if (r)
+		goto out;
+
+	return 0;
+
+out:
+	pr_warn("Failed to top-up split caches. Will not split large pages.\n");
+	return r;
+}
+
+static void mmu_free_split_caches(struct kvm *kvm)
+{
+	assert_split_caches_invariants(kvm);
+
+	kvm_mmu_free_memory_cache(&kvm->arch.split_caches.pte_list_desc_cache);
+	kvm_mmu_free_memory_cache(&kvm->arch.split_caches.shadow_page_cache);
+}
+
+bool mmu_split_caches_need_topup(struct kvm *kvm)
+{
+	assert_split_caches_invariants(kvm);
+
+	if (kvm->arch.split_caches.page_header_cache.nobjs == 0)
+		return true;
+
+	if (kvm->arch.split_caches.shadow_page_cache.nobjs == 0)
+		return true;
+
+	return false;
+}
+
 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu_memory_caches *mmu_caches;
@@ -5696,6 +5756,7 @@  void kvm_mmu_init_vm(struct kvm *kvm)
 
 	spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
 
+	mmu_init_memory_caches(&kvm->arch.split_caches);
 	kvm_mmu_init_tdp_mmu(kvm);
 
 	node->track_write = kvm_mmu_pte_write;
@@ -5819,6 +5880,28 @@  void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
 }
 
+void kvm_mmu_slot_try_split_large_pages(struct kvm *kvm,
+					const struct kvm_memory_slot *memslot,
+					int target_level)
+{
+	u64 start, end;
+
+	if (!is_tdp_mmu_enabled(kvm))
+		return;
+
+	if (mmu_topup_split_caches(kvm))
+		return;
+
+	start = memslot->base_gfn;
+	end = start + memslot->npages;
+
+	read_lock(&kvm->mmu_lock);
+	kvm_tdp_mmu_try_split_large_pages(kvm, memslot, start, end, target_level);
+	read_unlock(&kvm->mmu_lock);
+
+	mmu_free_split_caches(kvm);
+}
+
 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
 					 struct kvm_rmap_head *rmap_head,
 					 const struct kvm_memory_slot *slot)
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 52c6527b1a06..89b9b907c567 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -161,4 +161,7 @@  void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
 void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 
+int mmu_topup_split_caches(struct kvm *kvm);
+bool mmu_split_caches_need_topup(struct kvm *kvm);
+
 #endif /* __KVM_X86_MMU_INTERNAL_H */
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index df2cdb8bcf77..6bb9b597a854 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -191,6 +191,52 @@  bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	return wrprot;
 }
 
+static u64 mark_spte_executable(u64 spte)
+{
+	bool is_access_track = is_access_track_spte(spte);
+
+	if (is_access_track)
+		spte = restore_acc_track_spte(spte);
+
+	spte &= ~shadow_nx_mask;
+	spte |= shadow_x_mask;
+
+	if (is_access_track)
+		spte = mark_spte_for_access_track(spte);
+
+	return spte;
+}
+
+/*
+ * Construct an SPTE that maps a sub-page of the given large SPTE. This is
+ * used during large page splitting, to build the SPTEs that make up the new
+ * page table.
+ */
+u64 make_large_page_split_spte(u64 large_spte, int level, int index, unsigned int access)
+{
+	u64 child_spte;
+	int child_level;
+
+	BUG_ON(is_mmio_spte(large_spte));
+	BUG_ON(!is_large_present_pte(large_spte));
+
+	child_spte = large_spte;
+	child_level = level - 1;
+
+	child_spte += (index * KVM_PAGES_PER_HPAGE(child_level)) << PAGE_SHIFT;
+
+	if (child_level == PG_LEVEL_4K) {
+		child_spte &= ~PT_PAGE_SIZE_MASK;
+
+		/* Allow execution for 4K pages if it was disabled for NX HugePages. */
+		if (is_nx_huge_page_enabled() && access & ACC_EXEC_MASK)
+			child_spte = mark_spte_executable(child_spte);
+	}
+
+	return child_spte;
+}
+
+
 u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
 {
 	u64 spte = SPTE_MMU_PRESENT_MASK;
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 3e4943ee5a01..4efb4837e38d 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -339,6 +339,7 @@  bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	       unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
 	       u64 old_spte, bool prefetch, bool can_unsync,
 	       bool host_writable, u64 *new_spte);
+u64 make_large_page_split_spte(u64 large_spte, int level, int index, unsigned int access);
 u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
 u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access);
 u64 mark_spte_for_access_track(u64 spte);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 5ca0fa659245..366857b9fb3b 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -695,6 +695,39 @@  static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
 	return false;
 }
 
+static inline bool
+tdp_mmu_need_split_caches_topup_or_resched(struct kvm *kvm, struct tdp_iter *iter)
+{
+	if (mmu_split_caches_need_topup(kvm))
+		return true;
+
+	return tdp_mmu_iter_need_resched(kvm, iter);
+}
+
+static inline int
+tdp_mmu_topup_split_caches_resched(struct kvm *kvm, struct tdp_iter *iter, bool flush)
+{
+	int r;
+
+	rcu_read_unlock();
+
+	if (flush)
+		kvm_flush_remote_tlbs(kvm);
+
+	read_unlock(&kvm->mmu_lock);
+
+	cond_resched();
+	r = mmu_topup_split_caches(kvm);
+
+	read_lock(&kvm->mmu_lock);
+
+	rcu_read_lock();
+	WARN_ON(iter->gfn > iter->next_last_level_gfn);
+	tdp_iter_restart(iter);
+
+	return r;
+}
+
 /*
  * Tears down the mappings for the range of gfns, [start, end), and frees the
  * non-root pages mapping GFNs strictly within that range. Returns true if
@@ -1241,6 +1274,96 @@  bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
 	return spte_set;
 }
 
+static bool tdp_mmu_split_large_page_atomic(struct kvm *kvm, struct tdp_iter *iter)
+{
+	const u64 large_spte = iter->old_spte;
+	const int level = iter->level;
+	struct kvm_mmu_page *child_sp;
+	u64 child_spte;
+	int i;
+
+	BUG_ON(mmu_split_caches_need_topup(kvm));
+
+	child_sp = alloc_child_tdp_mmu_page(&kvm->arch.split_caches, iter);
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
+		child_spte = make_large_page_split_spte(large_spte, level, i, ACC_ALL);
+
+		/*
+		 * No need for atomics since child_sp has not been installed
+		 * in the table yet and thus is not reachable by any other
+		 * thread.
+		 */
+		child_sp->spt[i] = child_spte;
+	}
+
+	return tdp_mmu_install_sp_atomic(kvm, iter, child_sp, false);
+}
+
+static void tdp_mmu_split_large_pages_root(struct kvm *kvm, struct kvm_mmu_page *root,
+					   gfn_t start, gfn_t end, int target_level)
+{
+	struct tdp_iter iter;
+	bool flush = false;
+	int r;
+
+	rcu_read_lock();
+
+	/*
+	 * Traverse the page table splitting all large pages above the target
+	 * level into one lower level. For example, if we encounter a 1GB page
+	 * we split it into 512 2MB pages.
+	 *
+	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
+	 * to visit an SPTE before ever visiting its children, which means we
+	 * will correctly recursively split large pages that are more than one
+	 * level above the target level (e.g. splitting 1GB to 2MB to 4KB).
+	 */
+	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
+retry:
+		if (tdp_mmu_need_split_caches_topup_or_resched(kvm, &iter)) {
+			r = tdp_mmu_topup_split_caches_resched(kvm, &iter, flush);
+			flush = false;
+
+			/*
+			 * If topping up the split caches failed, we can't split
+			 * any more pages. Bail out of the loop.
+			 */
+			if (r)
+				break;
+
+			continue;
+		}
+
+		if (!is_large_present_pte(iter.old_spte))
+			continue;
+
+		if (!tdp_mmu_split_large_page_atomic(kvm, &iter))
+			goto retry;
+
+		flush = true;
+	}
+
+	rcu_read_unlock();
+
+	if (flush)
+		kvm_flush_remote_tlbs(kvm);
+}
+
+void kvm_tdp_mmu_try_split_large_pages(struct kvm *kvm,
+				       const struct kvm_memory_slot *slot,
+				       gfn_t start, gfn_t end,
+				       int target_level)
+{
+	struct kvm_mmu_page *root;
+
+	lockdep_assert_held_read(&kvm->mmu_lock);
+
+	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
+		tdp_mmu_split_large_pages_root(kvm, root, start, end, target_level);
+
+}
+
 /*
  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 476b133544dd..7812087836b2 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -72,6 +72,11 @@  bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
 				   struct kvm_memory_slot *slot, gfn_t gfn,
 				   int min_level);
 
+void kvm_tdp_mmu_try_split_large_pages(struct kvm *kvm,
+				       const struct kvm_memory_slot *slot,
+				       gfn_t start, gfn_t end,
+				       int target_level);
+
 static inline void kvm_tdp_mmu_walk_lockless_begin(void)
 {
 	rcu_read_lock();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 04e8dabc187d..4702ebfd394b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -11735,6 +11735,12 @@  static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 		if (kvm_dirty_log_manual_protect_and_init_set(kvm))
 			return;
 
+		/*
+		 * Attempt to split all large pages into 4K pages so that vCPUs
+		 * do not have to take write-protection faults.
+		 */
+		kvm_mmu_slot_try_split_large_pages(kvm, new, PG_LEVEL_4K);
+
 		if (kvm_x86_ops.cpu_dirty_log_size) {
 			kvm_mmu_slot_leaf_clear_dirty(kvm, new);
 			kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M);