
[04/22] kvm: mmu: Allocate and free TDP MMU roots

Message ID 20200925212302.3979661-5-bgardon@google.com (mailing list archive)
State New, archived
Series: Introduce the TDP MMU

Commit Message

Ben Gardon Sept. 25, 2020, 9:22 p.m. UTC
The TDP MMU must be able to allocate paging structure root pages and track
the usage of those pages. Implement a root page allocation system that is
similar to, but separate from, that of the x86 shadow paging
implementation. When
future patches add synchronization model changes to allow for parallel
page faults, these pages will need to be handled differently from the
x86 shadow paging based MMU's root pages.

Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell
machine. This series introduced no new failures.

This series can be viewed in Gerrit at:
	https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538

Signed-off-by: Ben Gardon <bgardon@google.com>
---
 arch/x86/include/asm/kvm_host.h |   1 +
 arch/x86/kvm/mmu/mmu.c          |  27 +++---
 arch/x86/kvm/mmu/mmu_internal.h |   9 ++
 arch/x86/kvm/mmu/tdp_mmu.c      | 157 ++++++++++++++++++++++++++++++++
 arch/x86/kvm/mmu/tdp_mmu.h      |   5 +
 5 files changed, 188 insertions(+), 11 deletions(-)

Comments

Sean Christopherson Sept. 30, 2020, 6:06 a.m. UTC | #1
On Fri, Sep 25, 2020 at 02:22:44PM -0700, Ben Gardon wrote:
  static u64 __read_mostly shadow_nx_mask;
> @@ -3597,10 +3592,14 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
>  	if (!VALID_PAGE(*root_hpa))
>  		return;
>  
> -	sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
> -	--sp->root_count;
> -	if (!sp->root_count && sp->role.invalid)
> -		kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
> +	if (is_tdp_mmu_root(kvm, *root_hpa)) {
> +		kvm_tdp_mmu_put_root_hpa(kvm, *root_hpa);
> +	} else {
> +		sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
> +		--sp->root_count;
> +		if (!sp->root_count && sp->role.invalid)
> +			kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);

Hmm, I see that future patches use put_tdp_mmu_root()/get_tdp_mmu_root(),
but the code itself isn't specific to the TDP MMU.  Even if this ends up
being the only non-TDP user of get/put, I think it'd be worth making them
common helpers, e.g.

	sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
	if (mmu_put_root(sp)) {
		if (is_tdp_mmu(...))
			kvm_tdp_mmu_free_root(kvm, sp);
		else if (sp->role.invalid)
			kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
	}
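
A fuller sketch of that refactor, assuming hypothetical common mmu_get_root()
and mmu_put_root() helpers operating on kvm_mmu_page.root_count (illustrative
names, not existing KVM functions; callers hold mmu_lock), might be:

	/* Returns true when the last reference to the root is dropped. */
	static bool mmu_put_root(struct kvm_mmu_page *sp)
	{
		return !--sp->root_count;
	}

	static void mmu_get_root(struct kvm_mmu_page *sp)
	{
		WARN_ON(!sp->root_count);

		sp->root_count++;
	}

mmu_free_root_page() would then do a single put and only branch on TDP vs.
legacy MMU for the actual freeing, as in the snippet above.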

> +	}
>  
>  	*root_hpa = INVALID_PAGE;
>  }
> @@ -3691,7 +3690,13 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
>  	unsigned i;
>  
>  	if (shadow_root_level >= PT64_ROOT_4LEVEL) {
> -		root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
> +		if (vcpu->kvm->arch.tdp_mmu_enabled) {

I believe this will break 32-bit NPT.  Or at a minimum, look weird.  It'd
be better to explicitly disable the TDP MMU on 32-bit KVM, then this becomes

	if (vcpu->kvm->arch.tdp_mmu_enabled) {

	} else if (shadow_root_level >= PT64_ROOT_4LEVEL) {

	} else {

	}
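
Filled in with the calls from this patch (purely a sketch; each branch does
its own VALID_PAGE() check and root_hpa assignment, and the last branch keeps
the existing PAE root handling), the top of mmu_alloc_direct_roots() might
then look roughly like:

	if (vcpu->kvm->arch.tdp_mmu_enabled) {
		root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
		if (!VALID_PAGE(root))
			return -ENOSPC;
		vcpu->arch.mmu->root_hpa = root;
	} else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
		root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
		if (!VALID_PAGE(root))
			return -ENOSPC;
		vcpu->arch.mmu->root_hpa = root;
	} else {
		/* Existing per-vCPU PAE root allocation, unchanged. */
	}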

> +			root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
> +		} else {
> +			root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level,
> +					      true);
> +		}

May not matter in the end, but the braces aren't needed.

> +
>  		if (!VALID_PAGE(root))
>  			return -ENOSPC;
>  		vcpu->arch.mmu->root_hpa = root;
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index 65bb110847858..530b7d893c7b3 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -41,8 +41,12 @@ struct kvm_mmu_page {
>  
>  	/* Number of writes since the last time traversal visited this page.  */
>  	atomic_t write_flooding_count;
> +
> +	bool tdp_mmu_page;
>  };
>  
> +extern struct kmem_cache *mmu_page_header_cache;
> +
>  static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
>  {
>  	struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
> @@ -69,6 +73,11 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
>  	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
>  #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
>  
> +#define ACC_EXEC_MASK    1
> +#define ACC_WRITE_MASK   PT_WRITABLE_MASK
> +#define ACC_USER_MASK    PT_USER_MASK
> +#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
> +
>  /* Functions for interpreting SPTEs */
>  kvm_pfn_t spte_to_pfn(u64 pte);
>  bool is_mmio_spte(u64 spte);
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 8241e18c111e6..cdca829e42040 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1,5 +1,7 @@
>  /* SPDX-License-Identifier: GPL-2.0 */
>  
> +#include "mmu.h"
> +#include "mmu_internal.h"
>  #include "tdp_mmu.h"
>  
>  static bool __read_mostly tdp_mmu_enabled = true;
> @@ -25,10 +27,165 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
>  
>  	/* This should not be changed for the lifetime of the VM. */
>  	kvm->arch.tdp_mmu_enabled = true;
> +
> +	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
>  }
>  
>  void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
>  {
>  	if (!kvm->arch.tdp_mmu_enabled)
>  		return;
> +
> +	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
> +}
> +
> +#define for_each_tdp_mmu_root(_kvm, _root)			    \
> +	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
> +
> +bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
> +{
> +	struct kvm_mmu_page *root;
> +
> +	if (!kvm->arch.tdp_mmu_enabled)
> +		return false;
> +
> +	root = to_shadow_page(hpa);
> +
> +	if (WARN_ON(!root))
> +		return false;
> +
> +	return root->tdp_mmu_page;

Why all the extra checks?
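
If callers are guaranteed to pass a valid root HPA, this could presumably be
reduced to something like the sketch below; whether the WARNs are worth
keeping depends on what the callers actually guarantee:

	bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
	{
		return kvm->arch.tdp_mmu_enabled &&
		       to_shadow_page(hpa)->tdp_mmu_page;
	}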

> +}
> +
> +static void free_tdp_mmu_root(struct kvm *kvm, struct kvm_mmu_page *root)
> +{
> +	lockdep_assert_held(&kvm->mmu_lock);
> +
> +	WARN_ON(root->root_count);
> +	WARN_ON(!root->tdp_mmu_page);
> +
> +	list_del(&root->link);
> +
> +	free_page((unsigned long)root->spt);
> +	kmem_cache_free(mmu_page_header_cache, root);
> +}
> +
> +static void put_tdp_mmu_root(struct kvm *kvm, struct kvm_mmu_page *root)
> +{
> +	lockdep_assert_held(&kvm->mmu_lock);
> +
> +	root->root_count--;
> +	if (!root->root_count)
> +		free_tdp_mmu_root(kvm, root);
> +}
> +
> +static void get_tdp_mmu_root(struct kvm *kvm, struct kvm_mmu_page *root)
> +{
> +	lockdep_assert_held(&kvm->mmu_lock);
> +	WARN_ON(!root->root_count);
> +
> +	root->root_count++;
> +}
> +
> +void kvm_tdp_mmu_put_root_hpa(struct kvm *kvm, hpa_t root_hpa)
> +{
> +	struct kvm_mmu_page *root;
> +
> +	root = to_shadow_page(root_hpa);
> +
> +	if (WARN_ON(!root))
> +		return;
> +
> +	put_tdp_mmu_root(kvm, root);
> +}
> +
> +static struct kvm_mmu_page *find_tdp_mmu_root_with_role(
> +		struct kvm *kvm, union kvm_mmu_page_role role)
> +{
> +	struct kvm_mmu_page *root;
> +
> +	lockdep_assert_held(&kvm->mmu_lock);
> +	for_each_tdp_mmu_root(kvm, root) {
> +		WARN_ON(!root->root_count);
> +
> +		if (root->role.word == role.word)
> +			return root;
> +	}
> +
> +	return NULL;
> +}
> +
> +static struct kvm_mmu_page *alloc_tdp_mmu_root(struct kvm_vcpu *vcpu,
> +					       union kvm_mmu_page_role role)
> +{
> +	struct kvm_mmu_page *new_root;
> +	struct kvm_mmu_page *root;
> +
> +	new_root = kvm_mmu_memory_cache_alloc(
> +			&vcpu->arch.mmu_page_header_cache);
> +	new_root->spt = kvm_mmu_memory_cache_alloc(
> +			&vcpu->arch.mmu_shadow_page_cache);
> +	set_page_private(virt_to_page(new_root->spt), (unsigned long)new_root);
> +
> +	new_root->role.word = role.word;
> +	new_root->root_count = 1;
> +	new_root->gfn = 0;
> +	new_root->tdp_mmu_page = true;
> +
> +	spin_lock(&vcpu->kvm->mmu_lock);
> +
> +	/* Check that no matching root exists before adding this one. */
> +	root = find_tdp_mmu_root_with_role(vcpu->kvm, role);
> +	if (root) {
> +		get_tdp_mmu_root(vcpu->kvm, root);
> +		spin_unlock(&vcpu->kvm->mmu_lock);

Hrm, I'm not a big fan of dropping locks in the middle of functions, but the
alternatives aren't great.  :-/  Best I can come up with is

	if (root)
		get_tdp_mmu_root()
	else
		list_add();

	spin_unlock();

	if (root) {
		free_page()
		kmem_cache_free()
	} else {
		root = new_root;
	}

	return root;

Not sure that's any better.
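
Spelled out with the calls used in this patch (a sketch of the above, no
functional change intended), that alternative would be:

	spin_lock(&vcpu->kvm->mmu_lock);

	/* Check that no matching root exists before adding this one. */
	root = find_tdp_mmu_root_with_role(vcpu->kvm, role);
	if (root)
		get_tdp_mmu_root(vcpu->kvm, root);
	else
		list_add(&new_root->link, &vcpu->kvm->arch.tdp_mmu_roots);

	spin_unlock(&vcpu->kvm->mmu_lock);

	if (root) {
		free_page((unsigned long)new_root->spt);
		kmem_cache_free(mmu_page_header_cache, new_root);
	} else {
		root = new_root;
	}

	return root;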

> +		free_page((unsigned long)new_root->spt);
> +		kmem_cache_free(mmu_page_header_cache, new_root);
> +		return root;
> +	}
> +
> +	list_add(&new_root->link, &vcpu->kvm->arch.tdp_mmu_roots);
> +	spin_unlock(&vcpu->kvm->mmu_lock);
> +
> +	return new_root;
> +}
> +
> +static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_mmu_page *root;
> +	union kvm_mmu_page_role role;
> +
> +	role = vcpu->arch.mmu->mmu_role.base;
> +	role.level = vcpu->arch.mmu->shadow_root_level;
> +	role.direct = true;
> +	role.gpte_is_8_bytes = true;
> +	role.access = ACC_ALL;
> +
> +	spin_lock(&vcpu->kvm->mmu_lock);
> +
> +	/* Search for an already allocated root with the same role. */
> +	root = find_tdp_mmu_root_with_role(vcpu->kvm, role);
> +	if (root) {
> +		get_tdp_mmu_root(vcpu->kvm, root);
> +		spin_unlock(&vcpu->kvm->mmu_lock);

Rather than manually unlock and return, this can be

	if (root)
		get_tdp_mmu_root();

	spin_unlock()

	if (!root)
		root = alloc_tdp_mmu_root();

	return root;

You could also add a helper to do the "get" along with the "find".  Not sure
if that's worth the code.
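
Such a combined helper (hypothetical, built only from functions already in
this patch) could look like:

	static struct kvm_mmu_page *get_tdp_mmu_root_with_role(struct kvm *kvm,
			union kvm_mmu_page_role role)
	{
		struct kvm_mmu_page *root;

		lockdep_assert_held(&kvm->mmu_lock);

		root = find_tdp_mmu_root_with_role(kvm, role);
		if (root)
			get_tdp_mmu_root(kvm, root);

		return root;
	}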
	
> +		return root;
> +	}
> +
> +	spin_unlock(&vcpu->kvm->mmu_lock);
> +
> +	/* If there is no appropriate root, allocate one. */
> +	root = alloc_tdp_mmu_root(vcpu, role);
> +
> +	return root;
> +}
> +
> +hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_mmu_page *root;
> +
> +	root = get_tdp_mmu_vcpu_root(vcpu);
> +	if (!root)
> +		return INVALID_PAGE;
> +
> +	return __pa(root->spt);
>  }
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
> index dd3764f5a9aa3..9274debffeaa1 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.h
> +++ b/arch/x86/kvm/mmu/tdp_mmu.h
> @@ -7,4 +7,9 @@
>  
>  void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
>  void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
> +
> +bool is_tdp_mmu_root(struct kvm *kvm, hpa_t root);
> +hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
> +void kvm_tdp_mmu_put_root_hpa(struct kvm *kvm, hpa_t root_hpa);
> +
>  #endif /* __KVM_X86_MMU_TDP_MMU_H */
> -- 
> 2.28.0.709.gb0816b6eb0-goog
>
Paolo Bonzini Sept. 30, 2020, 6:26 a.m. UTC | #2
On 30/09/20 08:06, Sean Christopherson wrote:
>> +static struct kvm_mmu_page *alloc_tdp_mmu_root(struct kvm_vcpu *vcpu,
>> +					       union kvm_mmu_page_role role)
>> +{
>> +	struct kvm_mmu_page *new_root;
>> +	struct kvm_mmu_page *root;
>> +
>> +	new_root = kvm_mmu_memory_cache_alloc(
>> +			&vcpu->arch.mmu_page_header_cache);
>> +	new_root->spt = kvm_mmu_memory_cache_alloc(
>> +			&vcpu->arch.mmu_shadow_page_cache);
>> +	set_page_private(virt_to_page(new_root->spt), (unsigned long)new_root);
>> +
>> +	new_root->role.word = role.word;
>> +	new_root->root_count = 1;
>> +	new_root->gfn = 0;
>> +	new_root->tdp_mmu_page = true;
>> +
>> +	spin_lock(&vcpu->kvm->mmu_lock);
>> +
>> +	/* Check that no matching root exists before adding this one. */
>> +	root = find_tdp_mmu_root_with_role(vcpu->kvm, role);
>> +	if (root) {
>> +		get_tdp_mmu_root(vcpu->kvm, root);
>> +		spin_unlock(&vcpu->kvm->mmu_lock);
> Hrm, I'm not a big fan of dropping locks in the middle of functions, but the
> alternatives aren't great.  :-/  Best I can come up with is
> 
> 	if (root)
> 		get_tdp_mmu_root()
> 	else
> 		list_add();
> 
> 	spin_unlock();
> 
> 	if (root) {
> 		free_page()
> 		kmem_cache_free()
> 	} else {
> 		root = new_root;
> 	}
> 
> 	return root;
> 
> Not sure that's any better.
> 
>> +		free_page((unsigned long)new_root->spt);
>> +		kmem_cache_free(mmu_page_header_cache, new_root);
>> +		return root;
>> +	}
>> +
>> +	list_add(&new_root->link, &vcpu->kvm->arch.tdp_mmu_roots);
>> +	spin_unlock(&vcpu->kvm->mmu_lock);
>> +
>> +	return new_root;
>> +}
>> +
>> +static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
>> +{
>> +	struct kvm_mmu_page *root;
>> +	union kvm_mmu_page_role role;
>> +
>> +	role = vcpu->arch.mmu->mmu_role.base;
>> +	role.level = vcpu->arch.mmu->shadow_root_level;
>> +	role.direct = true;
>> +	role.gpte_is_8_bytes = true;
>> +	role.access = ACC_ALL;
>> +
>> +	spin_lock(&vcpu->kvm->mmu_lock);
>> +
>> +	/* Search for an already allocated root with the same role. */
>> +	root = find_tdp_mmu_root_with_role(vcpu->kvm, role);
>> +	if (root) {
>> +		get_tdp_mmu_root(vcpu->kvm, root);
>> +		spin_unlock(&vcpu->kvm->mmu_lock);
> Rather than manually unlock and return, this can be
> 
> 	if (root)
> 		get_tdp_mmu_root();
> 
> 	spin_unlock()
> 
> 	if (!root)
> 		root = alloc_tdp_mmu_root();
> 
> 	return root;
> 
> You could also add a helper to do the "get" along with the "find".  Not sure
> if that's worth the code.

All in all I don't think it's any clearer than Ben's code.  At least in
his case the "if"s clearly point at the double-checked locking pattern.

Paolo
Sean Christopherson Sept. 30, 2020, 3:38 p.m. UTC | #3
On Wed, Sep 30, 2020 at 08:26:28AM +0200, Paolo Bonzini wrote:
> On 30/09/20 08:06, Sean Christopherson wrote:
> >> +static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
> >> +{
> >> +	struct kvm_mmu_page *root;
> >> +	union kvm_mmu_page_role role;
> >> +
> >> +	role = vcpu->arch.mmu->mmu_role.base;
> >> +	role.level = vcpu->arch.mmu->shadow_root_level;
> >> +	role.direct = true;
> >> +	role.gpte_is_8_bytes = true;
> >> +	role.access = ACC_ALL;
> >> +
> >> +	spin_lock(&vcpu->kvm->mmu_lock);
> >> +
> >> +	/* Search for an already allocated root with the same role. */
> >> +	root = find_tdp_mmu_root_with_role(vcpu->kvm, role);
> >> +	if (root) {
> >> +		get_tdp_mmu_root(vcpu->kvm, root);
> >> +		spin_unlock(&vcpu->kvm->mmu_lock);
> > Rather than manually unlock and return, this can be
> > 
> > 	if (root)
> > 		get_tdp_mmu_root();
> > 
> > 	spin_unlock()
> > 
> > 	if (!root)
> > 		root = alloc_tdp_mmu_root();
> > 
> > 	return root;
> > 
> > You could also add a helper to do the "get" along with the "find".  Not sure
> > if that's worth the code.
> 
> All in all I don't think it's any clearer than Ben's code.  At least in
> his case the "if"s clearly point at the double-checked locking pattern.

Actually, why is this even dropping the lock to do the alloc?  The allocs are
coming from the caches, which are designed to be invoked while holding the
spin lock.

Also relevant is that, other than this code, the only user of
find_tdp_mmu_root_with_role() is kvm_tdp_mmu_root_hpa_for_role(), and that
helper is itself unused.  I.e. the "find" can be open coded.

Putting those two together yields this, which IMO is much cleaner.

static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);

	spin_lock(&kvm->mmu_lock);

	/* Check for an existing root before allocating a new one. */
	for_each_tdp_mmu_root(kvm, root) {
		if (root->role.word == role.word) {
			get_tdp_mmu_root(root);
			spin_unlock(&kvm->mmu_lock);
			return root;
		}
	}

	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
	root->root_count = 1;

	list_add(&root->link, &kvm->arch.tdp_mmu_roots);

	spin_unlock(&kvm->mmu_lock);

	return root;
}
Ben Gardon Oct. 12, 2020, 10:59 p.m. UTC | #4
On Tue, Sep 29, 2020 at 11:06 PM Sean Christopherson
<sean.j.christopherson@intel.com> wrote:
>
> On Fri, Sep 25, 2020 at 02:22:44PM -0700, Ben Gardon wrote:
>   static u64 __read_mostly shadow_nx_mask;
> > @@ -3597,10 +3592,14 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
> >       if (!VALID_PAGE(*root_hpa))
> >               return;
> >
> > -     sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
> > -     --sp->root_count;
> > -     if (!sp->root_count && sp->role.invalid)
> > -             kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
> > +     if (is_tdp_mmu_root(kvm, *root_hpa)) {
> > +             kvm_tdp_mmu_put_root_hpa(kvm, *root_hpa);
> > +     } else {
> > +             sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
> > +             --sp->root_count;
> > +             if (!sp->root_count && sp->role.invalid)
> > +                     kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
>
> Hmm, I see that future patches use put_tdp_mmu_root()/get_tdp_mmu_root(),
> but the code itself isn't specific to the TDP MMU.  Even if this ends up
> being the only non-TDP user of get/put, I think it'd be worth making them
> common helpers, e.g.
>
>         sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
>         if (mmu_put_root(sp)) {
>                 if (is_tdp_mmu(...))
>                         kvm_tdp_mmu_free_root(kvm, sp);
>                 else if (sp->role.invalid)
>                         kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
>         }
>
> > +     }
> >
> >       *root_hpa = INVALID_PAGE;
> >  }
> > @@ -3691,7 +3690,13 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
> >       unsigned i;
> >
> >       if (shadow_root_level >= PT64_ROOT_4LEVEL) {
> > -             root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
> > +             if (vcpu->kvm->arch.tdp_mmu_enabled) {
>
> I believe this will break 32-bit NPT.  Or at a minimum, look weird.  It'd
> be better to explicitly disable the TDP MMU on 32-bit KVM, then this becomes
>
>         if (vcpu->kvm->arch.tdp_mmu_enabled) {
>
>         } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
>
>         } else {
>
>         }
>

How does this break 32-bit NPT? I'm not sure I understand how we would
get into a bad state here because I'm not familiar with the specifics
of 32-bit NPT.

> > +                     root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
> > +             } else {
> > +                     root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level,
> > +                                           true);
> > +             }
>
> May not matter in the end, but the braces aren't needed.
>
> > +
> >               if (!VALID_PAGE(root))
> >                       return -ENOSPC;
> >               vcpu->arch.mmu->root_hpa = root;
> > diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> > index 65bb110847858..530b7d893c7b3 100644
> > --- a/arch/x86/kvm/mmu/mmu_internal.h
> > +++ b/arch/x86/kvm/mmu/mmu_internal.h
> > @@ -41,8 +41,12 @@ struct kvm_mmu_page {
> >
> >       /* Number of writes since the last time traversal visited this page.  */
> >       atomic_t write_flooding_count;
> > +
> > +     bool tdp_mmu_page;
> >  };
> >
> > +extern struct kmem_cache *mmu_page_header_cache;
> > +
> >  static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
> >  {
> >       struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
> > @@ -69,6 +73,11 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
> >       (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
> >  #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
> >
> > +#define ACC_EXEC_MASK    1
> > +#define ACC_WRITE_MASK   PT_WRITABLE_MASK
> > +#define ACC_USER_MASK    PT_USER_MASK
> > +#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
> > +
> >  /* Functions for interpreting SPTEs */
> >  kvm_pfn_t spte_to_pfn(u64 pte);
> >  bool is_mmio_spte(u64 spte);
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index 8241e18c111e6..cdca829e42040 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -1,5 +1,7 @@
> >  /* SPDX-License-Identifier: GPL-2.0 */
> >
> > +#include "mmu.h"
> > +#include "mmu_internal.h"
> >  #include "tdp_mmu.h"
> >
> >  static bool __read_mostly tdp_mmu_enabled = true;
> > @@ -25,10 +27,165 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
> >
> >       /* This should not be changed for the lifetime of the VM. */
> >       kvm->arch.tdp_mmu_enabled = true;
> > +
> > +     INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
> >  }
> >
> >  void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
> >  {
> >       if (!kvm->arch.tdp_mmu_enabled)
> >               return;
> > +
> > +     WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
> > +}
> > +
> > +#define for_each_tdp_mmu_root(_kvm, _root)                       \
> > +     list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
> > +
> > +bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
> > +{
> > +     struct kvm_mmu_page *root;
> > +
> > +     if (!kvm->arch.tdp_mmu_enabled)
> > +             return false;
> > +
> > +     root = to_shadow_page(hpa);
> > +
> > +     if (WARN_ON(!root))
> > +             return false;
> > +
> > +     return root->tdp_mmu_page;
>
> Why all the extra checks?
>
> > +}
> > +
> > +static void free_tdp_mmu_root(struct kvm *kvm, struct kvm_mmu_page *root)
> > +{
> > +     lockdep_assert_held(&kvm->mmu_lock);
> > +
> > +     WARN_ON(root->root_count);
> > +     WARN_ON(!root->tdp_mmu_page);
> > +
> > +     list_del(&root->link);
> > +
> > +     free_page((unsigned long)root->spt);
> > +     kmem_cache_free(mmu_page_header_cache, root);
> > +}
> > +
> > +static void put_tdp_mmu_root(struct kvm *kvm, struct kvm_mmu_page *root)
> > +{
> > +     lockdep_assert_held(&kvm->mmu_lock);
> > +
> > +     root->root_count--;
> > +     if (!root->root_count)
> > +             free_tdp_mmu_root(kvm, root);
> > +}
> > +
> > +static void get_tdp_mmu_root(struct kvm *kvm, struct kvm_mmu_page *root)
> > +{
> > +     lockdep_assert_held(&kvm->mmu_lock);
> > +     WARN_ON(!root->root_count);
> > +
> > +     root->root_count++;
> > +}
> > +
> > +void kvm_tdp_mmu_put_root_hpa(struct kvm *kvm, hpa_t root_hpa)
> > +{
> > +     struct kvm_mmu_page *root;
> > +
> > +     root = to_shadow_page(root_hpa);
> > +
> > +     if (WARN_ON(!root))
> > +             return;
> > +
> > +     put_tdp_mmu_root(kvm, root);
> > +}
> > +
> > +static struct kvm_mmu_page *find_tdp_mmu_root_with_role(
> > +             struct kvm *kvm, union kvm_mmu_page_role role)
> > +{
> > +     struct kvm_mmu_page *root;
> > +
> > +     lockdep_assert_held(&kvm->mmu_lock);
> > +     for_each_tdp_mmu_root(kvm, root) {
> > +             WARN_ON(!root->root_count);
> > +
> > +             if (root->role.word == role.word)
> > +                     return root;
> > +     }
> > +
> > +     return NULL;
> > +}
> > +
> > +static struct kvm_mmu_page *alloc_tdp_mmu_root(struct kvm_vcpu *vcpu,
> > +                                            union kvm_mmu_page_role role)
> > +{
> > +     struct kvm_mmu_page *new_root;
> > +     struct kvm_mmu_page *root;
> > +
> > +     new_root = kvm_mmu_memory_cache_alloc(
> > +                     &vcpu->arch.mmu_page_header_cache);
> > +     new_root->spt = kvm_mmu_memory_cache_alloc(
> > +                     &vcpu->arch.mmu_shadow_page_cache);
> > +     set_page_private(virt_to_page(new_root->spt), (unsigned long)new_root);
> > +
> > +     new_root->role.word = role.word;
> > +     new_root->root_count = 1;
> > +     new_root->gfn = 0;
> > +     new_root->tdp_mmu_page = true;
> > +
> > +     spin_lock(&vcpu->kvm->mmu_lock);
> > +
> > +     /* Check that no matching root exists before adding this one. */
> > +     root = find_tdp_mmu_root_with_role(vcpu->kvm, role);
> > +     if (root) {
> > +             get_tdp_mmu_root(vcpu->kvm, root);
> > +             spin_unlock(&vcpu->kvm->mmu_lock);
>
> Hrm, I'm not a big fan of dropping locks in the middle of functions, but the
> alternatives aren't great.  :-/  Best I can come up with is
>
>         if (root)
>                 get_tdp_mmu_root()
>         else
>                 list_add();
>
>         spin_unlock();
>
>         if (root) {
>                 free_page()
>                 kmem_cache_free()
>         } else {
>                 root = new_root;
>         }
>
>         return root;
>
> Not sure that's any better.
>
> > +             free_page((unsigned long)new_root->spt);
> > +             kmem_cache_free(mmu_page_header_cache, new_root);
> > +             return root;
> > +     }
> > +
> > +     list_add(&new_root->link, &vcpu->kvm->arch.tdp_mmu_roots);
> > +     spin_unlock(&vcpu->kvm->mmu_lock);
> > +
> > +     return new_root;
> > +}
> > +
> > +static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
> > +{
> > +     struct kvm_mmu_page *root;
> > +     union kvm_mmu_page_role role;
> > +
> > +     role = vcpu->arch.mmu->mmu_role.base;
> > +     role.level = vcpu->arch.mmu->shadow_root_level;
> > +     role.direct = true;
> > +     role.gpte_is_8_bytes = true;
> > +     role.access = ACC_ALL;
> > +
> > +     spin_lock(&vcpu->kvm->mmu_lock);
> > +
> > +     /* Search for an already allocated root with the same role. */
> > +     root = find_tdp_mmu_root_with_role(vcpu->kvm, role);
> > +     if (root) {
> > +             get_tdp_mmu_root(vcpu->kvm, root);
> > +             spin_unlock(&vcpu->kvm->mmu_lock);
>
> Rather than manually unlock and return, this can be
>
>         if (root)
>                 get_tdp_mmju_root();
>
>         spin_unlock()
>
>         if (!root)
>                 root = alloc_tdp_mmu_root();
>
>         return root;
>
> You could also add a helper to do the "get" along with the "find".  Not sure
> if that's worth the code.
>
> > +             return root;
> > +     }
> > +
> > +     spin_unlock(&vcpu->kvm->mmu_lock);
> > +
> > +     /* If there is no appropriate root, allocate one. */
> > +     root = alloc_tdp_mmu_root(vcpu, role);
> > +
> > +     return root;
> > +}
> > +
> > +hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
> > +{
> > +     struct kvm_mmu_page *root;
> > +
> > +     root = get_tdp_mmu_vcpu_root(vcpu);
> > +     if (!root)
> > +             return INVALID_PAGE;
> > +
> > +     return __pa(root->spt);
> >  }
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
> > index dd3764f5a9aa3..9274debffeaa1 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.h
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.h
> > @@ -7,4 +7,9 @@
> >
> >  void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
> >  void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
> > +
> > +bool is_tdp_mmu_root(struct kvm *kvm, hpa_t root);
> > +hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
> > +void kvm_tdp_mmu_put_root_hpa(struct kvm *kvm, hpa_t root_hpa);
> > +
> >  #endif /* __KVM_X86_MMU_TDP_MMU_H */
> > --
> > 2.28.0.709.gb0816b6eb0-goog
> >
Sean Christopherson Oct. 12, 2020, 11:59 p.m. UTC | #5
Heads up, you may get this multiple times, our mail servers got "upgraded"
recently and are giving me troubles...

On Mon, Oct 12, 2020 at 03:59:35PM -0700, Ben Gardon wrote:
> On Tue, Sep 29, 2020 at 11:06 PM Sean Christopherson
> <sean.j.christopherson@intel.com> wrote:
> > > @@ -3691,7 +3690,13 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
> > >       unsigned i;
> > >
> > >       if (shadow_root_level >= PT64_ROOT_4LEVEL) {
> > > -             root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
> > > +             if (vcpu->kvm->arch.tdp_mmu_enabled) {
> >
> > I believe this will break 32-bit NPT.  Or at a minimum, look weird.  It'd
> > be better to explicitly disable the TDP MMU on 32-bit KVM, then this becomes
> >
> >         if (vcpu->kvm->arch.tdp_mmu_enabled) {
> >
> >         } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
> >
> >         } else {
> >
> >         }
> >
>
> How does this break 32-bit NPT? I'm not sure I understand how we would
> get into a bad state here because I'm not familiar with the specifics
> of 32-bit NPT.

32-bit NPT will have a max TDP level of PT32E_ROOT_LEVEL (3), i.e. will
fail the "shadow_root_level >= PT64_ROOT_4LEVEL" check, and thus won't get
to the tdp_mmu_enabled check.  That would likely break as some parts of KVM
would see tdp_mmu_enabled, but this root allocation would continue using
the legacy MMU.

It's somewhat of a moot point, because IIRC there are other things that will
break with 32-bit KVM, i.e. TDP MMU will be 64-bit only.  But burying that
assumption/dependency in these flows is weird.
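
A minimal way to make that dependency explicit (a sketch, not part of this
series, and assuming the enablement check in kvm_mmu_init_tdp_mmu() can
simply gain one more condition) would be to never mark the TDP MMU enabled
on 32-bit builds:

	void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
	{
		/* The TDP MMU is only supported on 64-bit hosts. */
		if (!IS_ENABLED(CONFIG_X86_64) || !tdp_mmu_enabled)
			return;

		/* This should not be changed for the lifetime of the VM. */
		kvm->arch.tdp_mmu_enabled = true;

		INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	}

With that, kvm->arch.tdp_mmu_enabled can only be set when the shadow root
level is at least PT64_ROOT_4LEVEL, so the tdp_mmu_enabled check can safely
come first in mmu_alloc_direct_roots().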

> > > +                     root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
> > > +             } else {
> > > +                     root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level,
> > > +                                           true);
> > > +             }

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 35107819f48ae..9ce6b35ecb33a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -972,6 +972,7 @@  struct kvm_arch {
 	 * operations.
 	 */
 	bool tdp_mmu_enabled;
+	struct list_head tdp_mmu_roots;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 0cb0c26939dfc..0f871e36394da 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -170,11 +170,6 @@  module_param(dbg, bool, 0644);
 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
 			| shadow_x_mask | shadow_nx_mask | shadow_me_mask)
 
-#define ACC_EXEC_MASK    1
-#define ACC_WRITE_MASK   PT_WRITABLE_MASK
-#define ACC_USER_MASK    PT_USER_MASK
-#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
-
 /* The mask for the R/X bits in EPT PTEs */
 #define PT64_EPT_READABLE_MASK			0x1ull
 #define PT64_EPT_EXECUTABLE_MASK		0x4ull
@@ -232,7 +227,7 @@  struct kvm_shadow_walk_iterator {
 	     __shadow_walk_next(&(_walker), spte))
 
 static struct kmem_cache *pte_list_desc_cache;
-static struct kmem_cache *mmu_page_header_cache;
+struct kmem_cache *mmu_page_header_cache;
 static struct percpu_counter kvm_total_used_mmu_pages;
 
 static u64 __read_mostly shadow_nx_mask;
@@ -3597,10 +3592,14 @@  static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
 	if (!VALID_PAGE(*root_hpa))
 		return;
 
-	sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
-	--sp->root_count;
-	if (!sp->root_count && sp->role.invalid)
-		kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+	if (is_tdp_mmu_root(kvm, *root_hpa)) {
+		kvm_tdp_mmu_put_root_hpa(kvm, *root_hpa);
+	} else {
+		sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
+		--sp->root_count;
+		if (!sp->root_count && sp->role.invalid)
+			kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+	}
 
 	*root_hpa = INVALID_PAGE;
 }
@@ -3691,7 +3690,13 @@  static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 	unsigned i;
 
 	if (shadow_root_level >= PT64_ROOT_4LEVEL) {
-		root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
+		if (vcpu->kvm->arch.tdp_mmu_enabled) {
+			root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
+		} else {
+			root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level,
+					      true);
+		}
+
 		if (!VALID_PAGE(root))
 			return -ENOSPC;
 		vcpu->arch.mmu->root_hpa = root;
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 65bb110847858..530b7d893c7b3 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -41,8 +41,12 @@  struct kvm_mmu_page {
 
 	/* Number of writes since the last time traversal visited this page.  */
 	atomic_t write_flooding_count;
+
+	bool tdp_mmu_page;
 };
 
+extern struct kmem_cache *mmu_page_header_cache;
+
 static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
 {
 	struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
@@ -69,6 +73,11 @@  bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
 	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
+#define ACC_EXEC_MASK    1
+#define ACC_WRITE_MASK   PT_WRITABLE_MASK
+#define ACC_USER_MASK    PT_USER_MASK
+#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+
 /* Functions for interpreting SPTEs */
 kvm_pfn_t spte_to_pfn(u64 pte);
 bool is_mmio_spte(u64 spte);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 8241e18c111e6..cdca829e42040 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1,5 +1,7 @@ 
 /* SPDX-License-Identifier: GPL-2.0 */
 
+#include "mmu.h"
+#include "mmu_internal.h"
 #include "tdp_mmu.h"
 
 static bool __read_mostly tdp_mmu_enabled = true;
@@ -25,10 +27,165 @@  void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
 
 	/* This should not be changed for the lifetime of the VM. */
 	kvm->arch.tdp_mmu_enabled = true;
+
+	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
 }
 
 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
 {
 	if (!kvm->arch.tdp_mmu_enabled)
 		return;
+
+	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
+}
+
+#define for_each_tdp_mmu_root(_kvm, _root)			    \
+	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
+
+bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
+{
+	struct kvm_mmu_page *root;
+
+	if (!kvm->arch.tdp_mmu_enabled)
+		return false;
+
+	root = to_shadow_page(hpa);
+
+	if (WARN_ON(!root))
+		return false;
+
+	return root->tdp_mmu_page;
+}
+
+static void free_tdp_mmu_root(struct kvm *kvm, struct kvm_mmu_page *root)
+{
+	lockdep_assert_held(&kvm->mmu_lock);
+
+	WARN_ON(root->root_count);
+	WARN_ON(!root->tdp_mmu_page);
+
+	list_del(&root->link);
+
+	free_page((unsigned long)root->spt);
+	kmem_cache_free(mmu_page_header_cache, root);
+}
+
+static void put_tdp_mmu_root(struct kvm *kvm, struct kvm_mmu_page *root)
+{
+	lockdep_assert_held(&kvm->mmu_lock);
+
+	root->root_count--;
+	if (!root->root_count)
+		free_tdp_mmu_root(kvm, root);
+}
+
+static void get_tdp_mmu_root(struct kvm *kvm, struct kvm_mmu_page *root)
+{
+	lockdep_assert_held(&kvm->mmu_lock);
+	WARN_ON(!root->root_count);
+
+	root->root_count++;
+}
+
+void kvm_tdp_mmu_put_root_hpa(struct kvm *kvm, hpa_t root_hpa)
+{
+	struct kvm_mmu_page *root;
+
+	root = to_shadow_page(root_hpa);
+
+	if (WARN_ON(!root))
+		return;
+
+	put_tdp_mmu_root(kvm, root);
+}
+
+static struct kvm_mmu_page *find_tdp_mmu_root_with_role(
+		struct kvm *kvm, union kvm_mmu_page_role role)
+{
+	struct kvm_mmu_page *root;
+
+	lockdep_assert_held(&kvm->mmu_lock);
+	for_each_tdp_mmu_root(kvm, root) {
+		WARN_ON(!root->root_count);
+
+		if (root->role.word == role.word)
+			return root;
+	}
+
+	return NULL;
+}
+
+static struct kvm_mmu_page *alloc_tdp_mmu_root(struct kvm_vcpu *vcpu,
+					       union kvm_mmu_page_role role)
+{
+	struct kvm_mmu_page *new_root;
+	struct kvm_mmu_page *root;
+
+	new_root = kvm_mmu_memory_cache_alloc(
+			&vcpu->arch.mmu_page_header_cache);
+	new_root->spt = kvm_mmu_memory_cache_alloc(
+			&vcpu->arch.mmu_shadow_page_cache);
+	set_page_private(virt_to_page(new_root->spt), (unsigned long)new_root);
+
+	new_root->role.word = role.word;
+	new_root->root_count = 1;
+	new_root->gfn = 0;
+	new_root->tdp_mmu_page = true;
+
+	spin_lock(&vcpu->kvm->mmu_lock);
+
+	/* Check that no matching root exists before adding this one. */
+	root = find_tdp_mmu_root_with_role(vcpu->kvm, role);
+	if (root) {
+		get_tdp_mmu_root(vcpu->kvm, root);
+		spin_unlock(&vcpu->kvm->mmu_lock);
+		free_page((unsigned long)new_root->spt);
+		kmem_cache_free(mmu_page_header_cache, new_root);
+		return root;
+	}
+
+	list_add(&new_root->link, &vcpu->kvm->arch.tdp_mmu_roots);
+	spin_unlock(&vcpu->kvm->mmu_lock);
+
+	return new_root;
+}
+
+static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu_page *root;
+	union kvm_mmu_page_role role;
+
+	role = vcpu->arch.mmu->mmu_role.base;
+	role.level = vcpu->arch.mmu->shadow_root_level;
+	role.direct = true;
+	role.gpte_is_8_bytes = true;
+	role.access = ACC_ALL;
+
+	spin_lock(&vcpu->kvm->mmu_lock);
+
+	/* Search for an already allocated root with the same role. */
+	root = find_tdp_mmu_root_with_role(vcpu->kvm, role);
+	if (root) {
+		get_tdp_mmu_root(vcpu->kvm, root);
+		spin_unlock(&vcpu->kvm->mmu_lock);
+		return root;
+	}
+
+	spin_unlock(&vcpu->kvm->mmu_lock);
+
+	/* If there is no appropriate root, allocate one. */
+	root = alloc_tdp_mmu_root(vcpu, role);
+
+	return root;
+}
+
+hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu_page *root;
+
+	root = get_tdp_mmu_vcpu_root(vcpu);
+	if (!root)
+		return INVALID_PAGE;
+
+	return __pa(root->spt);
 }
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index dd3764f5a9aa3..9274debffeaa1 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -7,4 +7,9 @@ 
 
 void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
+
+bool is_tdp_mmu_root(struct kvm *kvm, hpa_t root);
+hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
+void kvm_tdp_mmu_put_root_hpa(struct kvm *kvm, hpa_t root_hpa);
+
 #endif /* __KVM_X86_MMU_TDP_MMU_H */