
[v2,04/20] kvm: x86/mmu: Allocate and free TDP MMU roots

Message ID 20201014182700.2888246-5-bgardon@google.com (mailing list archive)
State New, archived
Series Introduce the TDP MMU

Commit Message

Ben Gardon Oct. 14, 2020, 6:26 p.m. UTC
The TDP MMU must be able to allocate paging structure root pages and track
the usage of those pages. Implement a similar, but separate system for root
page allocation to that of the x86 shadow paging implementation. When
future patches add synchronization model changes to allow for parallel
page faults, these pages will need to be handled differently from the
x86 shadow paging based MMU's root pages.

Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell
machine. This series introduced no new failures.

This series can be viewed in Gerrit at:
	https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538

Signed-off-by: Ben Gardon <bgardon@google.com>
---
 arch/x86/include/asm/kvm_host.h |   1 +
 arch/x86/kvm/mmu/mmu.c          |  29 +++++---
 arch/x86/kvm/mmu/mmu_internal.h |  24 +++++++
 arch/x86/kvm/mmu/tdp_mmu.c      | 114 ++++++++++++++++++++++++++++++++
 arch/x86/kvm/mmu/tdp_mmu.h      |   5 ++
 5 files changed, 162 insertions(+), 11 deletions(-)
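
In short, the root lifecycle this patch introduces looks like the following (an illustrative outline of the code in the diff below, not a literal excerpt):

	/* mmu_alloc_direct_roots(), TDP MMU case: */
	root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
	/*
	 * get_tdp_mmu_vcpu_root() takes kvm->mmu_lock and either reuses an
	 * existing root on kvm->arch.tdp_mmu_roots whose role matches
	 * (incrementing root_count) or allocates a fresh root page with
	 * root_count = 1 and adds it to the list.
	 */

	/* mmu_free_root_page(): */
	if (kvm_mmu_put_root(sp)) {		/* --root_count reached zero */
		if (sp->tdp_mmu_page)
			kvm_tdp_mmu_free_root(kvm, sp);	/* unlink, free spt and header */
		else if (sp->role.invalid)
			kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
	}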

Comments

Paolo Bonzini Oct. 16, 2020, 2:56 p.m. UTC | #1
On 14/10/20 20:26, Ben Gardon wrote:
> +
> +static void put_tdp_mmu_root(struct kvm *kvm, struct kvm_mmu_page *root)
> +{
> +	if (kvm_mmu_put_root(root))
> +		kvm_tdp_mmu_free_root(kvm, root);
> +}

Unused...

> +static void get_tdp_mmu_root(struct kvm *kvm, struct kvm_mmu_page *root)
> +{
> +	lockdep_assert_held(&kvm->mmu_lock);
> +
> +	kvm_mmu_get_root(root);
> +}
> +

... and duplicate with kvm_mmu_get_root itself since we can move the
assertion there.

Paolo
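
For illustration, folding the assertion into the common helper could look like this minimal sketch; it assumes kvm_mmu_get_root() gains a struct kvm parameter so the lock can be asserted there, after which get_tdp_mmu_root() is no longer needed and callers use kvm_mmu_get_root() directly:

	static inline void kvm_mmu_get_root(struct kvm *kvm, struct kvm_mmu_page *sp)
	{
		/* Assumed signature change: kvm is passed in so the lock can be checked. */
		lockdep_assert_held(&kvm->mmu_lock);
		BUG_ON(!sp->root_count);

		++sp->root_count;
	}
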
Ben Gardon Oct. 19, 2020, 5:02 p.m. UTC | #2
On Fri, Oct 16, 2020 at 7:56 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> On 14/10/20 20:26, Ben Gardon wrote:
> > +
> > +static void put_tdp_mmu_root(struct kvm *kvm, struct kvm_mmu_page *root)
> > +{
> > +     if (kvm_mmu_put_root(root))
> > +             kvm_tdp_mmu_free_root(kvm, root);
> > +}
>
> Unused...

Woops, I should have added an unused tag or added this in commit 7.
It's used by many other functions, but nothing in this patch anymore.

>
> > +static void get_tdp_mmu_root(struct kvm *kvm, struct kvm_mmu_page *root)
> > +{
> > +     lockdep_assert_held(&kvm->mmu_lock);
> > +
> > +     kvm_mmu_get_root(root);
> > +}
> > +
>
> ... and duplicate with kvm_mmu_get_root itself since we can move the
> assertion there.
>
> Paolo
>
Yu Zhang Oct. 21, 2020, 3:09 p.m. UTC | #3
On Wed, Oct 14, 2020 at 11:26:44AM -0700, Ben Gardon wrote:
> The TDP MMU must be able to allocate paging structure root pages and track
> the usage of those pages. Implement a similar, but separate system for root
> page allocation to that of the x86 shadow paging implementation. When
> future patches add synchronization model changes to allow for parallel
> page faults, these pages will need to be handled differently from the
> x86 shadow paging based MMU's root pages.
> 
> Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell
> machine. This series introduced no new failures.
> 
> This series can be viewed in Gerrit at:
> 	https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538
> 
> Signed-off-by: Ben Gardon <bgardon@google.com>
> ---
>  arch/x86/include/asm/kvm_host.h |   1 +
>  arch/x86/kvm/mmu/mmu.c          |  29 +++++---
>  arch/x86/kvm/mmu/mmu_internal.h |  24 +++++++
>  arch/x86/kvm/mmu/tdp_mmu.c      | 114 ++++++++++++++++++++++++++++++++
>  arch/x86/kvm/mmu/tdp_mmu.h      |   5 ++
>  5 files changed, 162 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 6b6dbc20ce23a..e0ec1dd271a32 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -989,6 +989,7 @@ struct kvm_arch {
>  	 * operations.
>  	 */
>  	bool tdp_mmu_enabled;
> +	struct list_head tdp_mmu_roots;
>  };
>  
>  struct kvm_vm_stat {
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index f53d29e09367c..a3340ed59ad1d 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -144,11 +144,6 @@ module_param(dbg, bool, 0644);
>  #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
>  			| shadow_x_mask | shadow_nx_mask | shadow_me_mask)
>  
> -#define ACC_EXEC_MASK    1
> -#define ACC_WRITE_MASK   PT_WRITABLE_MASK
> -#define ACC_USER_MASK    PT_USER_MASK
> -#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
> -
>  /* The mask for the R/X bits in EPT PTEs */
>  #define PT64_EPT_READABLE_MASK			0x1ull
>  #define PT64_EPT_EXECUTABLE_MASK		0x4ull
> @@ -209,7 +204,7 @@ struct kvm_shadow_walk_iterator {
>  	     __shadow_walk_next(&(_walker), spte))
>  
>  static struct kmem_cache *pte_list_desc_cache;
> -static struct kmem_cache *mmu_page_header_cache;
> +struct kmem_cache *mmu_page_header_cache;
>  static struct percpu_counter kvm_total_used_mmu_pages;
>  
>  static u64 __read_mostly shadow_nx_mask;
> @@ -3588,9 +3583,13 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
>  		return;
>  
>  	sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
> -	--sp->root_count;
> -	if (!sp->root_count && sp->role.invalid)
> -		kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
> +
> +	if (kvm_mmu_put_root(sp)) {
> +		if (sp->tdp_mmu_page)
> +			kvm_tdp_mmu_free_root(kvm, sp);
> +		else if (sp->role.invalid)
> +			kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
> +	}
>  
>  	*root_hpa = INVALID_PAGE;
>  }
> @@ -3680,8 +3679,16 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
>  	hpa_t root;
>  	unsigned i;
>  
> -	if (shadow_root_level >= PT64_ROOT_4LEVEL) {
> -		root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
> +	if (vcpu->kvm->arch.tdp_mmu_enabled) {
> +		root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
> +
> +		if (!VALID_PAGE(root))
> +			return -ENOSPC;
> +		vcpu->arch.mmu->root_hpa = root;
> +	} else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
> +		root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level,
> +				      true);
> +
>  		if (!VALID_PAGE(root))
>  			return -ENOSPC;
>  		vcpu->arch.mmu->root_hpa = root;
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index 74ccbf001a42e..6cedf578c9a8d 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -43,8 +43,12 @@ struct kvm_mmu_page {
>  
>  	/* Number of writes since the last time traversal visited this page.  */
>  	atomic_t write_flooding_count;
> +
> +	bool tdp_mmu_page;
>  };
>  
> +extern struct kmem_cache *mmu_page_header_cache;
> +
>  static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
>  {
>  	struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
> @@ -96,6 +100,11 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
>  	(PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
>  						* PT64_LEVEL_BITS))) - 1))
>  
> +#define ACC_EXEC_MASK    1
> +#define ACC_WRITE_MASK   PT_WRITABLE_MASK
> +#define ACC_USER_MASK    PT_USER_MASK
> +#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
> +
>  /* Functions for interpreting SPTEs */
>  static inline bool is_mmio_spte(u64 spte)
>  {
> @@ -126,4 +135,19 @@ static inline kvm_pfn_t spte_to_pfn(u64 pte)
>  	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
>  }
>  
> +static inline void kvm_mmu_get_root(struct kvm_mmu_page *sp)
> +{
> +	BUG_ON(!sp->root_count);
> +
> +	++sp->root_count;
> +}
> +
> +static inline bool kvm_mmu_put_root(struct kvm_mmu_page *sp)
> +{
> +	--sp->root_count;
> +
> +	return !sp->root_count;
> +}
> +
> +
>  #endif /* __KVM_X86_MMU_INTERNAL_H */
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index b3809835e90b1..09a84a6e157b6 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1,5 +1,7 @@
>  // SPDX-License-Identifier: GPL-2.0
>  
> +#include "mmu.h"
> +#include "mmu_internal.h"
>  #include "tdp_mmu.h"
>  
>  static bool __read_mostly tdp_mmu_enabled = false;
> @@ -29,10 +31,122 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
>  
>  	/* This should not be changed for the lifetime of the VM. */
>  	kvm->arch.tdp_mmu_enabled = true;
> +
> +	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
>  }
>  
>  void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
>  {
>  	if (!kvm->arch.tdp_mmu_enabled)
>  		return;
> +
> +	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
> +}
> +
> +#define for_each_tdp_mmu_root(_kvm, _root)			    \
> +	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
> +
> +bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
> +{
> +	struct kvm_mmu_page *sp;
> +
> +	sp = to_shadow_page(hpa);
> +
> +	return sp->tdp_mmu_page && sp->root_count;
> +}
> +
> +void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
> +{
> +	lockdep_assert_held(&kvm->mmu_lock);
> +
> +	WARN_ON(root->root_count);
> +	WARN_ON(!root->tdp_mmu_page);
> +
> +	list_del(&root->link);
> +
> +	free_page((unsigned long)root->spt);
> +	kmem_cache_free(mmu_page_header_cache, root);
> +}
> +
> +static void put_tdp_mmu_root(struct kvm *kvm, struct kvm_mmu_page *root)
> +{
> +	if (kvm_mmu_put_root(root))
> +		kvm_tdp_mmu_free_root(kvm, root);
> +}
> +
> +static void get_tdp_mmu_root(struct kvm *kvm, struct kvm_mmu_page *root)
> +{
> +	lockdep_assert_held(&kvm->mmu_lock);
> +
> +	kvm_mmu_get_root(root);
> +}
> +
> +static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
> +						   int level)
> +{
> +	union kvm_mmu_page_role role;
> +
> +	role = vcpu->arch.mmu->mmu_role.base;
> +	role.level = vcpu->arch.mmu->shadow_root_level;

role.level = level;
The role will be calculated for non root pages later.

> +	role.direct = true;
> +	role.gpte_is_8_bytes = true;
> +	role.access = ACC_ALL;
> +
> +	return role;
> +}
> +
> +static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
> +					       int level)
> +{
> +	struct kvm_mmu_page *sp;
> +
> +	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
> +	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
> +	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
> +
> +	sp->role.word = page_role_for_level(vcpu, level).word;
> +	sp->gfn = gfn;
> +	sp->tdp_mmu_page = true;
> +
> +	return sp;
> +}
> +
> +static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
> +{
> +	union kvm_mmu_page_role role;
> +	struct kvm *kvm = vcpu->kvm;
> +	struct kvm_mmu_page *root;
> +
> +	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
> +
> +	spin_lock(&kvm->mmu_lock);
> +
> +	/* Check for an existing root before allocating a new one. */
> +	for_each_tdp_mmu_root(kvm, root) {
> +		if (root->role.word == role.word) {
> +			get_tdp_mmu_root(kvm, root);
> +			spin_unlock(&kvm->mmu_lock);
> +			return root;
> +		}
> +	}
> +
> +	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
> +	root->root_count = 1;
> +
> +	list_add(&root->link, &kvm->arch.tdp_mmu_roots);
> +
> +	spin_unlock(&kvm->mmu_lock);
> +
> +	return root;
> +}
> +
> +hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_mmu_page *root;
> +
> +	root = get_tdp_mmu_vcpu_root(vcpu);
> +	if (!root)
> +		return INVALID_PAGE;
> +
> +	return __pa(root->spt);
>  }
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
> index cd4a562a70e9a..ac0ef91294420 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.h
> +++ b/arch/x86/kvm/mmu/tdp_mmu.h
> @@ -7,4 +7,9 @@
>  
>  void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
>  void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
> +
> +bool is_tdp_mmu_root(struct kvm *kvm, hpa_t root);
> +hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
> +void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root);
> +
>  #endif /* __KVM_X86_MMU_TDP_MMU_H */
> -- 
> 2.28.0.1011.ga647a8990f-goog
> 

Thanks
Yu
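
The fix Yu points out above is a one-line change; with it, the helper would read roughly:

	static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
							   int level)
	{
		union kvm_mmu_page_role role;

		role = vcpu->arch.mmu->mmu_role.base;
		role.level = level;	/* use the level argument, not shadow_root_level */
		role.direct = true;
		role.gpte_is_8_bytes = true;
		role.access = ACC_ALL;

		return role;
	}
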
Ben Gardon Oct. 21, 2020, 5:54 p.m. UTC | #4
On Wed, Oct 21, 2020 at 8:09 AM Yu Zhang <yu.c.zhang@linux.intel.com> wrote:
>
> On Wed, Oct 14, 2020 at 11:26:44AM -0700, Ben Gardon wrote:
> > The TDP MMU must be able to allocate paging structure root pages and track
> > the usage of those pages. Implement a similar, but separate system for root
> > page allocation to that of the x86 shadow paging implementation. When
> > future patches add synchronization model changes to allow for parallel
> > page faults, these pages will need to be handled differently from the
> > x86 shadow paging based MMU's root pages.
> >
> > Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell
> > machine. This series introduced no new failures.
> >
> > This series can be viewed in Gerrit at:
> >       https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538
> >
> > Signed-off-by: Ben Gardon <bgardon@google.com>
> > ---
> >  arch/x86/include/asm/kvm_host.h |   1 +
> >  arch/x86/kvm/mmu/mmu.c          |  29 +++++---
> >  arch/x86/kvm/mmu/mmu_internal.h |  24 +++++++
> >  arch/x86/kvm/mmu/tdp_mmu.c      | 114 ++++++++++++++++++++++++++++++++
> >  arch/x86/kvm/mmu/tdp_mmu.h      |   5 ++
> >  5 files changed, 162 insertions(+), 11 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 6b6dbc20ce23a..e0ec1dd271a32 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -989,6 +989,7 @@ struct kvm_arch {
> >        * operations.
> >        */
> >       bool tdp_mmu_enabled;
> > +     struct list_head tdp_mmu_roots;
> >  };
> >
> >  struct kvm_vm_stat {
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index f53d29e09367c..a3340ed59ad1d 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -144,11 +144,6 @@ module_param(dbg, bool, 0644);
> >  #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
> >                       | shadow_x_mask | shadow_nx_mask | shadow_me_mask)
> >
> > -#define ACC_EXEC_MASK    1
> > -#define ACC_WRITE_MASK   PT_WRITABLE_MASK
> > -#define ACC_USER_MASK    PT_USER_MASK
> > -#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
> > -
> >  /* The mask for the R/X bits in EPT PTEs */
> >  #define PT64_EPT_READABLE_MASK                       0x1ull
> >  #define PT64_EPT_EXECUTABLE_MASK             0x4ull
> > @@ -209,7 +204,7 @@ struct kvm_shadow_walk_iterator {
> >            __shadow_walk_next(&(_walker), spte))
> >
> >  static struct kmem_cache *pte_list_desc_cache;
> > -static struct kmem_cache *mmu_page_header_cache;
> > +struct kmem_cache *mmu_page_header_cache;
> >  static struct percpu_counter kvm_total_used_mmu_pages;
> >
> >  static u64 __read_mostly shadow_nx_mask;
> > @@ -3588,9 +3583,13 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
> >               return;
> >
> >       sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
> > -     --sp->root_count;
> > -     if (!sp->root_count && sp->role.invalid)
> > -             kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
> > +
> > +     if (kvm_mmu_put_root(sp)) {
> > +             if (sp->tdp_mmu_page)
> > +                     kvm_tdp_mmu_free_root(kvm, sp);
> > +             else if (sp->role.invalid)
> > +                     kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
> > +     }
> >
> >       *root_hpa = INVALID_PAGE;
> >  }
> > @@ -3680,8 +3679,16 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
> >       hpa_t root;
> >       unsigned i;
> >
> > -     if (shadow_root_level >= PT64_ROOT_4LEVEL) {
> > -             root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
> > +     if (vcpu->kvm->arch.tdp_mmu_enabled) {
> > +             root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
> > +
> > +             if (!VALID_PAGE(root))
> > +                     return -ENOSPC;
> > +             vcpu->arch.mmu->root_hpa = root;
> > +     } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
> > +             root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level,
> > +                                   true);
> > +
> >               if (!VALID_PAGE(root))
> >                       return -ENOSPC;
> >               vcpu->arch.mmu->root_hpa = root;
> > diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> > index 74ccbf001a42e..6cedf578c9a8d 100644
> > --- a/arch/x86/kvm/mmu/mmu_internal.h
> > +++ b/arch/x86/kvm/mmu/mmu_internal.h
> > @@ -43,8 +43,12 @@ struct kvm_mmu_page {
> >
> >       /* Number of writes since the last time traversal visited this page.  */
> >       atomic_t write_flooding_count;
> > +
> > +     bool tdp_mmu_page;
> >  };
> >
> > +extern struct kmem_cache *mmu_page_header_cache;
> > +
> >  static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
> >  {
> >       struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
> > @@ -96,6 +100,11 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
> >       (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
> >                                               * PT64_LEVEL_BITS))) - 1))
> >
> > +#define ACC_EXEC_MASK    1
> > +#define ACC_WRITE_MASK   PT_WRITABLE_MASK
> > +#define ACC_USER_MASK    PT_USER_MASK
> > +#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
> > +
> >  /* Functions for interpreting SPTEs */
> >  static inline bool is_mmio_spte(u64 spte)
> >  {
> > @@ -126,4 +135,19 @@ static inline kvm_pfn_t spte_to_pfn(u64 pte)
> >       return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
> >  }
> >
> > +static inline void kvm_mmu_get_root(struct kvm_mmu_page *sp)
> > +{
> > +     BUG_ON(!sp->root_count);
> > +
> > +     ++sp->root_count;
> > +}
> > +
> > +static inline bool kvm_mmu_put_root(struct kvm_mmu_page *sp)
> > +{
> > +     --sp->root_count;
> > +
> > +     return !sp->root_count;
> > +}
> > +
> > +
> >  #endif /* __KVM_X86_MMU_INTERNAL_H */
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index b3809835e90b1..09a84a6e157b6 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -1,5 +1,7 @@
> >  // SPDX-License-Identifier: GPL-2.0
> >
> > +#include "mmu.h"
> > +#include "mmu_internal.h"
> >  #include "tdp_mmu.h"
> >
> >  static bool __read_mostly tdp_mmu_enabled = false;
> > @@ -29,10 +31,122 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
> >
> >       /* This should not be changed for the lifetime of the VM. */
> >       kvm->arch.tdp_mmu_enabled = true;
> > +
> > +     INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
> >  }
> >
> >  void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
> >  {
> >       if (!kvm->arch.tdp_mmu_enabled)
> >               return;
> > +
> > +     WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
> > +}
> > +
> > +#define for_each_tdp_mmu_root(_kvm, _root)                       \
> > +     list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
> > +
> > +bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
> > +{
> > +     struct kvm_mmu_page *sp;
> > +
> > +     sp = to_shadow_page(hpa);
> > +
> > +     return sp->tdp_mmu_page && sp->root_count;
> > +}
> > +
> > +void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
> > +{
> > +     lockdep_assert_held(&kvm->mmu_lock);
> > +
> > +     WARN_ON(root->root_count);
> > +     WARN_ON(!root->tdp_mmu_page);
> > +
> > +     list_del(&root->link);
> > +
> > +     free_page((unsigned long)root->spt);
> > +     kmem_cache_free(mmu_page_header_cache, root);
> > +}
> > +
> > +static void put_tdp_mmu_root(struct kvm *kvm, struct kvm_mmu_page *root)
> > +{
> > +     if (kvm_mmu_put_root(root))
> > +             kvm_tdp_mmu_free_root(kvm, root);
> > +}
> > +
> > +static void get_tdp_mmu_root(struct kvm *kvm, struct kvm_mmu_page *root)
> > +{
> > +     lockdep_assert_held(&kvm->mmu_lock);
> > +
> > +     kvm_mmu_get_root(root);
> > +}
> > +
> > +static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
> > +                                                int level)
> > +{
> > +     union kvm_mmu_page_role role;
> > +
> > +     role = vcpu->arch.mmu->mmu_role.base;
> > +     role.level = vcpu->arch.mmu->shadow_root_level;
>
> role.level = level;
> The role will be calculated for non root pages later.

Thank you for catching that Yu, that was definitely an error!
I'm guessing this never showed up in my testing because I don't think
the TDP MMU actually uses role.level for anything other than root
pages.

>
> > +     role.direct = true;
> > +     role.gpte_is_8_bytes = true;
> > +     role.access = ACC_ALL;
> > +
> > +     return role;
> > +}
> > +
> > +static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
> > +                                            int level)
> > +{
> > +     struct kvm_mmu_page *sp;
> > +
> > +     sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
> > +     sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
> > +     set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
> > +
> > +     sp->role.word = page_role_for_level(vcpu, level).word;
> > +     sp->gfn = gfn;
> > +     sp->tdp_mmu_page = true;
> > +
> > +     return sp;
> > +}
> > +
> > +static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
> > +{
> > +     union kvm_mmu_page_role role;
> > +     struct kvm *kvm = vcpu->kvm;
> > +     struct kvm_mmu_page *root;
> > +
> > +     role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
> > +
> > +     spin_lock(&kvm->mmu_lock);
> > +
> > +     /* Check for an existing root before allocating a new one. */
> > +     for_each_tdp_mmu_root(kvm, root) {
> > +             if (root->role.word == role.word) {
> > +                     get_tdp_mmu_root(kvm, root);
> > +                     spin_unlock(&kvm->mmu_lock);
> > +                     return root;
> > +             }
> > +     }
> > +
> > +     root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
> > +     root->root_count = 1;
> > +
> > +     list_add(&root->link, &kvm->arch.tdp_mmu_roots);
> > +
> > +     spin_unlock(&kvm->mmu_lock);
> > +
> > +     return root;
> > +}
> > +
> > +hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
> > +{
> > +     struct kvm_mmu_page *root;
> > +
> > +     root = get_tdp_mmu_vcpu_root(vcpu);
> > +     if (!root)
> > +             return INVALID_PAGE;
> > +
> > +     return __pa(root->spt);
> >  }
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
> > index cd4a562a70e9a..ac0ef91294420 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.h
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.h
> > @@ -7,4 +7,9 @@
> >
> >  void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
> >  void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
> > +
> > +bool is_tdp_mmu_root(struct kvm *kvm, hpa_t root);
> > +hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
> > +void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root);
> > +
> >  #endif /* __KVM_X86_MMU_TDP_MMU_H */
> > --
> > 2.28.0.1011.ga647a8990f-goog
> >
>
> Thanks
> Yu
Paolo Bonzini Oct. 21, 2020, 6:01 p.m. UTC | #5
On 21/10/20 19:54, Ben Gardon wrote:
> On Wed, Oct 21, 2020 at 8:09 AM Yu Zhang <yu.c.zhang@linux.intel.com> wrote:
>>
>> On Wed, Oct 14, 2020 at 11:26:44AM -0700, Ben Gardon wrote:
>>> The TDP MMU must be able to allocate paging structure root pages and track
>>> the usage of those pages. Implement a similar, but separate system for root
>>> page allocation to that of the x86 shadow paging implementation. When
>>> future patches add synchronization model changes to allow for parallel
>>> page faults, these pages will need to be handled differently from the
>>> x86 shadow paging based MMU's root pages.
>>>
>>> Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell
>>> machine. This series introduced no new failures.
>>>
>>> This series can be viewed in Gerrit at:
>>>       https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538
>>>
>>> Signed-off-by: Ben Gardon <bgardon@google.com>
>>> ---
>>>  arch/x86/include/asm/kvm_host.h |   1 +
>>>  arch/x86/kvm/mmu/mmu.c          |  29 +++++---
>>>  arch/x86/kvm/mmu/mmu_internal.h |  24 +++++++
>>>  arch/x86/kvm/mmu/tdp_mmu.c      | 114 ++++++++++++++++++++++++++++++++
>>>  arch/x86/kvm/mmu/tdp_mmu.h      |   5 ++
>>>  5 files changed, 162 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>>> index 6b6dbc20ce23a..e0ec1dd271a32 100644
>>> --- a/arch/x86/include/asm/kvm_host.h
>>> +++ b/arch/x86/include/asm/kvm_host.h
>>> @@ -989,6 +989,7 @@ struct kvm_arch {
>>>        * operations.
>>>        */
>>>       bool tdp_mmu_enabled;
>>> +     struct list_head tdp_mmu_roots;
>>>  };
>>>
>>>  struct kvm_vm_stat {
>>> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
>>> index f53d29e09367c..a3340ed59ad1d 100644
>>> --- a/arch/x86/kvm/mmu/mmu.c
>>> +++ b/arch/x86/kvm/mmu/mmu.c
>>> @@ -144,11 +144,6 @@ module_param(dbg, bool, 0644);
>>>  #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
>>>                       | shadow_x_mask | shadow_nx_mask | shadow_me_mask)
>>>
>>> -#define ACC_EXEC_MASK    1
>>> -#define ACC_WRITE_MASK   PT_WRITABLE_MASK
>>> -#define ACC_USER_MASK    PT_USER_MASK
>>> -#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
>>> -
>>>  /* The mask for the R/X bits in EPT PTEs */
>>>  #define PT64_EPT_READABLE_MASK                       0x1ull
>>>  #define PT64_EPT_EXECUTABLE_MASK             0x4ull
>>> @@ -209,7 +204,7 @@ struct kvm_shadow_walk_iterator {
>>>            __shadow_walk_next(&(_walker), spte))
>>>
>>>  static struct kmem_cache *pte_list_desc_cache;
>>> -static struct kmem_cache *mmu_page_header_cache;
>>> +struct kmem_cache *mmu_page_header_cache;
>>>  static struct percpu_counter kvm_total_used_mmu_pages;
>>>
>>>  static u64 __read_mostly shadow_nx_mask;
>>> @@ -3588,9 +3583,13 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
>>>               return;
>>>
>>>       sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
>>> -     --sp->root_count;
>>> -     if (!sp->root_count && sp->role.invalid)
>>> -             kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
>>> +
>>> +     if (kvm_mmu_put_root(sp)) {
>>> +             if (sp->tdp_mmu_page)
>>> +                     kvm_tdp_mmu_free_root(kvm, sp);
>>> +             else if (sp->role.invalid)
>>> +                     kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
>>> +     }
>>>
>>>       *root_hpa = INVALID_PAGE;
>>>  }
>>> @@ -3680,8 +3679,16 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
>>>       hpa_t root;
>>>       unsigned i;
>>>
>>> -     if (shadow_root_level >= PT64_ROOT_4LEVEL) {
>>> -             root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
>>> +     if (vcpu->kvm->arch.tdp_mmu_enabled) {
>>> +             root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
>>> +
>>> +             if (!VALID_PAGE(root))
>>> +                     return -ENOSPC;
>>> +             vcpu->arch.mmu->root_hpa = root;
>>> +     } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
>>> +             root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level,
>>> +                                   true);
>>> +
>>>               if (!VALID_PAGE(root))
>>>                       return -ENOSPC;
>>>               vcpu->arch.mmu->root_hpa = root;
>>> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
>>> index 74ccbf001a42e..6cedf578c9a8d 100644
>>> --- a/arch/x86/kvm/mmu/mmu_internal.h
>>> +++ b/arch/x86/kvm/mmu/mmu_internal.h
>>> @@ -43,8 +43,12 @@ struct kvm_mmu_page {
>>>
>>>       /* Number of writes since the last time traversal visited this page.  */
>>>       atomic_t write_flooding_count;
>>> +
>>> +     bool tdp_mmu_page;
>>>  };
>>>
>>> +extern struct kmem_cache *mmu_page_header_cache;
>>> +
>>>  static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
>>>  {
>>>       struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
>>> @@ -96,6 +100,11 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
>>>       (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
>>>                                               * PT64_LEVEL_BITS))) - 1))
>>>
>>> +#define ACC_EXEC_MASK    1
>>> +#define ACC_WRITE_MASK   PT_WRITABLE_MASK
>>> +#define ACC_USER_MASK    PT_USER_MASK
>>> +#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
>>> +
>>>  /* Functions for interpreting SPTEs */
>>>  static inline bool is_mmio_spte(u64 spte)
>>>  {
>>> @@ -126,4 +135,19 @@ static inline kvm_pfn_t spte_to_pfn(u64 pte)
>>>       return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
>>>  }
>>>
>>> +static inline void kvm_mmu_get_root(struct kvm_mmu_page *sp)
>>> +{
>>> +     BUG_ON(!sp->root_count);
>>> +
>>> +     ++sp->root_count;
>>> +}
>>> +
>>> +static inline bool kvm_mmu_put_root(struct kvm_mmu_page *sp)
>>> +{
>>> +     --sp->root_count;
>>> +
>>> +     return !sp->root_count;
>>> +}
>>> +
>>> +
>>>  #endif /* __KVM_X86_MMU_INTERNAL_H */
>>> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
>>> index b3809835e90b1..09a84a6e157b6 100644
>>> --- a/arch/x86/kvm/mmu/tdp_mmu.c
>>> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
>>> @@ -1,5 +1,7 @@
>>>  // SPDX-License-Identifier: GPL-2.0
>>>
>>> +#include "mmu.h"
>>> +#include "mmu_internal.h"
>>>  #include "tdp_mmu.h"
>>>
>>>  static bool __read_mostly tdp_mmu_enabled = false;
>>> @@ -29,10 +31,122 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
>>>
>>>       /* This should not be changed for the lifetime of the VM. */
>>>       kvm->arch.tdp_mmu_enabled = true;
>>> +
>>> +     INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
>>>  }
>>>
>>>  void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
>>>  {
>>>       if (!kvm->arch.tdp_mmu_enabled)
>>>               return;
>>> +
>>> +     WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
>>> +}
>>> +
>>> +#define for_each_tdp_mmu_root(_kvm, _root)                       \
>>> +     list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
>>> +
>>> +bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
>>> +{
>>> +     struct kvm_mmu_page *sp;
>>> +
>>> +     sp = to_shadow_page(hpa);
>>> +
>>> +     return sp->tdp_mmu_page && sp->root_count;
>>> +}
>>> +
>>> +void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
>>> +{
>>> +     lockdep_assert_held(&kvm->mmu_lock);
>>> +
>>> +     WARN_ON(root->root_count);
>>> +     WARN_ON(!root->tdp_mmu_page);
>>> +
>>> +     list_del(&root->link);
>>> +
>>> +     free_page((unsigned long)root->spt);
>>> +     kmem_cache_free(mmu_page_header_cache, root);
>>> +}
>>> +
>>> +static void put_tdp_mmu_root(struct kvm *kvm, struct kvm_mmu_page *root)
>>> +{
>>> +     if (kvm_mmu_put_root(root))
>>> +             kvm_tdp_mmu_free_root(kvm, root);
>>> +}
>>> +
>>> +static void get_tdp_mmu_root(struct kvm *kvm, struct kvm_mmu_page *root)
>>> +{
>>> +     lockdep_assert_held(&kvm->mmu_lock);
>>> +
>>> +     kvm_mmu_get_root(root);
>>> +}
>>> +
>>> +static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
>>> +                                                int level)
>>> +{
>>> +     union kvm_mmu_page_role role;
>>> +
>>> +     role = vcpu->arch.mmu->mmu_role.base;
>>> +     role.level = vcpu->arch.mmu->shadow_root_level;
>>
>> role.level = level;
>> The role will be calculated for non root pages later.
> 
> Thank you for catching that Yu, that was definitely an error!
> I'm guessing this never showed up in my testing because I don't think
> the TDP MMU actually uses role.level for anything other than root
> pages.

I'll fix it up, thanks to both!

Paolo

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6b6dbc20ce23a..e0ec1dd271a32 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -989,6 +989,7 @@  struct kvm_arch {
 	 * operations.
 	 */
 	bool tdp_mmu_enabled;
+	struct list_head tdp_mmu_roots;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index f53d29e09367c..a3340ed59ad1d 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -144,11 +144,6 @@  module_param(dbg, bool, 0644);
 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
 			| shadow_x_mask | shadow_nx_mask | shadow_me_mask)
 
-#define ACC_EXEC_MASK    1
-#define ACC_WRITE_MASK   PT_WRITABLE_MASK
-#define ACC_USER_MASK    PT_USER_MASK
-#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
-
 /* The mask for the R/X bits in EPT PTEs */
 #define PT64_EPT_READABLE_MASK			0x1ull
 #define PT64_EPT_EXECUTABLE_MASK		0x4ull
@@ -209,7 +204,7 @@  struct kvm_shadow_walk_iterator {
 	     __shadow_walk_next(&(_walker), spte))
 
 static struct kmem_cache *pte_list_desc_cache;
-static struct kmem_cache *mmu_page_header_cache;
+struct kmem_cache *mmu_page_header_cache;
 static struct percpu_counter kvm_total_used_mmu_pages;
 
 static u64 __read_mostly shadow_nx_mask;
@@ -3588,9 +3583,13 @@  static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
 		return;
 
 	sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
-	--sp->root_count;
-	if (!sp->root_count && sp->role.invalid)
-		kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+
+	if (kvm_mmu_put_root(sp)) {
+		if (sp->tdp_mmu_page)
+			kvm_tdp_mmu_free_root(kvm, sp);
+		else if (sp->role.invalid)
+			kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+	}
 
 	*root_hpa = INVALID_PAGE;
 }
@@ -3680,8 +3679,16 @@  static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 	hpa_t root;
 	unsigned i;
 
-	if (shadow_root_level >= PT64_ROOT_4LEVEL) {
-		root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
+	if (vcpu->kvm->arch.tdp_mmu_enabled) {
+		root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
+
+		if (!VALID_PAGE(root))
+			return -ENOSPC;
+		vcpu->arch.mmu->root_hpa = root;
+	} else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
+		root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level,
+				      true);
+
 		if (!VALID_PAGE(root))
 			return -ENOSPC;
 		vcpu->arch.mmu->root_hpa = root;
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 74ccbf001a42e..6cedf578c9a8d 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -43,8 +43,12 @@  struct kvm_mmu_page {
 
 	/* Number of writes since the last time traversal visited this page.  */
 	atomic_t write_flooding_count;
+
+	bool tdp_mmu_page;
 };
 
+extern struct kmem_cache *mmu_page_header_cache;
+
 static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
 {
 	struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
@@ -96,6 +100,11 @@  bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
 	(PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
 						* PT64_LEVEL_BITS))) - 1))
 
+#define ACC_EXEC_MASK    1
+#define ACC_WRITE_MASK   PT_WRITABLE_MASK
+#define ACC_USER_MASK    PT_USER_MASK
+#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+
 /* Functions for interpreting SPTEs */
 static inline bool is_mmio_spte(u64 spte)
 {
@@ -126,4 +135,19 @@  static inline kvm_pfn_t spte_to_pfn(u64 pte)
 	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 }
 
+static inline void kvm_mmu_get_root(struct kvm_mmu_page *sp)
+{
+	BUG_ON(!sp->root_count);
+
+	++sp->root_count;
+}
+
+static inline bool kvm_mmu_put_root(struct kvm_mmu_page *sp)
+{
+	--sp->root_count;
+
+	return !sp->root_count;
+}
+
+
 #endif /* __KVM_X86_MMU_INTERNAL_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index b3809835e90b1..09a84a6e157b6 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1,5 +1,7 @@ 
 // SPDX-License-Identifier: GPL-2.0
 
+#include "mmu.h"
+#include "mmu_internal.h"
 #include "tdp_mmu.h"
 
 static bool __read_mostly tdp_mmu_enabled = false;
@@ -29,10 +31,122 @@  void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
 
 	/* This should not be changed for the lifetime of the VM. */
 	kvm->arch.tdp_mmu_enabled = true;
+
+	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
 }
 
 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
 {
 	if (!kvm->arch.tdp_mmu_enabled)
 		return;
+
+	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
+}
+
+#define for_each_tdp_mmu_root(_kvm, _root)			    \
+	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
+
+bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
+{
+	struct kvm_mmu_page *sp;
+
+	sp = to_shadow_page(hpa);
+
+	return sp->tdp_mmu_page && sp->root_count;
+}
+
+void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
+{
+	lockdep_assert_held(&kvm->mmu_lock);
+
+	WARN_ON(root->root_count);
+	WARN_ON(!root->tdp_mmu_page);
+
+	list_del(&root->link);
+
+	free_page((unsigned long)root->spt);
+	kmem_cache_free(mmu_page_header_cache, root);
+}
+
+static void put_tdp_mmu_root(struct kvm *kvm, struct kvm_mmu_page *root)
+{
+	if (kvm_mmu_put_root(root))
+		kvm_tdp_mmu_free_root(kvm, root);
+}
+
+static void get_tdp_mmu_root(struct kvm *kvm, struct kvm_mmu_page *root)
+{
+	lockdep_assert_held(&kvm->mmu_lock);
+
+	kvm_mmu_get_root(root);
+}
+
+static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
+						   int level)
+{
+	union kvm_mmu_page_role role;
+
+	role = vcpu->arch.mmu->mmu_role.base;
+	role.level = vcpu->arch.mmu->shadow_root_level;
+	role.direct = true;
+	role.gpte_is_8_bytes = true;
+	role.access = ACC_ALL;
+
+	return role;
+}
+
+static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
+					       int level)
+{
+	struct kvm_mmu_page *sp;
+
+	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
+	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
+	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+
+	sp->role.word = page_role_for_level(vcpu, level).word;
+	sp->gfn = gfn;
+	sp->tdp_mmu_page = true;
+
+	return sp;
+}
+
+static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
+{
+	union kvm_mmu_page_role role;
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_mmu_page *root;
+
+	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
+
+	spin_lock(&kvm->mmu_lock);
+
+	/* Check for an existing root before allocating a new one. */
+	for_each_tdp_mmu_root(kvm, root) {
+		if (root->role.word == role.word) {
+			get_tdp_mmu_root(kvm, root);
+			spin_unlock(&kvm->mmu_lock);
+			return root;
+		}
+	}
+
+	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
+	root->root_count = 1;
+
+	list_add(&root->link, &kvm->arch.tdp_mmu_roots);
+
+	spin_unlock(&kvm->mmu_lock);
+
+	return root;
+}
+
+hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu_page *root;
+
+	root = get_tdp_mmu_vcpu_root(vcpu);
+	if (!root)
+		return INVALID_PAGE;
+
+	return __pa(root->spt);
 }
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index cd4a562a70e9a..ac0ef91294420 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -7,4 +7,9 @@ 
 
 void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
+
+bool is_tdp_mmu_root(struct kvm *kvm, hpa_t root);
+hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
+void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root);
+
 #endif /* __KVM_X86_MMU_TDP_MMU_H */