diff mbox series

[v7,044/102] KVM: x86/mmu: Add a private pointer to struct kvm_mmu_page

Message ID 392839e09c48ff4e14a598ff6ed8bd56429bf17b.1656366338.git.isaku.yamahata@intel.com (mailing list archive)
State New, archived
Headers show
Series KVM TDX basic feature support | expand

Commit Message

Isaku Yamahata June 27, 2022, 9:53 p.m. UTC
From: Isaku Yamahata <isaku.yamahata@intel.com>

For private GPA, CPU refers a private page table whose contents are
encrypted.  The dedicated APIs to operate on it (e.g. updating/reading its
PTE entry) are used and their cost is expensive.

When KVM resolves KVM page fault, it walks the page tables.  To reuse the
existing KVM MMU code and mitigate the heavy cost to directly walk
encrypted private page table, allocate a more page to mirror the existing
KVM page table.  Resolve KVM page fault with the existing code, and do
additional operations necessary for the mirrored private page table.  To
distinguish such cases, the existing KVM page table is called a shared page
table (i.e. no mirrored private page table), and the KVM page table with
mirrored private page table is called a private page table.  The
relationship is depicted below.

Add private pointer to struct kvm_mmu_page for mirrored private page table
and add helper functions to allocate/initialize/free a mirrored private
page table page.  Also, add helper functions to check if a given
kvm_mmu_page is private.  The later patch introduces hooks to operate on
the mirrored private page table.

              KVM page fault                     |
                     |                           |
                     V                           |
        -------------+----------                 |
        |                      |                 |
        V                      V                 |
     shared GPA           private GPA            |
        |                      |                 |
        V                      V                 |
 CPU/KVM shared PT root  KVM private PT root     |  CPU private PT root
        |                      |                 |           |
        V                      V                 |           V
     shared PT            private PT <----mirror----> mirrored private PT
        |                      |                 |           |
        |                      \-----------------+------\    |
        |                                        |      |    |
        V                                        |      V    V
  shared guest page                              |    private guest page
                                                 |
                           non-encrypted memory  |    encrypted memory
                                                 |
PT: page table

Both CPU and KVM refer to CPU/KVM shared page table.  Private page table
is used only by KVM.  CPU refers to mirrored private page table.

Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/mmu/mmu.c          |  9 ++++
 arch/x86/kvm/mmu/mmu_internal.h | 84 +++++++++++++++++++++++++++++++++
 arch/x86/kvm/mmu/tdp_mmu.c      |  3 ++
 4 files changed, 97 insertions(+)

Comments

Huang, Kai July 1, 2022, 11:12 a.m. UTC | #1
On Mon, 2022-06-27 at 14:53 -0700, isaku.yamahata@intel.com wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
> 
> For private GPA, CPU refers a private page table whose contents are
> encrypted.  The dedicated APIs to operate on it (e.g. updating/reading its
> PTE entry) are used and their cost is expensive.
> 
> When KVM resolves KVM page fault, it walks the page tables.  To reuse the
> existing KVM MMU code and mitigate the heavy cost to directly walk
> encrypted private page table, allocate a more page to mirror the existing
> KVM page table.  Resolve KVM page fault with the existing code, and do
> additional operations necessary for the mirrored private page table.  To
> distinguish such cases, the existing KVM page table is called a shared page
> table (i.e. no mirrored private page table), and the KVM page table with
> mirrored private page table is called a private page table.  The
> relationship is depicted below.
> 
> Add private pointer to struct kvm_mmu_page for mirrored private page table
> and add helper functions to allocate/initialize/free a mirrored private
> page table page.  Also, add helper functions to check if a given
> kvm_mmu_page is private.  The later patch introduces hooks to operate on
> the mirrored private page table.
> 
>               KVM page fault                     |
>                      |                           |
>                      V                           |
>         -------------+----------                 |
>         |                      |                 |
>         V                      V                 |
>      shared GPA           private GPA            |
>         |                      |                 |
>         V                      V                 |
>  CPU/KVM shared PT root  KVM private PT root     |  CPU private PT root
>         |                      |                 |           |
>         V                      V                 |           V
>      shared PT            private PT <----mirror----> mirrored private PT
>         |                      |                 |           |
>         |                      \-----------------+------\    |
>         |                                        |      |    |
>         V                                        |      V    V
>   shared guest page                              |    private guest page
>                                                  |
>                            non-encrypted memory  |    encrypted memory
>                                                  |
> PT: page table
> 
> Both CPU and KVM refer to CPU/KVM shared page table.  Private page table
> is used only by KVM.  CPU refers to mirrored private page table.

Shouldn't the private page table maintained by KVM be "mirrored private PT"?

To me "mirrored" normally implies it is fake, or backup which isn't actually
used.  But here "mirrored private PT" is actually used by hardware.

And to me, "CPU and KVM" above are confusing.  For instance, "Both CPU and KVM
refer to CPU/KVM shared page table" took me at least one minute to understand,
with the help from the diagram -- otherwise I won't be able to understand.

I guess you can just say somewhere:

1) Shared PT is visible to KVM and it is used by CPU;
1) Private PT is used by CPU but it is invisible to KVM;
2) Mirrored private PT is visible to KVM but not used by CPU.  It is used to
mirror the actual private PT which is used by CPU.


[...]

> +
> +static inline void kvm_mmu_init_private_sp(struct kvm_mmu_page *sp, void *private_sp)
> +{
> +	sp->private_sp = private_sp;
> +}
> 

[...]

> @@ -295,6 +297,7 @@ static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
>  	sp->gfn = gfn;
>  	sp->ptep = sptep;
>  	sp->tdp_mmu_page = true;
> +	kvm_mmu_init_private_sp(sp);

Can this even compile?  Unless I am seeing mistakenly, kvm_mmu_init_private_sp()
(see above) has two arguments..

Please make sure each patch can at least compile and doesn't cause warning...
Yuan Yao July 11, 2022, 6:28 a.m. UTC | #2
On Mon, Jun 27, 2022 at 02:53:36PM -0700, isaku.yamahata@intel.com wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
>
> For private GPA, CPU refers a private page table whose contents are
> encrypted.  The dedicated APIs to operate on it (e.g. updating/reading its
> PTE entry) are used and their cost is expensive.
>
> When KVM resolves KVM page fault, it walks the page tables.  To reuse the
> existing KVM MMU code and mitigate the heavy cost to directly walk
> encrypted private page table, allocate a more page to mirror the existing
> KVM page table.  Resolve KVM page fault with the existing code, and do
> additional operations necessary for the mirrored private page table.  To
> distinguish such cases, the existing KVM page table is called a shared page
> table (i.e. no mirrored private page table), and the KVM page table with
> mirrored private page table is called a private page table.  The
> relationship is depicted below.
>
> Add private pointer to struct kvm_mmu_page for mirrored private page table
> and add helper functions to allocate/initialize/free a mirrored private
> page table page.  Also, add helper functions to check if a given
> kvm_mmu_page is private.  The later patch introduces hooks to operate on
> the mirrored private page table.
>
>               KVM page fault                     |
>                      |                           |
>                      V                           |
>         -------------+----------                 |
>         |                      |                 |
>         V                      V                 |
>      shared GPA           private GPA            |
>         |                      |                 |
>         V                      V                 |
>  CPU/KVM shared PT root  KVM private PT root     |  CPU private PT root
>         |                      |                 |           |
>         V                      V                 |           V
>      shared PT            private PT <----mirror----> mirrored private PT
>         |                      |                 |           |
>         |                      \-----------------+------\    |
>         |                                        |      |    |
>         V                                        |      V    V
>   shared guest page                              |    private guest page
>                                                  |
>                            non-encrypted memory  |    encrypted memory
>                                                  |
> PT: page table
>
> Both CPU and KVM refer to CPU/KVM shared page table.  Private page table
> is used only by KVM.  CPU refers to mirrored private page table.
>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  1 +
>  arch/x86/kvm/mmu/mmu.c          |  9 ++++
>  arch/x86/kvm/mmu/mmu_internal.h | 84 +++++++++++++++++++++++++++++++++
>  arch/x86/kvm/mmu/tdp_mmu.c      |  3 ++
>  4 files changed, 97 insertions(+)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index f4d4ed41641b..bfc934dc9a33 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -716,6 +716,7 @@ struct kvm_vcpu_arch {
>  	struct kvm_mmu_memory_cache mmu_shadow_page_cache;
>  	struct kvm_mmu_memory_cache mmu_gfn_array_cache;
>  	struct kvm_mmu_memory_cache mmu_page_header_cache;
> +	struct kvm_mmu_memory_cache mmu_private_sp_cache;
>
>  	/*
>  	 * QEMU userspace and the guest each have their own FPU state.
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index c517c7bca105..a5bf3e40e209 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -691,6 +691,13 @@ static int mmu_topup_shadow_page_cache(struct kvm_vcpu *vcpu)
>  	int start, end, i, r;
>  	bool is_tdp_mmu = is_tdp_mmu_enabled(vcpu->kvm);
>
> +	if (kvm_gfn_shared_mask(vcpu->kvm)) {
> +		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_private_sp_cache,
> +					       PT64_ROOT_MAX_LEVEL);
> +		if (r)
> +			return r;
> +	}
> +
>  	if (is_tdp_mmu && shadow_nonpresent_value)
>  		start = kvm_mmu_memory_cache_nr_free_objects(mc);
>
> @@ -732,6 +739,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
>  {
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
> +	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_private_sp_cache);
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache);
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
>  }
> @@ -1736,6 +1744,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct
>  	if (!direct)
>  		sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache);
>  	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
> +	kvm_mmu_init_private_sp(sp, NULL);
>
>  	/*
>  	 * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index 44a04fad4bed..9f3a6bea60a3 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -55,6 +55,10 @@ struct kvm_mmu_page {
>  	u64 *spt;
>  	/* hold the gfn of each spte inside spt */
>  	gfn_t *gfns;
> +#ifdef CONFIG_KVM_MMU_PRIVATE
> +	/* associated private shadow page, e.g. SEPT page. */
> +	void *private_sp;
> +#endif
>  	/* Currently serving as active root */
>  	union {
>  		int root_count;
> @@ -115,6 +119,86 @@ static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
>  	return kvm_mmu_role_as_id(sp->role);
>  }
>
> +/*
> + * TDX vcpu allocates page for root Secure EPT page and assigns to CPU secure

"TDX vcpu" is a little confused, how about "TDX moudule allocates(or manages) page
for ..." ?

> + * EPT pointer.  KVM doesn't need to allocate and link to the secure EPT.
> + * Dummy value to make is_pivate_sp() return true.
> + */
> +#define KVM_MMU_PRIVATE_SP_ROOT	((void *)1)
> +
> +#ifdef CONFIG_KVM_MMU_PRIVATE
> +static inline bool is_private_sp(struct kvm_mmu_page *sp)
> +{
> +	return !!sp->private_sp;
> +}
> +
> +static inline bool is_private_sptep(u64 *sptep)
> +{
> +	WARN_ON(!sptep);
> +	return is_private_sp(sptep_to_sp(sptep));
> +}
> +
> +static inline void *kvm_mmu_private_sp(struct kvm_mmu_page *sp)
> +{
> +	return sp->private_sp;
> +}
> +
> +static inline void kvm_mmu_init_private_sp(struct kvm_mmu_page *sp, void *private_sp)
> +{
> +	sp->private_sp = private_sp;
> +}
> +
> +/* Valid sp->role.level is required. */

I didn't see such requirement in kvm_mmu_alloc_private_sp(), please
consider to move the comment with the code that introduces such
requirement together.

> +static inline void kvm_mmu_alloc_private_sp(
> +	struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, bool is_root)
> +{
> +	if (is_root)
> +		sp->private_sp = KVM_MMU_PRIVATE_SP_ROOT;
> +	else
> +		sp->private_sp = kvm_mmu_memory_cache_alloc(
> +			&vcpu->arch.mmu_private_sp_cache);
> +	/*
> +	 * Because mmu_private_sp_cache is topped up before staring kvm page
> +	 * fault resolving, the allocation above shouldn't fail.
> +	 */
> +	WARN_ON_ONCE(!sp->private_sp);
> +}
> +
> +static inline void kvm_mmu_free_private_sp(struct kvm_mmu_page *sp)
> +{
> +	if (sp->private_sp != KVM_MMU_PRIVATE_SP_ROOT)
> +		free_page((unsigned long)sp->private_sp);
> +}
> +#else
> +static inline bool is_private_sp(struct kvm_mmu_page *sp)
> +{
> +	return false;
> +}
> +
> +static inline bool is_private_sptep(u64 *sptep)
> +{
> +	return false;
> +}
> +
> +static inline void *kvm_mmu_private_sp(struct kvm_mmu_page *sp)
> +{
> +	return NULL;
> +}
> +
> +static inline void kvm_mmu_init_private_sp(struct kvm_mmu_page *sp, void *private_sp)
> +{
> +}
> +
> +static inline void kvm_mmu_alloc_private_sp(
> +	struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, bool is_root)
> +{
> +}
> +
> +static inline void kvm_mmu_free_private_sp(struct kvm_mmu_page *sp)
> +{
> +}
> +#endif
> +
>  static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
>  {
>  	/*
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 7eb41b176d1e..b2568b062faa 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -72,6 +72,8 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
>
>  static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
>  {
> +	if (is_private_sp(sp))
> +		kvm_mmu_free_private_sp(sp);
>  	free_page((unsigned long)sp->spt);
>  	kmem_cache_free(mmu_page_header_cache, sp);
>  }
> @@ -295,6 +297,7 @@ static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
>  	sp->gfn = gfn;
>  	sp->ptep = sptep;
>  	sp->tdp_mmu_page = true;
> +	kvm_mmu_init_private_sp(sp);
>
>  	trace_kvm_mmu_get_page(sp, true);
>  }
> --
> 2.25.1
>
Isaku Yamahata July 19, 2022, 3:35 p.m. UTC | #3
On Fri, Jul 01, 2022 at 11:12:44PM +1200,
Kai Huang <kai.huang@intel.com> wrote:

> On Mon, 2022-06-27 at 14:53 -0700, isaku.yamahata@intel.com wrote:
> > From: Isaku Yamahata <isaku.yamahata@intel.com>
> > 
> > For private GPA, CPU refers a private page table whose contents are
> > encrypted.  The dedicated APIs to operate on it (e.g. updating/reading its
> > PTE entry) are used and their cost is expensive.
> > 
> > When KVM resolves KVM page fault, it walks the page tables.  To reuse the
> > existing KVM MMU code and mitigate the heavy cost to directly walk
> > encrypted private page table, allocate a more page to mirror the existing
> > KVM page table.  Resolve KVM page fault with the existing code, and do
> > additional operations necessary for the mirrored private page table.  To
> > distinguish such cases, the existing KVM page table is called a shared page
> > table (i.e. no mirrored private page table), and the KVM page table with
> > mirrored private page table is called a private page table.  The
> > relationship is depicted below.
> > 
> > Add private pointer to struct kvm_mmu_page for mirrored private page table
> > and add helper functions to allocate/initialize/free a mirrored private
> > page table page.  Also, add helper functions to check if a given
> > kvm_mmu_page is private.  The later patch introduces hooks to operate on
> > the mirrored private page table.
> > 
> >               KVM page fault                     |
> >                      |                           |
> >                      V                           |
> >         -------------+----------                 |
> >         |                      |                 |
> >         V                      V                 |
> >      shared GPA           private GPA            |
> >         |                      |                 |
> >         V                      V                 |
> >  CPU/KVM shared PT root  KVM private PT root     |  CPU private PT root
> >         |                      |                 |           |
> >         V                      V                 |           V
> >      shared PT            private PT <----mirror----> mirrored private PT
> >         |                      |                 |           |
> >         |                      \-----------------+------\    |
> >         |                                        |      |    |
> >         V                                        |      V    V
> >   shared guest page                              |    private guest page
> >                                                  |
> >                            non-encrypted memory  |    encrypted memory
> >                                                  |
> > PT: page table
> > 
> > Both CPU and KVM refer to CPU/KVM shared page table.  Private page table
> > is used only by KVM.  CPU refers to mirrored private page table.
> 
> Shouldn't the private page table maintained by KVM be "mirrored private PT"?
> 
> To me "mirrored" normally implies it is fake, or backup which isn't actually
> used.  But here "mirrored private PT" is actually used by hardware.
> 
> And to me, "CPU and KVM" above are confusing.  For instance, "Both CPU and KVM
> refer to CPU/KVM shared page table" took me at least one minute to understand,
> with the help from the diagram -- otherwise I won't be able to understand.
> 
> I guess you can just say somewhere:
> 
> 1) Shared PT is visible to KVM and it is used by CPU;
> 1) Private PT is used by CPU but it is invisible to KVM;
> 2) Mirrored private PT is visible to KVM but not used by CPU.  It is used to
> mirror the actual private PT which is used by CPU.

I removed "mirror" word and use protected for encrypted page table.


    KVM: x86/mmu: Add a private pointer to struct kvm_mmu_page
    
    For private GPA, CPU refers a private page table whose contents are
    encrypted.  The dedicated APIs to operate on it (e.g. updating/reading its
    PTE entry) are used and their cost is expensive.
    
    When KVM resolves KVM page fault, it walks the page tables.  To reuse the
    existing KVM MMU code and mitigate the heavy cost to directly walk
    protected (encrypted) page table, allocate one more page to copy the
    protected page table for KVM MMU code to directly walk.  Resolve KVM page
    fault with the existing code, and do additional operations necessary for
    the protected page table.  To distinguish such cases, the existing KVM page
    table is called a shared page table (i.e. not associated with protected
    page table), and the page table with protected page table is called a
    private page table.  The relationship is depicted below.
    
    Add a private pointer to struct kvm_mmu_page for protected page table and
    add helper functions to allocate/initialize/free a protected page table
    page.  Also, add helper functions to check if a given kvm_mmu_page is
    private.  The later patch introduces hooks to operate on the protected page
    table.
    
                  KVM page fault                     |
                         |                           |
                         V                           |
            -------------+----------                 |
            |                      |                 |
            V                      V                 |
         shared GPA           private GPA            |
            |                      |                 |
            V                      V                 |
        shared PT root      private PT root          |    protected PT root
            |                      |                 |           |
            V                      V                 |           V
         shared PT            private PT ----propagate----> protected PT
            |                      |                 |           |
            |                      \-----------------+------\    |
            |                                        |      |    |
            V                                        |      V    V
      shared guest page                              |    private guest page
                                                     |
                               non-encrypted memory  |    encrypted memory
                                                     |
    PT: page table
    - Shared PT is visible to KVM and it is used by CPU.
    - Protected PT is used by CPU but it is invisible to KVM.
    - Private PT is visible to KVM but not used by CPU.  It is used to
      propagate PT change to the actual protected PT which is used by CPU.
    
    Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
David Matlack July 28, 2022, 7:41 p.m. UTC | #4
On Mon, Jun 27, 2022 at 02:53:36PM -0700, isaku.yamahata@intel.com wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
> 
> For private GPA, CPU refers a private page table whose contents are
> encrypted.  The dedicated APIs to operate on it (e.g. updating/reading its
> PTE entry) are used and their cost is expensive.
> 
> When KVM resolves KVM page fault, it walks the page tables.  To reuse the
> existing KVM MMU code and mitigate the heavy cost to directly walk
> encrypted private page table, allocate a more page to mirror the existing
> KVM page table.  Resolve KVM page fault with the existing code, and do
> additional operations necessary for the mirrored private page table.  To
> distinguish such cases, the existing KVM page table is called a shared page
> table (i.e. no mirrored private page table), and the KVM page table with
> mirrored private page table is called a private page table.  The
> relationship is depicted below.
> 
> Add private pointer to struct kvm_mmu_page for mirrored private page table
> and add helper functions to allocate/initialize/free a mirrored private
> page table page.  Also, add helper functions to check if a given
> kvm_mmu_page is private.  The later patch introduces hooks to operate on
> the mirrored private page table.
> 
>               KVM page fault                     |
>                      |                           |
>                      V                           |
>         -------------+----------                 |
>         |                      |                 |
>         V                      V                 |
>      shared GPA           private GPA            |
>         |                      |                 |
>         V                      V                 |
>  CPU/KVM shared PT root  KVM private PT root     |  CPU private PT root
>         |                      |                 |           |
>         V                      V                 |           V
>      shared PT            private PT <----mirror----> mirrored private PT
>         |                      |                 |           |
>         |                      \-----------------+------\    |
>         |                                        |      |    |
>         V                                        |      V    V
>   shared guest page                              |    private guest page
>                                                  |
>                            non-encrypted memory  |    encrypted memory
>                                                  |
> PT: page table
> 
> Both CPU and KVM refer to CPU/KVM shared page table.  Private page table
> is used only by KVM.  CPU refers to mirrored private page table.
> 
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  1 +
>  arch/x86/kvm/mmu/mmu.c          |  9 ++++
>  arch/x86/kvm/mmu/mmu_internal.h | 84 +++++++++++++++++++++++++++++++++
>  arch/x86/kvm/mmu/tdp_mmu.c      |  3 ++
>  4 files changed, 97 insertions(+)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index f4d4ed41641b..bfc934dc9a33 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -716,6 +716,7 @@ struct kvm_vcpu_arch {
>  	struct kvm_mmu_memory_cache mmu_shadow_page_cache;
>  	struct kvm_mmu_memory_cache mmu_gfn_array_cache;
>  	struct kvm_mmu_memory_cache mmu_page_header_cache;
> +	struct kvm_mmu_memory_cache mmu_private_sp_cache;

I notice that mmu_private_sp_cache.gfp_zero is left unset so these pages
may contain garbage. Is this by design because the TDX module can't rely
on the contents being zero and has to take care of initializing the page
itself? i.e. GFP_ZERO would be a waste of cycles?

If I'm correct please include a comment here in the next revision to
explain why GFP_ZERO is not necessary.

>  
>  	/*
>  	 * QEMU userspace and the guest each have their own FPU state.
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index c517c7bca105..a5bf3e40e209 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -691,6 +691,13 @@ static int mmu_topup_shadow_page_cache(struct kvm_vcpu *vcpu)
>  	int start, end, i, r;
>  	bool is_tdp_mmu = is_tdp_mmu_enabled(vcpu->kvm);
>  
> +	if (kvm_gfn_shared_mask(vcpu->kvm)) {
> +		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_private_sp_cache,
> +					       PT64_ROOT_MAX_LEVEL);
> +		if (r)
> +			return r;
> +	}
> +
>  	if (is_tdp_mmu && shadow_nonpresent_value)
>  		start = kvm_mmu_memory_cache_nr_free_objects(mc);
>  
> @@ -732,6 +739,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
>  {
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
> +	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_private_sp_cache);
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache);
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
>  }
> @@ -1736,6 +1744,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct
>  	if (!direct)
>  		sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache);
>  	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
> +	kvm_mmu_init_private_sp(sp, NULL);

This is unnecessary. kvm_mmu_page structs are zero-initialized so
private_sp will already be NULL.

>  
>  	/*
>  	 * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index 44a04fad4bed..9f3a6bea60a3 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -55,6 +55,10 @@ struct kvm_mmu_page {
>  	u64 *spt;
>  	/* hold the gfn of each spte inside spt */
>  	gfn_t *gfns;
> +#ifdef CONFIG_KVM_MMU_PRIVATE
> +	/* associated private shadow page, e.g. SEPT page. */

Can we use "Secure EPT" instead of SEPT in KVM code and comments? (i.e.
also including variable names like sept_page -> secure_ept_page)

"SEPT" looks like a mispelling of SPTE, which is used all over KVM. It
will be difficult to read code that contains both acronyms.

> +	void *private_sp;

Please name this "private_spt" and move it up next to "spt".

sp" or "shadow page" is used to refer to kvm_mmu_page structs. For
example, look at all the code in KVM that uses `struct kvm_mmu_page *sp`.

"spt" is "shadow page table", i.e. the actual page table memory. See
kvm_mmu_page.spt. Calling this field "private_spt" makes it obvious that
this pointer is pointing to a page table.

Also please update the language in the comment accordingly to "private
shadow page table".

> +#endif
>  	/* Currently serving as active root */
>  	union {
>  		int root_count;
> @@ -115,6 +119,86 @@ static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
>  	return kvm_mmu_role_as_id(sp->role);
>  }
>  
> +/*
> + * TDX vcpu allocates page for root Secure EPT page and assigns to CPU secure
> + * EPT pointer.  KVM doesn't need to allocate and link to the secure EPT.
> + * Dummy value to make is_pivate_sp() return true.
> + */
> +#define KVM_MMU_PRIVATE_SP_ROOT	((void *)1)
> +
> +#ifdef CONFIG_KVM_MMU_PRIVATE
> +static inline bool is_private_sp(struct kvm_mmu_page *sp)
> +{
> +	return !!sp->private_sp;
> +}
> +
> +static inline bool is_private_sptep(u64 *sptep)
> +{
> +	WARN_ON(!sptep);
> +	return is_private_sp(sptep_to_sp(sptep));
> +}
> +
> +static inline void *kvm_mmu_private_sp(struct kvm_mmu_page *sp)
> +{
> +	return sp->private_sp;
> +}
> +
> +static inline void kvm_mmu_init_private_sp(struct kvm_mmu_page *sp, void *private_sp)
> +{
> +	sp->private_sp = private_sp;
> +}
> +
> +/* Valid sp->role.level is required. */
> +static inline void kvm_mmu_alloc_private_sp(
> +	struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, bool is_root)
> +{
> +	if (is_root)
> +		sp->private_sp = KVM_MMU_PRIVATE_SP_ROOT;
> +	else
> +		sp->private_sp = kvm_mmu_memory_cache_alloc(
> +			&vcpu->arch.mmu_private_sp_cache);
> +	/*
> +	 * Because mmu_private_sp_cache is topped up before staring kvm page
> +	 * fault resolving, the allocation above shouldn't fail.
> +	 */
> +	WARN_ON_ONCE(!sp->private_sp);
> +}
> +
> +static inline void kvm_mmu_free_private_sp(struct kvm_mmu_page *sp)
> +{
> +	if (sp->private_sp != KVM_MMU_PRIVATE_SP_ROOT)
> +		free_page((unsigned long)sp->private_sp);
> +}
> +#else
> +static inline bool is_private_sp(struct kvm_mmu_page *sp)
> +{
> +	return false;
> +}
> +
> +static inline bool is_private_sptep(u64 *sptep)
> +{
> +	return false;
> +}
> +
> +static inline void *kvm_mmu_private_sp(struct kvm_mmu_page *sp)
> +{
> +	return NULL;
> +}
> +
> +static inline void kvm_mmu_init_private_sp(struct kvm_mmu_page *sp, void *private_sp)
> +{
> +}
> +
> +static inline void kvm_mmu_alloc_private_sp(
> +	struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, bool is_root)
> +{
> +}
> +
> +static inline void kvm_mmu_free_private_sp(struct kvm_mmu_page *sp)
> +{
> +}
> +#endif
> +
>  static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
>  {
>  	/*
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 7eb41b176d1e..b2568b062faa 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -72,6 +72,8 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
>  
>  static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
>  {
> +	if (is_private_sp(sp))
> +		kvm_mmu_free_private_sp(sp);
>  	free_page((unsigned long)sp->spt);
>  	kmem_cache_free(mmu_page_header_cache, sp);
>  }
> @@ -295,6 +297,7 @@ static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
>  	sp->gfn = gfn;
>  	sp->ptep = sptep;
>  	sp->tdp_mmu_page = true;
> +	kvm_mmu_init_private_sp(sp);
>  
>  	trace_kvm_mmu_get_page(sp, true);
>  }
> -- 
> 2.25.1
>
David Matlack July 28, 2022, 8:13 p.m. UTC | #5
On Mon, Jun 27, 2022 at 02:53:36PM -0700, isaku.yamahata@intel.com wrote:
> From: Isaku Yamahata <isaku.yamahata@intel.com>
> 
> For private GPA, CPU refers a private page table whose contents are
> encrypted.  The dedicated APIs to operate on it (e.g. updating/reading its
> PTE entry) are used and their cost is expensive.
> 
> When KVM resolves KVM page fault, it walks the page tables.  To reuse the
> existing KVM MMU code and mitigate the heavy cost to directly walk
> encrypted private page table, allocate a more page to mirror the existing
> KVM page table.  Resolve KVM page fault with the existing code, and do
> additional operations necessary for the mirrored private page table.  To
> distinguish such cases, the existing KVM page table is called a shared page
> table (i.e. no mirrored private page table), and the KVM page table with
> mirrored private page table is called a private page table.  The
> relationship is depicted below.
> 
> Add private pointer to struct kvm_mmu_page for mirrored private page table
> and add helper functions to allocate/initialize/free a mirrored private
> page table page.  Also, add helper functions to check if a given
> kvm_mmu_page is private.  The later patch introduces hooks to operate on
> the mirrored private page table.
> 
>               KVM page fault                     |
>                      |                           |
>                      V                           |
>         -------------+----------                 |
>         |                      |                 |
>         V                      V                 |
>      shared GPA           private GPA            |
>         |                      |                 |
>         V                      V                 |
>  CPU/KVM shared PT root  KVM private PT root     |  CPU private PT root
>         |                      |                 |           |
>         V                      V                 |           V
>      shared PT            private PT <----mirror----> mirrored private PT
>         |                      |                 |           |
>         |                      \-----------------+------\    |
>         |                                        |      |    |
>         V                                        |      V    V
>   shared guest page                              |    private guest page
>                                                  |
>                            non-encrypted memory  |    encrypted memory
>                                                  |
> PT: page table
> 
> Both CPU and KVM refer to CPU/KVM shared page table.  Private page table
> is used only by KVM.  CPU refers to mirrored private page table.
> 
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  1 +
>  arch/x86/kvm/mmu/mmu.c          |  9 ++++
>  arch/x86/kvm/mmu/mmu_internal.h | 84 +++++++++++++++++++++++++++++++++
>  arch/x86/kvm/mmu/tdp_mmu.c      |  3 ++
>  4 files changed, 97 insertions(+)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index f4d4ed41641b..bfc934dc9a33 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -716,6 +716,7 @@ struct kvm_vcpu_arch {
>  	struct kvm_mmu_memory_cache mmu_shadow_page_cache;
>  	struct kvm_mmu_memory_cache mmu_gfn_array_cache;
>  	struct kvm_mmu_memory_cache mmu_page_header_cache;
> +	struct kvm_mmu_memory_cache mmu_private_sp_cache;
>  
>  	/*
>  	 * QEMU userspace and the guest each have their own FPU state.
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index c517c7bca105..a5bf3e40e209 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -691,6 +691,13 @@ static int mmu_topup_shadow_page_cache(struct kvm_vcpu *vcpu)
>  	int start, end, i, r;
>  	bool is_tdp_mmu = is_tdp_mmu_enabled(vcpu->kvm);
>  
> +	if (kvm_gfn_shared_mask(vcpu->kvm)) {
> +		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_private_sp_cache,
> +					       PT64_ROOT_MAX_LEVEL);
> +		if (r)
> +			return r;
> +	}
> +
>  	if (is_tdp_mmu && shadow_nonpresent_value)
>  		start = kvm_mmu_memory_cache_nr_free_objects(mc);
>  
> @@ -732,6 +739,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
>  {
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
> +	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_private_sp_cache);
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache);
>  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
>  }
> @@ -1736,6 +1744,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct
>  	if (!direct)
>  		sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache);
>  	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
> +	kvm_mmu_init_private_sp(sp, NULL);
>  
>  	/*
>  	 * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index 44a04fad4bed..9f3a6bea60a3 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -55,6 +55,10 @@ struct kvm_mmu_page {
>  	u64 *spt;
>  	/* hold the gfn of each spte inside spt */
>  	gfn_t *gfns;
> +#ifdef CONFIG_KVM_MMU_PRIVATE
> +	/* associated private shadow page, e.g. SEPT page. */
> +	void *private_sp;
> +#endif

write_flooding_count and unsync_children are only used in shadow MMU SPs
and private_sp is only used in TDP MMU SPs. So it seems like we could
put these together in a union and drop CONFIG_KVM_MMU_PRIVATE without
increasing the size of kvm_mmu_page. i.e.

	union {
		struct {
			unsigned int unsync_children;
			/* Number of writes since the last time traversal visited this page.  */
			atomic_t write_flooding_count;
		};
		/*
		 * The associated private shadow page table, e.g. for Secure EPT.
		 * Only valid if tdp_mmu_page is true.
		 */
		void *private_spt;
	};

Then change is_private_sp() to:

static inline bool is_private_sp(struct kvm_mmu_page *sp)
{
	return sp->tdp_mmu_page && sp->private_sp;
}

This will allow us to drop CONFIG_KVM_MMU_PRIVATE, the only benefit of
which I see is to avoid increasing the size of kvm_mmu_page. However
to actually realize that benefit Cloud vendors (for example) would have
to create separate kernel builds for TDX and non-TDX hosts, which seems
like a huge hassel.

>  	/* Currently serving as active root */
>  	union {
>  		int root_count;
> @@ -115,6 +119,86 @@ static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
>  	return kvm_mmu_role_as_id(sp->role);
>  }
>  
> +/*
> + * TDX vcpu allocates page for root Secure EPT page and assigns to CPU secure
> + * EPT pointer.  KVM doesn't need to allocate and link to the secure EPT.
> + * Dummy value to make is_pivate_sp() return true.
> + */
> +#define KVM_MMU_PRIVATE_SP_ROOT	((void *)1)
> +
> +#ifdef CONFIG_KVM_MMU_PRIVATE
> +static inline bool is_private_sp(struct kvm_mmu_page *sp)
> +{
> +	return !!sp->private_sp;
> +}
> +
> +static inline bool is_private_sptep(u64 *sptep)
> +{
> +	WARN_ON(!sptep);
> +	return is_private_sp(sptep_to_sp(sptep));
> +}
> +
> +static inline void *kvm_mmu_private_sp(struct kvm_mmu_page *sp)
> +{
> +	return sp->private_sp;
> +}
> +
> +static inline void kvm_mmu_init_private_sp(struct kvm_mmu_page *sp, void *private_sp)
> +{
> +	sp->private_sp = private_sp;
> +}
> +
> +/* Valid sp->role.level is required. */
> +static inline void kvm_mmu_alloc_private_sp(
> +	struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, bool is_root)
> +{
> +	if (is_root)
> +		sp->private_sp = KVM_MMU_PRIVATE_SP_ROOT;
> +	else
> +		sp->private_sp = kvm_mmu_memory_cache_alloc(
> +			&vcpu->arch.mmu_private_sp_cache);
> +	/*
> +	 * Because mmu_private_sp_cache is topped up before staring kvm page
> +	 * fault resolving, the allocation above shouldn't fail.
> +	 */
> +	WARN_ON_ONCE(!sp->private_sp);
> +}
> +
> +static inline void kvm_mmu_free_private_sp(struct kvm_mmu_page *sp)
> +{
> +	if (sp->private_sp != KVM_MMU_PRIVATE_SP_ROOT)
> +		free_page((unsigned long)sp->private_sp);
> +}
> +#else
> +static inline bool is_private_sp(struct kvm_mmu_page *sp)
> +{
> +	return false;
> +}
> +
> +static inline bool is_private_sptep(u64 *sptep)
> +{
> +	return false;
> +}
> +
> +static inline void *kvm_mmu_private_sp(struct kvm_mmu_page *sp)
> +{
> +	return NULL;
> +}
> +
> +static inline void kvm_mmu_init_private_sp(struct kvm_mmu_page *sp, void *private_sp)
> +{
> +}
> +
> +static inline void kvm_mmu_alloc_private_sp(
> +	struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, bool is_root)
> +{
> +}
> +
> +static inline void kvm_mmu_free_private_sp(struct kvm_mmu_page *sp)
> +{
> +}
> +#endif
> +
>  static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
>  {
>  	/*
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 7eb41b176d1e..b2568b062faa 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -72,6 +72,8 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
>  
>  static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
>  {
> +	if (is_private_sp(sp))
> +		kvm_mmu_free_private_sp(sp);
>  	free_page((unsigned long)sp->spt);
>  	kmem_cache_free(mmu_page_header_cache, sp);
>  }
> @@ -295,6 +297,7 @@ static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
>  	sp->gfn = gfn;
>  	sp->ptep = sptep;
>  	sp->tdp_mmu_page = true;
> +	kvm_mmu_init_private_sp(sp);
>  
>  	trace_kvm_mmu_get_page(sp, true);
>  }
> -- 
> 2.25.1
>
Isaku Yamahata Aug. 9, 2022, 11:50 p.m. UTC | #6
On Thu, Jul 28, 2022 at 01:13:35PM -0700,
David Matlack <dmatlack@google.com> wrote:

> On Mon, Jun 27, 2022 at 02:53:36PM -0700, isaku.yamahata@intel.com wrote:
> > From: Isaku Yamahata <isaku.yamahata@intel.com>
> > 
> > For private GPA, CPU refers a private page table whose contents are
> > encrypted.  The dedicated APIs to operate on it (e.g. updating/reading its
> > PTE entry) are used and their cost is expensive.
> > 
> > When KVM resolves KVM page fault, it walks the page tables.  To reuse the
> > existing KVM MMU code and mitigate the heavy cost to directly walk
> > encrypted private page table, allocate a more page to mirror the existing
> > KVM page table.  Resolve KVM page fault with the existing code, and do
> > additional operations necessary for the mirrored private page table.  To
> > distinguish such cases, the existing KVM page table is called a shared page
> > table (i.e. no mirrored private page table), and the KVM page table with
> > mirrored private page table is called a private page table.  The
> > relationship is depicted below.
> > 
> > Add private pointer to struct kvm_mmu_page for mirrored private page table
> > and add helper functions to allocate/initialize/free a mirrored private
> > page table page.  Also, add helper functions to check if a given
> > kvm_mmu_page is private.  The later patch introduces hooks to operate on
> > the mirrored private page table.
> > 
> >               KVM page fault                     |
> >                      |                           |
> >                      V                           |
> >         -------------+----------                 |
> >         |                      |                 |
> >         V                      V                 |
> >      shared GPA           private GPA            |
> >         |                      |                 |
> >         V                      V                 |
> >  CPU/KVM shared PT root  KVM private PT root     |  CPU private PT root
> >         |                      |                 |           |
> >         V                      V                 |           V
> >      shared PT            private PT <----mirror----> mirrored private PT
> >         |                      |                 |           |
> >         |                      \-----------------+------\    |
> >         |                                        |      |    |
> >         V                                        |      V    V
> >   shared guest page                              |    private guest page
> >                                                  |
> >                            non-encrypted memory  |    encrypted memory
> >                                                  |
> > PT: page table
> > 
> > Both CPU and KVM refer to CPU/KVM shared page table.  Private page table
> > is used only by KVM.  CPU refers to mirrored private page table.
> > 
> > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> > ---
> >  arch/x86/include/asm/kvm_host.h |  1 +
> >  arch/x86/kvm/mmu/mmu.c          |  9 ++++
> >  arch/x86/kvm/mmu/mmu_internal.h | 84 +++++++++++++++++++++++++++++++++
> >  arch/x86/kvm/mmu/tdp_mmu.c      |  3 ++
> >  4 files changed, 97 insertions(+)
> > 
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index f4d4ed41641b..bfc934dc9a33 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -716,6 +716,7 @@ struct kvm_vcpu_arch {
> >  	struct kvm_mmu_memory_cache mmu_shadow_page_cache;
> >  	struct kvm_mmu_memory_cache mmu_gfn_array_cache;
> >  	struct kvm_mmu_memory_cache mmu_page_header_cache;
> > +	struct kvm_mmu_memory_cache mmu_private_sp_cache;
> >  
> >  	/*
> >  	 * QEMU userspace and the guest each have their own FPU state.
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index c517c7bca105..a5bf3e40e209 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -691,6 +691,13 @@ static int mmu_topup_shadow_page_cache(struct kvm_vcpu *vcpu)
> >  	int start, end, i, r;
> >  	bool is_tdp_mmu = is_tdp_mmu_enabled(vcpu->kvm);
> >  
> > +	if (kvm_gfn_shared_mask(vcpu->kvm)) {
> > +		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_private_sp_cache,
> > +					       PT64_ROOT_MAX_LEVEL);
> > +		if (r)
> > +			return r;
> > +	}
> > +
> >  	if (is_tdp_mmu && shadow_nonpresent_value)
> >  		start = kvm_mmu_memory_cache_nr_free_objects(mc);
> >  
> > @@ -732,6 +739,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
> >  {
> >  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
> >  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
> > +	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_private_sp_cache);
> >  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache);
> >  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
> >  }
> > @@ -1736,6 +1744,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct
> >  	if (!direct)
> >  		sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache);
> >  	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
> > +	kvm_mmu_init_private_sp(sp, NULL);
> >  
> >  	/*
> >  	 * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
> > diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> > index 44a04fad4bed..9f3a6bea60a3 100644
> > --- a/arch/x86/kvm/mmu/mmu_internal.h
> > +++ b/arch/x86/kvm/mmu/mmu_internal.h
> > @@ -55,6 +55,10 @@ struct kvm_mmu_page {
> >  	u64 *spt;
> >  	/* hold the gfn of each spte inside spt */
> >  	gfn_t *gfns;
> > +#ifdef CONFIG_KVM_MMU_PRIVATE
> > +	/* associated private shadow page, e.g. SEPT page. */
> > +	void *private_sp;
> > +#endif
> 
> write_flooding_count and unsync_children are only used in shadow MMU SPs
> and private_sp is only used in TDP MMU SPs. So it seems like we could
> put these together in a union and drop CONFIG_KVM_MMU_PRIVATE without
> increasing the size of kvm_mmu_page. i.e.

I introduced KVM_MMU_PRIVATE as a alias to INTEL_TDX_HOST because I don't want
to use it in kvm/mmu and I'd like KVM_MMU_PRIVATE (a sort of) independent from
INTEL_TDX_HOST.  Anyway once the patch series is merged, we can drop
KVM_MMU_PRIVATE.


> 	union {
> 		struct {
> 			unsigned int unsync_children;
> 			/* Number of writes since the last time traversal visited this page.  */
> 			atomic_t write_flooding_count;
> 		};
> 		/*
> 		 * The associated private shadow page table, e.g. for Secure EPT.
> 		 * Only valid if tdp_mmu_page is true.
> 		 */
> 		void *private_spt;
> 	};
> 
> Then change is_private_sp() to:
> 
> static inline bool is_private_sp(struct kvm_mmu_page *sp)
> {
> 	return sp->tdp_mmu_page && sp->private_sp;
> }
> 
> This will allow us to drop CONFIG_KVM_MMU_PRIVATE, the only benefit of
> which I see is to avoid increasing the size of kvm_mmu_page. However
> to actually realize that benefit Cloud vendors (for example) would have
> to create separate kernel builds for TDX and non-TDX hosts, which seems
> like a huge hassel.

Good idea. I'll use union.
Isaku Yamahata Aug. 9, 2022, 11:52 p.m. UTC | #7
On Thu, Jul 28, 2022 at 12:41:51PM -0700,
David Matlack <dmatlack@google.com> wrote:

> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index f4d4ed41641b..bfc934dc9a33 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -716,6 +716,7 @@ struct kvm_vcpu_arch {
> >  	struct kvm_mmu_memory_cache mmu_shadow_page_cache;
> >  	struct kvm_mmu_memory_cache mmu_gfn_array_cache;
> >  	struct kvm_mmu_memory_cache mmu_page_header_cache;
> > +	struct kvm_mmu_memory_cache mmu_private_sp_cache;
> 
> I notice that mmu_private_sp_cache.gfp_zero is left unset so these pages
> may contain garbage. Is this by design because the TDX module can't rely
> on the contents being zero and has to take care of initializing the page
> itself? i.e. GFP_ZERO would be a waste of cycles?
> 
> If I'm correct please include a comment here in the next revision to
> explain why GFP_ZERO is not necessary.

Yes, exactly.  Here is the added comments.
 /*
  * This cache is to allocate pages used for Secure-EPT used by the TDX
  * module.  Because the TDX module doesn't trust VMM and initializes the
  * pages itself, KVM doesn't initialize them.  Allocate pages with
  * garbage and give them to the TDX module.
  */

> >  	/*
> >  	 * QEMU userspace and the guest each have their own FPU state.
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index c517c7bca105..a5bf3e40e209 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -691,6 +691,13 @@ static int mmu_topup_shadow_page_cache(struct kvm_vcpu *vcpu)
> >  	int start, end, i, r;
> >  	bool is_tdp_mmu = is_tdp_mmu_enabled(vcpu->kvm);
> >  
> > +	if (kvm_gfn_shared_mask(vcpu->kvm)) {
> > +		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_private_sp_cache,
> > +					       PT64_ROOT_MAX_LEVEL);
> > +		if (r)
> > +			return r;
> > +	}
> > +
> >  	if (is_tdp_mmu && shadow_nonpresent_value)
> >  		start = kvm_mmu_memory_cache_nr_free_objects(mc);
> >  
> > @@ -732,6 +739,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
> >  {
> >  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
> >  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
> > +	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_private_sp_cache);
> >  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache);
> >  	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
> >  }
> > @@ -1736,6 +1744,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct
> >  	if (!direct)
> >  		sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache);
> >  	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
> > +	kvm_mmu_init_private_sp(sp, NULL);
> 
> This is unnecessary. kvm_mmu_page structs are zero-initialized so
> private_sp will already be NULL.

Ok. 


> >  
> >  	/*
> >  	 * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
> > diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> > index 44a04fad4bed..9f3a6bea60a3 100644
> > --- a/arch/x86/kvm/mmu/mmu_internal.h
> > +++ b/arch/x86/kvm/mmu/mmu_internal.h
> > @@ -55,6 +55,10 @@ struct kvm_mmu_page {
> >  	u64 *spt;
> >  	/* hold the gfn of each spte inside spt */
> >  	gfn_t *gfns;
> > +#ifdef CONFIG_KVM_MMU_PRIVATE
> > +	/* associated private shadow page, e.g. SEPT page. */
> 
> Can we use "Secure EPT" instead of SEPT in KVM code and comments? (i.e.
> also including variable names like sept_page -> secure_ept_page)
> 
> "SEPT" looks like a mispelling of SPTE, which is used all over KVM. It
> will be difficult to read code that contains both acronyms.

Makes sense. Will update it.


> > +	void *private_sp;
> 
> Please name this "private_spt" and move it up next to "spt".
> 
> sp" or "shadow page" is used to refer to kvm_mmu_page structs. For
> example, look at all the code in KVM that uses `struct kvm_mmu_page *sp`.
> 
> "spt" is "shadow page table", i.e. the actual page table memory. See
> kvm_mmu_page.spt. Calling this field "private_spt" makes it obvious that
> this pointer is pointing to a page table.
> 
> Also please update the language in the comment accordingly to "private
> shadow page table".

I'll rename as follows

private_sp => private_spt
spet_page => private_spt
mmu_private_sp_cache => mmu_private_spt_cache
kvm_mmu_init_private_sp => kvm_mmu_inite_private_spt
kvm_mmu_alloc_private_sp => kvm_mmu_alloc_private_spt
kvm_mmu_free_private_sp => kvm_mmu_free_private_spt
kvm_alloc_private_sp_for_split => kvm_alloc_private_spt_for_split

Thanks,
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f4d4ed41641b..bfc934dc9a33 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -716,6 +716,7 @@  struct kvm_vcpu_arch {
 	struct kvm_mmu_memory_cache mmu_shadow_page_cache;
 	struct kvm_mmu_memory_cache mmu_gfn_array_cache;
 	struct kvm_mmu_memory_cache mmu_page_header_cache;
+	struct kvm_mmu_memory_cache mmu_private_sp_cache;
 
 	/*
 	 * QEMU userspace and the guest each have their own FPU state.
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index c517c7bca105..a5bf3e40e209 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -691,6 +691,13 @@  static int mmu_topup_shadow_page_cache(struct kvm_vcpu *vcpu)
 	int start, end, i, r;
 	bool is_tdp_mmu = is_tdp_mmu_enabled(vcpu->kvm);
 
+	if (kvm_gfn_shared_mask(vcpu->kvm)) {
+		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_private_sp_cache,
+					       PT64_ROOT_MAX_LEVEL);
+		if (r)
+			return r;
+	}
+
 	if (is_tdp_mmu && shadow_nonpresent_value)
 		start = kvm_mmu_memory_cache_nr_free_objects(mc);
 
@@ -732,6 +739,7 @@  static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 {
 	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
 	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
+	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_private_sp_cache);
 	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache);
 	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
 }
@@ -1736,6 +1744,7 @@  static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct
 	if (!direct)
 		sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache);
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+	kvm_mmu_init_private_sp(sp, NULL);
 
 	/*
 	 * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 44a04fad4bed..9f3a6bea60a3 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -55,6 +55,10 @@  struct kvm_mmu_page {
 	u64 *spt;
 	/* hold the gfn of each spte inside spt */
 	gfn_t *gfns;
+#ifdef CONFIG_KVM_MMU_PRIVATE
+	/* associated private shadow page, e.g. SEPT page. */
+	void *private_sp;
+#endif
 	/* Currently serving as active root */
 	union {
 		int root_count;
@@ -115,6 +119,86 @@  static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
 	return kvm_mmu_role_as_id(sp->role);
 }
 
+/*
+ * TDX vcpu allocates page for root Secure EPT page and assigns to CPU secure
+ * EPT pointer.  KVM doesn't need to allocate and link to the secure EPT.
+ * Dummy value to make is_pivate_sp() return true.
+ */
+#define KVM_MMU_PRIVATE_SP_ROOT	((void *)1)
+
+#ifdef CONFIG_KVM_MMU_PRIVATE
+static inline bool is_private_sp(struct kvm_mmu_page *sp)
+{
+	return !!sp->private_sp;
+}
+
+static inline bool is_private_sptep(u64 *sptep)
+{
+	WARN_ON(!sptep);
+	return is_private_sp(sptep_to_sp(sptep));
+}
+
+static inline void *kvm_mmu_private_sp(struct kvm_mmu_page *sp)
+{
+	return sp->private_sp;
+}
+
+static inline void kvm_mmu_init_private_sp(struct kvm_mmu_page *sp, void *private_sp)
+{
+	sp->private_sp = private_sp;
+}
+
+/* Valid sp->role.level is required. */
+static inline void kvm_mmu_alloc_private_sp(
+	struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, bool is_root)
+{
+	if (is_root)
+		sp->private_sp = KVM_MMU_PRIVATE_SP_ROOT;
+	else
+		sp->private_sp = kvm_mmu_memory_cache_alloc(
+			&vcpu->arch.mmu_private_sp_cache);
+	/*
+	 * Because mmu_private_sp_cache is topped up before staring kvm page
+	 * fault resolving, the allocation above shouldn't fail.
+	 */
+	WARN_ON_ONCE(!sp->private_sp);
+}
+
+static inline void kvm_mmu_free_private_sp(struct kvm_mmu_page *sp)
+{
+	if (sp->private_sp != KVM_MMU_PRIVATE_SP_ROOT)
+		free_page((unsigned long)sp->private_sp);
+}
+#else
+static inline bool is_private_sp(struct kvm_mmu_page *sp)
+{
+	return false;
+}
+
+static inline bool is_private_sptep(u64 *sptep)
+{
+	return false;
+}
+
+static inline void *kvm_mmu_private_sp(struct kvm_mmu_page *sp)
+{
+	return NULL;
+}
+
+static inline void kvm_mmu_init_private_sp(struct kvm_mmu_page *sp, void *private_sp)
+{
+}
+
+static inline void kvm_mmu_alloc_private_sp(
+	struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, bool is_root)
+{
+}
+
+static inline void kvm_mmu_free_private_sp(struct kvm_mmu_page *sp)
+{
+}
+#endif
+
 static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
 {
 	/*
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 7eb41b176d1e..b2568b062faa 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -72,6 +72,8 @@  void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
 
 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
 {
+	if (is_private_sp(sp))
+		kvm_mmu_free_private_sp(sp);
 	free_page((unsigned long)sp->spt);
 	kmem_cache_free(mmu_page_header_cache, sp);
 }
@@ -295,6 +297,7 @@  static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
 	sp->gfn = gfn;
 	sp->ptep = sptep;
 	sp->tdp_mmu_page = true;
+	kvm_mmu_init_private_sp(sp);
 
 	trace_kvm_mmu_get_page(sp, true);
 }