
[v4,03/20] KVM: x86/mmu: Derive shadow MMU page role from parent

Message ID: 20220422210546.458943-4-dmatlack@google.com
State: Superseded
Series: KVM: Extend Eager Page Splitting to the shadow MMU

Commit Message

David Matlack April 22, 2022, 9:05 p.m. UTC
Instead of computing the shadow page role from scratch for every new
page, derive most of the information from the parent shadow page.  This
avoids redundant calculations and reduces the number of parameters to
kvm_mmu_get_page().

Preemptively split out the role calculation to a separate function for
use in a following commit.

No functional change intended.

Reviewed-by: Peter Xu <peterx@redhat.com>
Signed-off-by: David Matlack <dmatlack@google.com>
---
 arch/x86/kvm/mmu/mmu.c         | 96 +++++++++++++++++++++++-----------
 arch/x86/kvm/mmu/paging_tmpl.h |  9 ++--
 2 files changed, 71 insertions(+), 34 deletions(-)

Comments

Sean Christopherson May 5, 2022, 9:50 p.m. UTC | #1
On Fri, Apr 22, 2022, David Matlack wrote:
> Instead of computing the shadow page role from scratch for every new
> page, derive most of the information from the parent shadow page.  This
> avoids redundant calculations and reduces the number of parameters to
> kvm_mmu_get_page().
> 
> Preemptively split out the role calculation to a separate function for
> use in a following commit.
> 
> No functional change intended.
> 
> Reviewed-by: Peter Xu <peterx@redhat.com>
> Signed-off-by: David Matlack <dmatlack@google.com>
> ---
>  arch/x86/kvm/mmu/mmu.c         | 96 +++++++++++++++++++++++-----------
>  arch/x86/kvm/mmu/paging_tmpl.h |  9 ++--
>  2 files changed, 71 insertions(+), 34 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index dc20eccd6a77..4249a771818b 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -2021,31 +2021,15 @@ static void clear_sp_write_flooding_count(u64 *spte)
>  	__clear_sp_write_flooding_count(sptep_to_sp(spte));
>  }
>  
> -static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> -					     gfn_t gfn,
> -					     gva_t gaddr,
> -					     unsigned level,
> -					     bool direct,
> -					     unsigned int access)
> +static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn,
> +					     union kvm_mmu_page_role role)
>  {
> -	union kvm_mmu_page_role role;
>  	struct hlist_head *sp_list;
> -	unsigned quadrant;
>  	struct kvm_mmu_page *sp;
>  	int ret;
>  	int collisions = 0;
>  	LIST_HEAD(invalid_list);
>  
> -	role = vcpu->arch.mmu->root_role;
> -	role.level = level;
> -	role.direct = direct;
> -	role.access = access;
> -	if (role.has_4_byte_gpte) {
> -		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
> -		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
> -		role.quadrant = quadrant;
> -	}
> -

When you rebase to kvm/queue, the helper will need to deal with

	if (level <= vcpu->arch.mmu->cpu_role.base.level)
		role.passthrough = 0;

KVM should never create a passthrough huge page, so I believe it's just a matter
of adding yet another boolean param to kvm_mmu_child_role().


>  	sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
>  	for_each_valid_sp(vcpu->kvm, sp, sp_list) {
>  		if (sp->gfn != gfn) {
> @@ -2063,7 +2047,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
>  			 * Unsync pages must not be left as is, because the new
>  			 * upper-level page will be write-protected.
>  			 */
> -			if (level > PG_LEVEL_4K && sp->unsync)
> +			if (role.level > PG_LEVEL_4K && sp->unsync)
>  				kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
>  							 &invalid_list);
>  			continue;

...

> @@ -3310,12 +3338,21 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
>  	return ret;
>  }
>  
> -static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
> +static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant,
>  			    u8 level, bool direct)
>  {
> +	union kvm_mmu_page_role role;
>  	struct kvm_mmu_page *sp;
>  
> -	sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
> +	role = vcpu->arch.mmu->root_role;
> +	role.level = level;
> +	role.direct = direct;
> +	role.access = ACC_ALL;
> +
> +	if (role.has_4_byte_gpte)
> +		role.quadrant = quadrant;

Maybe add a comment explaining that the PAE and 32-bit paging paths share a call
for allocating PDPTEs?  Otherwise it looks like passing a non-zero quadrant when
the guest doesn't have 4-byte PTEs would be a bug.

Hmm, even better, if the check is moved to the caller, then this can be:

	role.level = level;
	role.direct = direct;
	role.access = ACC_ALL;
	role.quadrant = quadrant;

	WARN_ON_ONCE(quadrant && !role.has_4_byte_gpte);
	WARN_ON_ONCE(direct && role.has_4_byte_gpte);

and no comment is necessary.

> +
> +	sp = kvm_mmu_get_page(vcpu, gfn, role);
>  	++sp->root_count;
>  
>  	return __pa(sp->spt);
> @@ -3349,8 +3386,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
>  		for (i = 0; i < 4; ++i) {
>  			WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
>  
> -			root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
> -					      i << 30, PT32_ROOT_LEVEL, true);
> +			root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), i,

The @quadrant here can be hardcoded to '0'; has_4_byte_gpte is guaranteed to be
false if the MMU is direct.  And then in the indirect path, set gva (and then
quadrant) based on 'i' iff the guest is using 32-bit paging.

Probably worth making it a separate patch just in case I'm forgetting something.
Lightly tested...

--
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 5 May 2022 14:19:35 -0700
Subject: [PATCH] KVM: x86/mmu: Pass '0' for @gva when allocating root with
 8-byte gpte

Pass '0' instead of the "real" gva when allocating a direct PAE root,
a.k.a. a direct PDPTE, and when allocating indirect roots that shadow
64-bit / 8-byte GPTEs.

The @gva is only needed if the root is shadowing 32-bit paging in the
guest, in which case KVM needs to use different shadow pages for each of
the two 4-byte GPTEs covered by KVM's 8-byte PAE SPTE.

For direct MMUs, there's obviously no shadowing, and for indirect MMUs that
shadow 64-bit / 8-byte GPTEs, guest PTEs and KVM's SPTEs are the same size,
so no quadrant is needed.

In anticipation of moving the quadrant logic into mmu_alloc_root(), WARN
if a non-zero @gva is passed for !4-byte GPTEs, and WARN if 4-byte GPTEs
are ever combined with a direct root (there's no shadowing, so TDP roots
should ignore the GPTE size).

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/mmu/mmu.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index dc20eccd6a77..6dfa3cfa8394 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3313,8 +3313,12 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
 static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
 			    u8 level, bool direct)
 {
+	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
 	struct kvm_mmu_page *sp;

+	WARN_ON_ONCE(gva && !role.has_4_byte_gpte);
+	WARN_ON_ONCE(direct && role.has_4_byte_gpte);
+
 	sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
 	++sp->root_count;

@@ -3349,8 +3353,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 		for (i = 0; i < 4; ++i) {
 			WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));

-			root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
-					      i << 30, PT32_ROOT_LEVEL, true);
+			root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), 0,
+					      PT32_ROOT_LEVEL, true);
 			mmu->pae_root[i] = root | PT_PRESENT_MASK |
 					   shadow_me_mask;
 		}
@@ -3435,6 +3439,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 	u64 pdptrs[4], pm_mask;
 	gfn_t root_gfn, root_pgd;
 	hpa_t root;
+	gva_t gva;
 	unsigned i;
 	int r;

@@ -3508,6 +3513,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 		}
 	}

+	gva = 0;
 	for (i = 0; i < 4; ++i) {
 		WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));

@@ -3517,9 +3523,11 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 				continue;
 			}
 			root_gfn = pdptrs[i] >> PAGE_SHIFT;
+		} else if (mmu->cpu_role.base.level == PT32_ROOT_LEVEL) {
+			gva = i << 30;
 		}

-		root = mmu_alloc_root(vcpu, root_gfn, i << 30,
+		root = mmu_alloc_root(vcpu, root_gfn, gva,
 				      PT32_ROOT_LEVEL, false);
 		mmu->pae_root[i] = root | pm_mask;
 	}

base-commit: 8bae380ad7dd3c31266d3685841ea4ce574d462d
--
Lai Jiangshan May 7, 2022, 8:28 a.m. UTC | #2
On 2022/4/23 05:05, David Matlack wrote:
> Instead of computing the shadow page role from scratch for every new
> page, derive most of the information from the parent shadow page.  This
> avoids redundant calculations and reduces the number of parameters to
> kvm_mmu_get_page().
> 
> Preemptively split out the role calculation to a separate function for
> use in a following commit.
> 
> No functional change intended.
> 
> Reviewed-by: Peter Xu <peterx@redhat.com>
> Signed-off-by: David Matlack <dmatlack@google.com>
> ---
>   arch/x86/kvm/mmu/mmu.c         | 96 +++++++++++++++++++++++-----------
>   arch/x86/kvm/mmu/paging_tmpl.h |  9 ++--
>   2 files changed, 71 insertions(+), 34 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index dc20eccd6a77..4249a771818b 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -2021,31 +2021,15 @@ static void clear_sp_write_flooding_count(u64 *spte)
>   	__clear_sp_write_flooding_count(sptep_to_sp(spte));
>   }
>   
> -static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> -					     gfn_t gfn,
> -					     gva_t gaddr,
> -					     unsigned level,
> -					     bool direct,
> -					     unsigned int access)
> +static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn,
> +					     union kvm_mmu_page_role role)
>   {
> -	union kvm_mmu_page_role role;
>   	struct hlist_head *sp_list;
> -	unsigned quadrant;
>   	struct kvm_mmu_page *sp;
>   	int ret;
>   	int collisions = 0;
>   	LIST_HEAD(invalid_list);
>   
> -	role = vcpu->arch.mmu->root_role;
> -	role.level = level;
> -	role.direct = direct;
> -	role.access = access;
> -	if (role.has_4_byte_gpte) {
> -		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
> -		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
> -		role.quadrant = quadrant;
> -	}


I don't think replacing it with kvm_mmu_child_role() can reduce any calculations.

role.level, role.direct, role.access and role.quadrant still need to be
calculated.  And the old code is still in mmu_alloc_root().

I think kvm_mmu_get_shadow_page() can keep those parameters and
kvm_mmu_child_role() is only introduced for nested_mmu_get_sp_for_split().

Both kvm_mmu_get_shadow_page() and nested_mmu_get_sp_for_split() call
__kvm_mmu_get_shadow_page(), which takes the role as a parameter.
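
A rough sketch of what that split could look like (the function signatures
below are illustrative only, not taken from the actual series):

static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
						     gfn_t gfn, gva_t gaddr,
						     unsigned int level,
						     bool direct,
						     unsigned int access)
{
	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
	unsigned int quadrant;

	/* vCPU path keeps the old parameters and the old role computation. */
	role.level = level;
	role.direct = direct;
	role.access = access;
	if (role.has_4_byte_gpte) {
		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
		role.quadrant = quadrant;
	}

	return __kvm_mmu_get_shadow_page(vcpu, gfn, role);
}

static struct kvm_mmu_page *nested_mmu_get_sp_for_split(struct kvm_vcpu *vcpu,
							 u64 *sptep, gfn_t gfn,
							 bool direct,
							 unsigned int access)
{
	/* Eager page splitting derives the role from the parent SP instead. */
	union kvm_mmu_page_role role = kvm_mmu_child_role(sptep, direct, access);

	return __kvm_mmu_get_shadow_page(vcpu, gfn, role);
}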



> -
>   	sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
>   	for_each_valid_sp(vcpu->kvm, sp, sp_list) {
>   		if (sp->gfn != gfn) {
> @@ -2063,7 +2047,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
>   			 * Unsync pages must not be left as is, because the new
>   			 * upper-level page will be write-protected.
>   			 */
> -			if (level > PG_LEVEL_4K && sp->unsync)
> +			if (role.level > PG_LEVEL_4K && sp->unsync)
>   				kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
>   							 &invalid_list);
>   			continue;
> @@ -2104,14 +2088,14 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
>   
>   	++vcpu->kvm->stat.mmu_cache_miss;
>   
> -	sp = kvm_mmu_alloc_page(vcpu, direct);
> +	sp = kvm_mmu_alloc_page(vcpu, role.direct);
>   
>   	sp->gfn = gfn;
>   	sp->role = role;
>   	hlist_add_head(&sp->hash_link, sp_list);
> -	if (!direct) {
> +	if (!role.direct) {
>   		account_shadowed(vcpu->kvm, sp);
> -		if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn))
> +		if (role.level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn))
>   			kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
>   	}
>   	trace_kvm_mmu_get_page(sp, true);
> @@ -2123,6 +2107,51 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
>   	return sp;
>   }
>   
> +static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct, u32 access)
> +{
> +	struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep);
> +	union kvm_mmu_page_role role;
> +
> +	role = parent_sp->role;
> +	role.level--;
> +	role.access = access;
> +	role.direct = direct;
> +
> +	/*
> +	 * If the guest has 4-byte PTEs then that means it's using 32-bit,
> +	 * 2-level, non-PAE paging. KVM shadows such guests using 4 PAE page
> +	 * directories, each mapping 1/4 of the guest's linear address space
> +	 * (1GiB). The shadow pages for those 4 page directories are
> +	 * pre-allocated and assigned a separate quadrant in their role.


It is not going to be true with this patchset:
[PATCH V2 0/7] KVM: X86/MMU: Use one-off special shadow page for special roots

https://lore.kernel.org/lkml/20220503150735.32723-1-jiangshanlai@gmail.com/

The shadow pages for those 4 page directories are also allocated on demand.
David Matlack May 9, 2022, 9:04 p.m. UTC | #3
On Sat, May 7, 2022 at 1:28 AM Lai Jiangshan <jiangshanlai@gmail.com> wrote:
>
>
>
> On 2022/4/23 05:05, David Matlack wrote:
> > Instead of computing the shadow page role from scratch for every new
> > page, derive most of the information from the parent shadow page.  This
> > avoids redundant calculations and reduces the number of parameters to
> > kvm_mmu_get_page().
> >
> > Preemptively split out the role calculation to a separate function for
> > use in a following commit.
> >
> > No functional change intended.
> >
> > Reviewed-by: Peter Xu <peterx@redhat.com>
> > Signed-off-by: David Matlack <dmatlack@google.com>
> > ---
> >   arch/x86/kvm/mmu/mmu.c         | 96 +++++++++++++++++++++++-----------
> >   arch/x86/kvm/mmu/paging_tmpl.h |  9 ++--
> >   2 files changed, 71 insertions(+), 34 deletions(-)
> >
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index dc20eccd6a77..4249a771818b 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -2021,31 +2021,15 @@ static void clear_sp_write_flooding_count(u64 *spte)
> >       __clear_sp_write_flooding_count(sptep_to_sp(spte));
> >   }
> >
> > -static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> > -                                          gfn_t gfn,
> > -                                          gva_t gaddr,
> > -                                          unsigned level,
> > -                                          bool direct,
> > -                                          unsigned int access)
> > +static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn,
> > +                                          union kvm_mmu_page_role role)
> >   {
> > -     union kvm_mmu_page_role role;
> >       struct hlist_head *sp_list;
> > -     unsigned quadrant;
> >       struct kvm_mmu_page *sp;
> >       int ret;
> >       int collisions = 0;
> >       LIST_HEAD(invalid_list);
> >
> > -     role = vcpu->arch.mmu->root_role;
> > -     role.level = level;
> > -     role.direct = direct;
> > -     role.access = access;
> > -     if (role.has_4_byte_gpte) {
> > -             quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
> > -             quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
> > -             role.quadrant = quadrant;
> > -     }
>
>
> I don't think replacing it with kvm_mmu_child_role() can reduce any calculations.
>
> role.level, role.direct, role.access and role.quadrant still need to be
> calculated.  And the old code is still in mmu_alloc_root().

You are correct. Instead of saying "less calculations" I should have
said "eliminates the dependency on vcpu->arch.mmu->root_role".

>
> I think kvm_mmu_get_shadow_page() can keep the those parameters and
> kvm_mmu_child_role() is only introduced for nested_mmu_get_sp_for_split().
>
> Both kvm_mmu_get_shadow_page() and nested_mmu_get_sp_for_split() call
> __kvm_mmu_get_shadow_page() which uses role as a parameter.

I agree this would work, but I think the end result would be more
difficult to read.

The way I've implemented it, there are two ways the SP roles are calculated:

1. For root SPs: Derive the role from vCPU root_role and caller-provided inputs.
2. For child SPs: Derive the role from parent SP and caller-provided inputs.

Your proposal would still have two ways to calculate the SP role, but
split along a different dimension:

1. For vCPUs creating SPs: Derive the role from vCPU root_role and
caller-provided inputs.
2. For Eager Page Splitting creating SPs: Derive the role from parent
SP and caller-provided inputs.

With your proposal, it is less obvious that eager page splitting is
going to end up with the correct role. Whereas if we use the same
calculation for all child SPs, it is immediately obvious.
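
For reference, here is a condensed side-by-side sketch of those two
derivations, paraphrased from the hunks in this patch (not verbatim quotes):

	/* 1. Root SPs: start from the vCPU's root_role (mmu_alloc_root()). */
	role = vcpu->arch.mmu->root_role;
	role.level = level;
	role.direct = direct;
	role.access = ACC_ALL;
	if (role.has_4_byte_gpte)
		role.quadrant = quadrant;

	/* 2. Child SPs: start from the parent SP's role (kvm_mmu_child_role()). */
	parent_sp = sptep_to_sp(sptep);
	role = parent_sp->role;
	role.level--;
	role.access = access;
	role.direct = direct;
	if (role.has_4_byte_gpte)
		role.quadrant = (sptep - parent_sp->spt) % 2;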

>
>
>
> > -
> >       sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
> >       for_each_valid_sp(vcpu->kvm, sp, sp_list) {
> >               if (sp->gfn != gfn) {
> > @@ -2063,7 +2047,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> >                        * Unsync pages must not be left as is, because the new
> >                        * upper-level page will be write-protected.
> >                        */
> > -                     if (level > PG_LEVEL_4K && sp->unsync)
> > +                     if (role.level > PG_LEVEL_4K && sp->unsync)
> >                               kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
> >                                                        &invalid_list);
> >                       continue;
> > @@ -2104,14 +2088,14 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> >
> >       ++vcpu->kvm->stat.mmu_cache_miss;
> >
> > -     sp = kvm_mmu_alloc_page(vcpu, direct);
> > +     sp = kvm_mmu_alloc_page(vcpu, role.direct);
> >
> >       sp->gfn = gfn;
> >       sp->role = role;
> >       hlist_add_head(&sp->hash_link, sp_list);
> > -     if (!direct) {
> > +     if (!role.direct) {
> >               account_shadowed(vcpu->kvm, sp);
> > -             if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn))
> > +             if (role.level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn))
> >                       kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
> >       }
> >       trace_kvm_mmu_get_page(sp, true);
> > @@ -2123,6 +2107,51 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> >       return sp;
> >   }
> >
> > +static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct, u32 access)
> > +{
> > +     struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep);
> > +     union kvm_mmu_page_role role;
> > +
> > +     role = parent_sp->role;
> > +     role.level--;
> > +     role.access = access;
> > +     role.direct = direct;
> > +
> > +     /*
> > +      * If the guest has 4-byte PTEs then that means it's using 32-bit,
> > +      * 2-level, non-PAE paging. KVM shadows such guests using 4 PAE page
> > +      * directories, each mapping 1/4 of the guest's linear address space
> > +      * (1GiB). The shadow pages for those 4 page directories are
> > +      * pre-allocated and assigned a separate quadrant in their role.
>
>
> It is not going to be true in patchset:
> [PATCH V2 0/7] KVM: X86/MMU: Use one-off special shadow page for special roots
>
> https://lore.kernel.org/lkml/20220503150735.32723-1-jiangshanlai@gmail.com/
>
> The shadow pages for those 4 page directories are also allocated on demand.

Ack. I can even just drop this sentence in v5; it's just background information.
David Matlack May 9, 2022, 10:10 p.m. UTC | #4
On Thu, May 5, 2022 at 2:50 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Fri, Apr 22, 2022, David Matlack wrote:
> > Instead of computing the shadow page role from scratch for every new
> > page, derive most of the information from the parent shadow page.  This
> > avoids redundant calculations and reduces the number of parameters to
> > kvm_mmu_get_page().
> >
> > Preemptively split out the role calculation to a separate function for
> > use in a following commit.
> >
> > No functional change intended.
> >
> > Reviewed-by: Peter Xu <peterx@redhat.com>
> > Signed-off-by: David Matlack <dmatlack@google.com>
> > ---
> >  arch/x86/kvm/mmu/mmu.c         | 96 +++++++++++++++++++++++-----------
> >  arch/x86/kvm/mmu/paging_tmpl.h |  9 ++--
> >  2 files changed, 71 insertions(+), 34 deletions(-)
> >
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index dc20eccd6a77..4249a771818b 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -2021,31 +2021,15 @@ static void clear_sp_write_flooding_count(u64 *spte)
> >       __clear_sp_write_flooding_count(sptep_to_sp(spte));
> >  }
> >
> > -static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> > -                                          gfn_t gfn,
> > -                                          gva_t gaddr,
> > -                                          unsigned level,
> > -                                          bool direct,
> > -                                          unsigned int access)
> > +static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn,
> > +                                          union kvm_mmu_page_role role)
> >  {
> > -     union kvm_mmu_page_role role;
> >       struct hlist_head *sp_list;
> > -     unsigned quadrant;
> >       struct kvm_mmu_page *sp;
> >       int ret;
> >       int collisions = 0;
> >       LIST_HEAD(invalid_list);
> >
> > -     role = vcpu->arch.mmu->root_role;
> > -     role.level = level;
> > -     role.direct = direct;
> > -     role.access = access;
> > -     if (role.has_4_byte_gpte) {
> > -             quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
> > -             quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
> > -             role.quadrant = quadrant;
> > -     }
> > -
>
> When you rebase to kvm/queue, the helper will need to deal with
>
>         if (level <= vcpu->arch.mmu->cpu_role.base.level)
>                 role.passthrough = 0;
>
> KVM should never create a passthrough huge page, so I believe it's just a matter
> of adding yet another boolean param to kvm_mmu_child_role().

+Lai Jiangshan

It looks like only root pages can be passthrough, so
kvm_mmu_child_role() can hard-code passthrough to 0. Lai, do you agree?
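
Illustrative sketch only (assuming the kvm/queue role layout where the
passthrough bit exists): kvm_mmu_child_role() would simply clear it alongside
the other per-child fields:

	role = parent_sp->role;
	role.level--;
	role.access = access;
	role.direct = direct;
	/* Child SPs are never passthrough; only roots can be. */
	role.passthrough = 0;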

>
>
> >       sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
> >       for_each_valid_sp(vcpu->kvm, sp, sp_list) {
> >               if (sp->gfn != gfn) {
> > @@ -2063,7 +2047,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> >                        * Unsync pages must not be left as is, because the new
> >                        * upper-level page will be write-protected.
> >                        */
> > -                     if (level > PG_LEVEL_4K && sp->unsync)
> > +                     if (role.level > PG_LEVEL_4K && sp->unsync)
> >                               kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
> >                                                        &invalid_list);
> >                       continue;
>
> ...
>
> > @@ -3310,12 +3338,21 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
> >       return ret;
> >  }
> >
> > -static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
> > +static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant,
> >                           u8 level, bool direct)
> >  {
> > +     union kvm_mmu_page_role role;
> >       struct kvm_mmu_page *sp;
> >
> > -     sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
> > +     role = vcpu->arch.mmu->root_role;
> > +     role.level = level;
> > +     role.direct = direct;
> > +     role.access = ACC_ALL;
> > +
> > +     if (role.has_4_byte_gpte)
> > +             role.quadrant = quadrant;
>
> Maybe add a comment explaining the PAE and 32-bit paging paths share a call for
> allocating PDPTEs?  Otherwise it looks like passing a non-zero quadrant when the
> guest doesn't have 4-byte PTEs should be a bug.
>
> Hmm, even better, if the check is moved to the caller, then this can be:
>
>         role.level = level;
>         role.direct = direct;
>         role.access = ACC_ALL;
>         role.quadrant = quadrant;
>
>         WARN_ON_ONCE(quadrant && !role.has_4_byte_gpte));
>         WARN_ON_ONCE(direct && role.has_4_byte_gpte));
>
> and no comment is necessary.

Sure.

>
> > +
> > +     sp = kvm_mmu_get_page(vcpu, gfn, role);
> >       ++sp->root_count;
> >
> >       return __pa(sp->spt);
> > @@ -3349,8 +3386,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
> >               for (i = 0; i < 4; ++i) {
> >                       WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
> >
> > -                     root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
> > -                                           i << 30, PT32_ROOT_LEVEL, true);
> > +                     root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), i,
>
> The @quadrant here can be hardcoded to '0', has_4_byte_gpte is guaranteed to be
> false if the MMU is direct.  And then in the indirect path, set gva (and then
> quadrant) based on 'i' iff the guest is using 32-bit paging.
>
> Probably worth making it a separate patch just in case I'm forgetting something.
> Lightly tested...

Looks sane. I'll incorporate something like this in v5.

>
> --
> From: Sean Christopherson <seanjc@google.com>
> Date: Thu, 5 May 2022 14:19:35 -0700
> Subject: [PATCH] KVM: x86/mmu: Pass '0' for @gva when allocating root with
>  8-byte gpte
>
> Pass '0' instead of the "real" gva when allocating a direct PAE root,
> a.k.a. a direct PDPTE, and when allocating indirect roots that shadow
> 64-bit / 8-byte GPTEs.
>
> Thee @gva is only needed if the root is shadowing 32-bit paging in the
> guest, in which case KVM needs to use different shadow pages for each of
> the two 4-byte GPTEs covered by KVM's 8-byte PAE SPTE.
>
> For direct MMUs, there's obviously no shadowing, and for indirect MMU
>
> In anticipation of moving the quadrant logic into mmu_alloc_root(), WARN
> if a non-zero @gva is passed for !4-byte GPTEs, and WARN if 4-byte GPTEs
> are ever combined with a direct root (there's no shadowing, so TDP roots
> should ignore the GPTE size).
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  arch/x86/kvm/mmu/mmu.c | 14 +++++++++++---
>  1 file changed, 11 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index dc20eccd6a77..6dfa3cfa8394 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3313,8 +3313,12 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
>  static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
>                             u8 level, bool direct)
>  {
> +       union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
>         struct kvm_mmu_page *sp;
>
> +       WARN_ON_ONCE(gva && !role.has_4_byte_gpte);
> +       WARN_ON_ONCE(direct && role.has_4_byte_gpte);
> +
>         sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
>         ++sp->root_count;
>
> @@ -3349,8 +3353,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
>                 for (i = 0; i < 4; ++i) {
>                         WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
>
> -                       root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
> -                                             i << 30, PT32_ROOT_LEVEL, true);
> +                       root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), 0,
> +                                             PT32_ROOT_LEVEL, true);
>                         mmu->pae_root[i] = root | PT_PRESENT_MASK |
>                                            shadow_me_mask;
>                 }
> @@ -3435,6 +3439,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
>         u64 pdptrs[4], pm_mask;
>         gfn_t root_gfn, root_pgd;
>         hpa_t root;
> +       gva_t gva;
>         unsigned i;
>         int r;
>
> @@ -3508,6 +3513,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
>                 }
>         }
>
> +       gva = 0;
>         for (i = 0; i < 4; ++i) {
>                 WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
>
> @@ -3517,9 +3523,11 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
>                                 continue;
>                         }
>                         root_gfn = pdptrs[i] >> PAGE_SHIFT;
> +               } else if (mmu->cpu_role.base.level == PT32_ROOT_LEVEL) {
> +                       gva = i << 30;
>                 }
>
> -               root = mmu_alloc_root(vcpu, root_gfn, i << 30,
> +               root = mmu_alloc_root(vcpu, root_gfn, gva,
>                                       PT32_ROOT_LEVEL, false);
>                 mmu->pae_root[i] = root | pm_mask;
>         }
>
> base-commit: 8bae380ad7dd3c31266d3685841ea4ce574d462d
> --
>
Lai Jiangshan May 10, 2022, 2:38 a.m. UTC | #5
On Tue, May 10, 2022 at 6:10 AM David Matlack <dmatlack@google.com> wrote:

>
> +Lai Jiangshan
>
> It looks like only root pages can be passthrough, so
> kvm_mmu_child_role() can hard-code passthrough to 0. Lai do you agree?
>


Agree.

Thanks
Lai
Lai Jiangshan May 10, 2022, 2:58 a.m. UTC | #6

On Tue, May 10, 2022 at 5:04 AM David Matlack <dmatlack@google.com> wrote:
>
> On Sat, May 7, 2022 at 1:28 AM Lai Jiangshan <jiangshanlai@gmail.com> wrote:
> >
> >
> >
> > On 2022/4/23 05:05, David Matlack wrote:
> > > Instead of computing the shadow page role from scratch for every new
> > > page, derive most of the information from the parent shadow page.  This
> > > avoids redundant calculations and reduces the number of parameters to
> > > kvm_mmu_get_page().
> > >
> > > Preemptively split out the role calculation to a separate function for
> > > use in a following commit.
> > >
> > > No functional change intended.
> > >
> > > Reviewed-by: Peter Xu <peterx@redhat.com>
> > > Signed-off-by: David Matlack <dmatlack@google.com>
> > > ---
> > >   arch/x86/kvm/mmu/mmu.c         | 96 +++++++++++++++++++++++-----------
> > >   arch/x86/kvm/mmu/paging_tmpl.h |  9 ++--
> > >   2 files changed, 71 insertions(+), 34 deletions(-)
> > >
> > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > index dc20eccd6a77..4249a771818b 100644
> > > --- a/arch/x86/kvm/mmu/mmu.c
> > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > @@ -2021,31 +2021,15 @@ static void clear_sp_write_flooding_count(u64 *spte)
> > >       __clear_sp_write_flooding_count(sptep_to_sp(spte));
> > >   }
> > >
> > > -static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> > > -                                          gfn_t gfn,
> > > -                                          gva_t gaddr,
> > > -                                          unsigned level,
> > > -                                          bool direct,
> > > -                                          unsigned int access)
> > > +static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn,
> > > +                                          union kvm_mmu_page_role role)
> > >   {
> > > -     union kvm_mmu_page_role role;
> > >       struct hlist_head *sp_list;
> > > -     unsigned quadrant;
> > >       struct kvm_mmu_page *sp;
> > >       int ret;
> > >       int collisions = 0;
> > >       LIST_HEAD(invalid_list);
> > >
> > > -     role = vcpu->arch.mmu->root_role;
> > > -     role.level = level;
> > > -     role.direct = direct;
> > > -     role.access = access;
> > > -     if (role.has_4_byte_gpte) {
> > > -             quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
> > > -             quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
> > > -             role.quadrant = quadrant;
> > > -     }
> >
> >
> > I don't think replacing it with kvm_mmu_child_role() can reduce any calculations.
> >
> > role.level, role.direct, role.access and role.quadrant still need to be
> > calculated.  And the old code is still in mmu_alloc_root().
>
> You are correct. Instead of saying "less calculations" I should have
> said "eliminates the dependency on vcpu->arch.mmu->root_role".
>
> >
> > I think kvm_mmu_get_shadow_page() can keep the those parameters and
> > kvm_mmu_child_role() is only introduced for nested_mmu_get_sp_for_split().
> >
> > Both kvm_mmu_get_shadow_page() and nested_mmu_get_sp_for_split() call
> > __kvm_mmu_get_shadow_page() which uses role as a parameter.
>
> I agree this would work, but I think the end result would be more
> difficult to read.
>
> The way I've implemented it there are two ways the SP roles are calculated:
>
> 1. For root SPs: Derive the role from vCPU root_role and caller-provided inputs.
> 2. For child SPs: Derive the role from parent SP and caller-provided inputs.
>
> Your proposal would still have two ways to calculate the SP role, but
> split along a different dimension:
>
> 1. For vCPUs creating SPs: Derive the role from vCPU root_role and
> caller-provided inputs.
> 2. For Eager Page Splitting creating SPs: Derive the role from parent
> SP and caller-provided inputs.
>
> With your proposal, it is less obvious that eager page splitting is
> going to end up with the correct role. Whereas if we use the same
> calculation for all child SPs, it is immediately obvious.


In this patchset, there are still two ways to calculate the SP role,
including the one in mmu_alloc_root(), which I dislike.

The old code is just moved into mmu_alloc_root() even though kvm_mmu_child_role()
is introduced.

>
> >
> >
> >
> > > -
> > >       sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
> > >       for_each_valid_sp(vcpu->kvm, sp, sp_list) {
> > >               if (sp->gfn != gfn) {
> > > @@ -2063,7 +2047,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> > >                        * Unsync pages must not be left as is, because the new
> > >                        * upper-level page will be write-protected.
> > >                        */
> > > -                     if (level > PG_LEVEL_4K && sp->unsync)
> > > +                     if (role.level > PG_LEVEL_4K && sp->unsync)
> > >                               kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
> > >                                                        &invalid_list);
> > >                       continue;
> > > @@ -2104,14 +2088,14 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> > >
> > >       ++vcpu->kvm->stat.mmu_cache_miss;
> > >
> > > -     sp = kvm_mmu_alloc_page(vcpu, direct);
> > > +     sp = kvm_mmu_alloc_page(vcpu, role.direct);
> > >
> > >       sp->gfn = gfn;
> > >       sp->role = role;
> > >       hlist_add_head(&sp->hash_link, sp_list);
> > > -     if (!direct) {
> > > +     if (!role.direct) {
> > >               account_shadowed(vcpu->kvm, sp);
> > > -             if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn))
> > > +             if (role.level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn))
> > >                       kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
> > >       }
> > >       trace_kvm_mmu_get_page(sp, true);
> > > @@ -2123,6 +2107,51 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> > >       return sp;
> > >   }
> > >
> > > +static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct, u32 access)
> > > +{
> > > +     struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep);
> > > +     union kvm_mmu_page_role role;
> > > +
> > > +     role = parent_sp->role;
> > > +     role.level--;
> > > +     role.access = access;
> > > +     role.direct = direct;
> > > +
> > > +     /*
> > > +      * If the guest has 4-byte PTEs then that means it's using 32-bit,
> > > +      * 2-level, non-PAE paging. KVM shadows such guests using 4 PAE page
> > > +      * directories, each mapping 1/4 of the guest's linear address space
> > > +      * (1GiB). The shadow pages for those 4 page directories are
> > > +      * pre-allocated and assigned a separate quadrant in their role.
> >
> >
> > It is not going to be true in patchset:
> > [PATCH V2 0/7] KVM: X86/MMU: Use one-off special shadow page for special roots
> >
> > https://lore.kernel.org/lkml/20220503150735.32723-1-jiangshanlai@gmail.com/
> >
> > The shadow pages for those 4 page directories are also allocated on demand.
>
> Ack. I can even just drop this sentence in v5, it's just background information.

No. If one-off special shadow pages are used, kvm_mmu_child_role()
should be:

+       if (role.has_4_byte_gpte) {
+               if (role.level == PG_LEVEL_4K)
+                       role.quadrant = (sptep - parent_sp->spt) % 2;
+               if (role.level == PG_LEVEL_2M)
+                       role.quadrant = (sptep - parent_sp->spt) % 4;
+       }


And if one-off special shadow pages are merged first, you don't
need any calculation in mmu_alloc_root(); you can just directly use
    sp = kvm_mmu_get_page(vcpu, gfn, vcpu->arch.mmu->root_role);
because vcpu->arch.mmu->root_role is always the real role of the root
sp, no matter if it is a normal root sp or a one-off special sp.

I hope you will pardon me for touting my patchset and asking
people to review it in your threads.

Thanks
Lai
Sean Christopherson May 10, 2022, 1:31 p.m. UTC | #7
On Tue, May 10, 2022, Lai Jiangshan wrote:
> 
> On Tue, May 10, 2022 at 5:04 AM David Matlack <dmatlack@google.com> wrote:
> >
> > On Sat, May 7, 2022 at 1:28 AM Lai Jiangshan <jiangshanlai@gmail.com> wrote:
> > > > +static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct, u32 access)
> > > > +{
> > > > +     struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep);
> > > > +     union kvm_mmu_page_role role;
> > > > +
> > > > +     role = parent_sp->role;
> > > > +     role.level--;
> > > > +     role.access = access;
> > > > +     role.direct = direct;
> > > > +
> > > > +     /*
> > > > +      * If the guest has 4-byte PTEs then that means it's using 32-bit,
> > > > +      * 2-level, non-PAE paging. KVM shadows such guests using 4 PAE page
> > > > +      * directories, each mapping 1/4 of the guest's linear address space
> > > > +      * (1GiB). The shadow pages for those 4 page directories are
> > > > +      * pre-allocated and assigned a separate quadrant in their role.
> > >
> > >
> > > It is not going to be true in patchset:
> > > [PATCH V2 0/7] KVM: X86/MMU: Use one-off special shadow page for special roots
> > >
> > > https://lore.kernel.org/lkml/20220503150735.32723-1-jiangshanlai@gmail.com/
> > >
> > > The shadow pages for those 4 page directories are also allocated on demand.
> >
> > Ack. I can even just drop this sentence in v5, it's just background information.
> 
> No, if one-off special shadow pages are used.
> 
> kvm_mmu_child_role() should be:
> 
> +       if (role.has_4_byte_gpte) {
> +               if (role.level == PG_LEVEL_4K)
> +                       role.quadrant = (sptep - parent_sp->spt) % 2;
> +               if (role.level == PG_LEVEL_2M)

If the code ends up looking anything like this, please use PT32_ROOT_LEVEL instead
of PG_LEVEL_2M.  PSE paging has 4M huge pages; using PG_LEVEL_2M is confusing.

Or even better might be to do:

		if (role.level == PG_LEVEL_4K)
			...
		else
			...

Or arithmetic using role.level directly, a la the current code.
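
For illustration, the role.level arithmetic could look like the line below
(untested sketch; it matches the quadrant formula adopted in the v5 snippet
later in this thread):

	if (role.has_4_byte_gpte)
		role.quadrant = (sptep - parent_sp->spt) % (1 << role.level);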
David Matlack May 12, 2022, 4:10 p.m. UTC | #8
On Mon, May 9, 2022 at 7:58 PM Lai Jiangshan <jiangshanlai@gmail.com> wrote:
>
>
> On Tue, May 10, 2022 at 5:04 AM David Matlack <dmatlack@google.com> wrote:
> >
> > On Sat, May 7, 2022 at 1:28 AM Lai Jiangshan <jiangshanlai@gmail.com> wrote:
> > >
> > >
> > >
> > > On 2022/4/23 05:05, David Matlack wrote:
> > > > Instead of computing the shadow page role from scratch for every new
> > > > page, derive most of the information from the parent shadow page.  This
> > > > avoids redundant calculations and reduces the number of parameters to
> > > > kvm_mmu_get_page().
> > > >
> > > > Preemptively split out the role calculation to a separate function for
> > > > use in a following commit.
> > > >
> > > > No functional change intended.
> > > >
> > > > Reviewed-by: Peter Xu <peterx@redhat.com>
> > > > Signed-off-by: David Matlack <dmatlack@google.com>
> > > > ---
> > > >   arch/x86/kvm/mmu/mmu.c         | 96 +++++++++++++++++++++++-----------
> > > >   arch/x86/kvm/mmu/paging_tmpl.h |  9 ++--
> > > >   2 files changed, 71 insertions(+), 34 deletions(-)
> > > >
> > > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > > index dc20eccd6a77..4249a771818b 100644
> > > > --- a/arch/x86/kvm/mmu/mmu.c
> > > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > > @@ -2021,31 +2021,15 @@ static void clear_sp_write_flooding_count(u64 *spte)
> > > >       __clear_sp_write_flooding_count(sptep_to_sp(spte));
> > > >   }
> > > >
> > > > -static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> > > > -                                          gfn_t gfn,
> > > > -                                          gva_t gaddr,
> > > > -                                          unsigned level,
> > > > -                                          bool direct,
> > > > -                                          unsigned int access)
> > > > +static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn,
> > > > +                                          union kvm_mmu_page_role role)
> > > >   {
> > > > -     union kvm_mmu_page_role role;
> > > >       struct hlist_head *sp_list;
> > > > -     unsigned quadrant;
> > > >       struct kvm_mmu_page *sp;
> > > >       int ret;
> > > >       int collisions = 0;
> > > >       LIST_HEAD(invalid_list);
> > > >
> > > > -     role = vcpu->arch.mmu->root_role;
> > > > -     role.level = level;
> > > > -     role.direct = direct;
> > > > -     role.access = access;
> > > > -     if (role.has_4_byte_gpte) {
> > > > -             quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
> > > > -             quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
> > > > -             role.quadrant = quadrant;
> > > > -     }
> > >
> > >
> > > I don't think replacing it with kvm_mmu_child_role() can reduce any calculations.
> > >
> > > role.level, role.direct, role.access and role.quadrant still need to be
> > > calculated.  And the old code is still in mmu_alloc_root().
> >
> > You are correct. Instead of saying "less calculations" I should have
> > said "eliminates the dependency on vcpu->arch.mmu->root_role".
> >
> > >
> > > I think kvm_mmu_get_shadow_page() can keep the those parameters and
> > > kvm_mmu_child_role() is only introduced for nested_mmu_get_sp_for_split().
> > >
> > > Both kvm_mmu_get_shadow_page() and nested_mmu_get_sp_for_split() call
> > > __kvm_mmu_get_shadow_page() which uses role as a parameter.
> >
> > I agree this would work, but I think the end result would be more
> > difficult to read.
> >
> > The way I've implemented it there are two ways the SP roles are calculated:
> >
> > 1. For root SPs: Derive the role from vCPU root_role and caller-provided inputs.
> > 2. For child SPs: Derive the role from parent SP and caller-provided inputs.
> >
> > Your proposal would still have two ways to calculate the SP role, but
> > split along a different dimension:
> >
> > 1. For vCPUs creating SPs: Derive the role from vCPU root_role and
> > caller-provided inputs.
> > 2. For Eager Page Splitting creating SPs: Derive the role from parent
> > SP and caller-provided inputs.
> >
> > With your proposal, it is less obvious that eager page splitting is
> > going to end up with the correct role. Whereas if we use the same
> > calculation for all child SPs, it is immediately obvious.
>
>
> In this patchset, there are still two ways to calculate the SP role
> including the one in mmu_alloc_root() which I dislike.

My point is there will be two ways to calculate the SP role either
way. Can you explain why you dislike calculating the role in
mmu_alloc_root()? As you point out later, that code will disappear
anyway as soon as your series is merged.

>
> The old code is just moved into mmu_alloc_root() even kvm_mmu_child_role()
> is introduced.
>
> >
> > >
> > >
> > >
> > > > -
> > > >       sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
> > > >       for_each_valid_sp(vcpu->kvm, sp, sp_list) {
> > > >               if (sp->gfn != gfn) {
> > > > @@ -2063,7 +2047,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> > > >                        * Unsync pages must not be left as is, because the new
> > > >                        * upper-level page will be write-protected.
> > > >                        */
> > > > -                     if (level > PG_LEVEL_4K && sp->unsync)
> > > > +                     if (role.level > PG_LEVEL_4K && sp->unsync)
> > > >                               kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
> > > >                                                        &invalid_list);
> > > >                       continue;
> > > > @@ -2104,14 +2088,14 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> > > >
> > > >       ++vcpu->kvm->stat.mmu_cache_miss;
> > > >
> > > > -     sp = kvm_mmu_alloc_page(vcpu, direct);
> > > > +     sp = kvm_mmu_alloc_page(vcpu, role.direct);
> > > >
> > > >       sp->gfn = gfn;
> > > >       sp->role = role;
> > > >       hlist_add_head(&sp->hash_link, sp_list);
> > > > -     if (!direct) {
> > > > +     if (!role.direct) {
> > > >               account_shadowed(vcpu->kvm, sp);
> > > > -             if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn))
> > > > +             if (role.level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn))
> > > >                       kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
> > > >       }
> > > >       trace_kvm_mmu_get_page(sp, true);
> > > > @@ -2123,6 +2107,51 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> > > >       return sp;
> > > >   }
> > > >
> > > > +static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct, u32 access)
> > > > +{
> > > > +     struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep);
> > > > +     union kvm_mmu_page_role role;
> > > > +
> > > > +     role = parent_sp->role;
> > > > +     role.level--;
> > > > +     role.access = access;
> > > > +     role.direct = direct;
> > > > +
> > > > +     /*
> > > > +      * If the guest has 4-byte PTEs then that means it's using 32-bit,
> > > > +      * 2-level, non-PAE paging. KVM shadows such guests using 4 PAE page
> > > > +      * directories, each mapping 1/4 of the guest's linear address space
> > > > +      * (1GiB). The shadow pages for those 4 page directories are
> > > > +      * pre-allocated and assigned a separate quadrant in their role.
> > >
> > >
> > > It is not going to be true in patchset:
> > > [PATCH V2 0/7] KVM: X86/MMU: Use one-off special shadow page for special roots
> > >
> > > https://lore.kernel.org/lkml/20220503150735.32723-1-jiangshanlai@gmail.com/
> > >
> > > The shadow pages for those 4 page directories are also allocated on demand.
> >
> > Ack. I can even just drop this sentence in v5, it's just background information.
>
> No, if one-off special shadow pages are used.
>
> kvm_mmu_child_role() should be:
>
> +       if (role.has_4_byte_gpte) {
> +               if (role.level == PG_LEVEL_4K)
> +                       role.quadrant = (sptep - parent_sp->spt) % 2;
> +               if (role.level == PG_LEVEL_2M)
> +                       role.quadrant = (sptep - parent_sp->spt) % 4;
> +       }
>
>
> And if one-off special shadow pages are merged first.  You don't
> need any calculation in mmu_alloc_root(), you can just directly use
>     sp = kvm_mmu_get_page(vcpu, gfn, vcpu->arch.mmu->root_role);
> because vcpu->arch.mmu->root_role is always the real role of the root
> sp no matter if it is a normall root sp or an one-off special sp.
>
> I hope you will pardon me for my touting my patchset and asking
> people to review them in your threads.

I see what you mean now. If your series is queued I will rebase on top
with the appropriate changes. But for now I will continue to code
against kvm/queue.

>
> Thanks
> Lai
David Matlack May 13, 2022, 6:26 p.m. UTC | #9
On Thu, May 12, 2022 at 09:10:59AM -0700, David Matlack wrote:
> On Mon, May 9, 2022 at 7:58 PM Lai Jiangshan <jiangshanlai@gmail.com> wrote:
> > On Tue, May 10, 2022 at 5:04 AM David Matlack <dmatlack@google.com> wrote:
> > > On Sat, May 7, 2022 at 1:28 AM Lai Jiangshan <jiangshanlai@gmail.com> wrote:
> > > > On 2022/4/23 05:05, David Matlack wrote:
> > > > > +     /*
> > > > > +      * If the guest has 4-byte PTEs then that means it's using 32-bit,
> > > > > +      * 2-level, non-PAE paging. KVM shadows such guests using 4 PAE page
> > > > > +      * directories, each mapping 1/4 of the guest's linear address space
> > > > > +      * (1GiB). The shadow pages for those 4 page directories are
> > > > > +      * pre-allocated and assigned a separate quadrant in their role.
> > > >
> > > >
> > > > It is not going to be true in patchset:
> > > > [PATCH V2 0/7] KVM: X86/MMU: Use one-off special shadow page for special roots
> > > >
> > > > https://lore.kernel.org/lkml/20220503150735.32723-1-jiangshanlai@gmail.com/
> > > >
> > > > The shadow pages for those 4 page directories are also allocated on demand.
> > >
> > > Ack. I can even just drop this sentence in v5, it's just background information.
> >
> > No, if one-off special shadow pages are used.
> >
> > kvm_mmu_child_role() should be:
> >
> > +       if (role.has_4_byte_gpte) {
> > +               if (role.level == PG_LEVEL_4K)
> > +                       role.quadrant = (sptep - parent_sp->spt) % 2;
> > +               if (role.level == PG_LEVEL_2M)
> > +                       role.quadrant = (sptep - parent_sp->spt) % 4;
> > +       }
> >
> >
> > And if one-off special shadow pages are merged first.  You don't
> > need any calculation in mmu_alloc_root(), you can just directly use
> >     sp = kvm_mmu_get_page(vcpu, gfn, vcpu->arch.mmu->root_role);
> > because vcpu->arch.mmu->root_role is always the real role of the root
> > sp no matter if it is a normall root sp or an one-off special sp.
> >
> > I hope you will pardon me for my touting my patchset and asking
> > people to review them in your threads.
> 
> I see what you mean now. If your series is queued I will rebase on top
> with the appropriate changes. But for now I will continue to code
> against kvm/queue.

Here is what I'm going with for v5:

        /*
         * If the guest has 4-byte PTEs then that means it's using 32-bit,
         * 2-level, non-PAE paging. KVM shadows such guests with PAE paging
         * (i.e. 8-byte PTEs). The difference in PTE size means that
         * KVM must shadow each guest page table with multiple shadow page
         * tables, which requires extra bookkeeping in the role.
         *
         * Specifically, to shadow the guest's page directory (which covers a
         * 4GiB address space), KVM uses 4 PAE page directories, each mapping
         * 1GiB of the address space. @role.quadrant encodes which quarter of
         * the address space each maps.
         *
         * To shadow the guest's page tables (which each map a 4MiB region),
         * KVM uses 2 PAE page tables, each mapping a 2MiB region. For these,
         * @role.quadrant encodes which half of the region they map.
         *
         * Note, the 4 PAE page directories are pre-allocated and the quadrant
         * assigned in mmu_alloc_root(). So only page tables need to be handled
         * here.
         */
        if (role.has_4_byte_gpte) {
                WARN_ON_ONCE(role.level != PG_LEVEL_4K);
                role.quadrant = (sptep - parent_sp->spt) % 2;
        }

Then to make it work with your series we can just apply this diff:

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index f7c4f08e8a69..0e0e2da2f37d 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2131,14 +2131,10 @@ static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct, u32 a
         * To shadow the guest's page tables (which each map a 4MiB region),
         * KVM uses 2 PAE page tables, each mapping a 2MiB region. For these,
         * @role.quadrant encodes which half of the region they map.
-        *
-        * Note, the 4 PAE page directories are pre-allocated and the quadrant
-        * assigned in mmu_alloc_root(). So only page tables need to be handled
-        * here.
         */
        if (role.has_4_byte_gpte) {
-               WARN_ON_ONCE(role.level != PG_LEVEL_4K);
-               role.quadrant = (sptep - parent_sp->spt) % 2;
+               WARN_ON_ONCE(role.level > PG_LEVEL_2M);
+               role.quadrant = (sptep - parent_sp->spt) % (1 << role.level);
        }

        return role;

If your series is queued first, I can resend a v6 with this change or Paolo can
apply it. If mine is queued first then you can include this as part of your
series.

Patch

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index dc20eccd6a77..4249a771818b 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2021,31 +2021,15 @@  static void clear_sp_write_flooding_count(u64 *spte)
 	__clear_sp_write_flooding_count(sptep_to_sp(spte));
 }
 
-static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
-					     gfn_t gfn,
-					     gva_t gaddr,
-					     unsigned level,
-					     bool direct,
-					     unsigned int access)
+static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn,
+					     union kvm_mmu_page_role role)
 {
-	union kvm_mmu_page_role role;
 	struct hlist_head *sp_list;
-	unsigned quadrant;
 	struct kvm_mmu_page *sp;
 	int ret;
 	int collisions = 0;
 	LIST_HEAD(invalid_list);
 
-	role = vcpu->arch.mmu->root_role;
-	role.level = level;
-	role.direct = direct;
-	role.access = access;
-	if (role.has_4_byte_gpte) {
-		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
-		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
-		role.quadrant = quadrant;
-	}
-
 	sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
 	for_each_valid_sp(vcpu->kvm, sp, sp_list) {
 		if (sp->gfn != gfn) {
@@ -2063,7 +2047,7 @@  static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 			 * Unsync pages must not be left as is, because the new
 			 * upper-level page will be write-protected.
 			 */
-			if (level > PG_LEVEL_4K && sp->unsync)
+			if (role.level > PG_LEVEL_4K && sp->unsync)
 				kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
 							 &invalid_list);
 			continue;
@@ -2104,14 +2088,14 @@  static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 
 	++vcpu->kvm->stat.mmu_cache_miss;
 
-	sp = kvm_mmu_alloc_page(vcpu, direct);
+	sp = kvm_mmu_alloc_page(vcpu, role.direct);
 
 	sp->gfn = gfn;
 	sp->role = role;
 	hlist_add_head(&sp->hash_link, sp_list);
-	if (!direct) {
+	if (!role.direct) {
 		account_shadowed(vcpu->kvm, sp);
-		if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn))
+		if (role.level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn))
 			kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
 	}
 	trace_kvm_mmu_get_page(sp, true);
@@ -2123,6 +2107,51 @@  static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	return sp;
 }
 
+static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct, u32 access)
+{
+	struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep);
+	union kvm_mmu_page_role role;
+
+	role = parent_sp->role;
+	role.level--;
+	role.access = access;
+	role.direct = direct;
+
+	/*
+	 * If the guest has 4-byte PTEs then that means it's using 32-bit,
+	 * 2-level, non-PAE paging. KVM shadows such guests using 4 PAE page
+	 * directories, each mapping 1/4 of the guest's linear address space
+	 * (1GiB). The shadow pages for those 4 page directories are
+	 * pre-allocated and assigned a separate quadrant in their role.
+	 *
+	 * Since this role is for a child shadow page and there are only 2
+	 * levels, this must be a PG_LEVEL_4K shadow page. Here the quadrant
+	 * will either be 0 or 1 because it maps 1/2 of the address space mapped
+	 * by the guest's PG_LEVEL_4K page table (or 4MiB huge page) that it is
+	 * shadowing. In this case, the quadrant can be derived by the index of
+	 * the SPTE that points to the new child shadow page in the page
+	 * directory (parent_sp). Specifically, every 2 SPTEs in parent_sp
+	 * shadow one half of a guest's page table (or 4MiB huge page) so the
+	 * quadrant is just the parity of the index of the SPTE.
+	 */
+	if (role.has_4_byte_gpte) {
+		WARN_ON_ONCE(role.level != PG_LEVEL_4K);
+		role.quadrant = (sptep - parent_sp->spt) % 2;
+	}
+
+	return role;
+}
+
+static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu,
+						 u64 *sptep, gfn_t gfn,
+						 bool direct, u32 access)
+{
+	union kvm_mmu_page_role role;
+
+	role = kvm_mmu_child_role(sptep, direct, access);
+	return kvm_mmu_get_page(vcpu, gfn, role);
+}
+
 static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
 					struct kvm_vcpu *vcpu, hpa_t root,
 					u64 addr)
@@ -2927,8 +2956,7 @@  static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 		if (is_shadow_present_pte(*it.sptep))
 			continue;
 
-		sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
-				      it.level - 1, true, ACC_ALL);
+		sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL);
 
 		link_shadow_page(vcpu, it.sptep, sp);
 		if (fault->is_tdp && fault->huge_page_disallowed &&
@@ -3310,12 +3338,21 @@  static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
 	return ret;
 }
 
-static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
+static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant,
 			    u8 level, bool direct)
 {
+	union kvm_mmu_page_role role;
 	struct kvm_mmu_page *sp;
 
-	sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
+	role = vcpu->arch.mmu->root_role;
+	role.level = level;
+	role.direct = direct;
+	role.access = ACC_ALL;
+
+	if (role.has_4_byte_gpte)
+		role.quadrant = quadrant;
+
+	sp = kvm_mmu_get_page(vcpu, gfn, role);
 	++sp->root_count;
 
 	return __pa(sp->spt);
@@ -3349,8 +3386,8 @@  static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 		for (i = 0; i < 4; ++i) {
 			WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
 
-			root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
-					      i << 30, PT32_ROOT_LEVEL, true);
+			root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), i,
+					      PT32_ROOT_LEVEL, true);
 			mmu->pae_root[i] = root | PT_PRESENT_MASK |
 					   shadow_me_mask;
 		}
@@ -3519,8 +3556,7 @@  static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 			root_gfn = pdptrs[i] >> PAGE_SHIFT;
 		}
 
-		root = mmu_alloc_root(vcpu, root_gfn, i << 30,
-				      PT32_ROOT_LEVEL, false);
+		root = mmu_alloc_root(vcpu, root_gfn, i, PT32_ROOT_LEVEL, false);
 		mmu->pae_root[i] = root | pm_mask;
 	}
 
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 66f1acf153c4..a8a755e1561d 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -648,8 +648,9 @@  static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
 		if (!is_shadow_present_pte(*it.sptep)) {
 			table_gfn = gw->table_gfn[it.level - 2];
 			access = gw->pt_access[it.level - 2];
-			sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr,
-					      it.level-1, false, access);
+			sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn,
+						  false, access);
+
 			/*
 			 * We must synchronize the pagetable before linking it
 			 * because the guest doesn't need to flush tlb when
@@ -705,8 +706,8 @@  static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
 		drop_large_spte(vcpu, it.sptep);
 
 		if (!is_shadow_present_pte(*it.sptep)) {
-			sp = kvm_mmu_get_page(vcpu, base_gfn, fault->addr,
-					      it.level - 1, true, direct_access);
+			sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn,
+						  true, direct_access);
 			link_shadow_page(vcpu, it.sptep, sp);
 			if (fault->huge_page_disallowed &&
 			    fault->req_level >= it.level)