
[2/7] KVM: X86: Synchronize the shadow pagetable before linking it

Message ID 20210824075524.3354-3-jiangshanlai@gmail.com (mailing list archive)
State New, archived
Series [1/7] KVM: X86: Fix missed remote tlb flush in rmap_write_protect()

Commit Message

Lai Jiangshan Aug. 24, 2021, 7:55 a.m. UTC
From: Lai Jiangshan <laijs@linux.alibaba.com>

If a gpte is changed from non-present to present, the guest doesn't need
to flush the TLB per the SDM.  So the host must synchronize the sp before
linking it.  Otherwise the guest might use a wrong mapping.

For example: the guest first changes a level-1 pagetable, and then
links its parent to a new place where the original gpte is non-present.
Finally the guest can access the remapped area without flushing
the TLB.  The guest's behavior should be allowed per the SDM, but the host
kvm mmu gets it wrong.

Fixes: 4731d4c7a077 ("KVM: MMU: out of sync shadow core")
Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
---
 arch/x86/kvm/mmu/mmu.c         | 21 ++++++++++++++-------
 arch/x86/kvm/mmu/paging_tmpl.h | 28 +++++++++++++++++++++++++---
 2 files changed, 39 insertions(+), 10 deletions(-)

Comments

Sean Christopherson Sept. 2, 2021, 11:40 p.m. UTC | #1
On Tue, Aug 24, 2021, Lai Jiangshan wrote:
> From: Lai Jiangshan <laijs@linux.alibaba.com>
> 
> If gpte is changed from non-present to present, the guest doesn't need
> to flush tlb per SDM.  So the host must synchronze sp before
> link it.  Otherwise the guest might use a wrong mapping.
> 
> For example: the guest first changes a level-1 pagetable, and then
> links its parent to a new place where the original gpte is non-present.
> Finally the guest can access the remapped area without flushing
> the tlb.  The guest's behavior should be allowed per SDM, but the host
> kvm mmu makes it wrong.

Ah, are you saying, given:

VA_x = PML4_A -> PDP_B -> PD_C -> PT_D

the guest can modify PT_D, then link it with

VA_y = PML4_A -> PDP_B -> PD_E -> PT_D

and access it via VA_y without flushing, and so KVM must sync PT_D.  Is that
correct?

> Fixes: 4731d4c7a077 ("KVM: MMU: out of sync shadow core")
> Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
> ---

...

> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> index 50ade6450ace..48c7fe1b2d50 100644
> --- a/arch/x86/kvm/mmu/paging_tmpl.h
> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> @@ -664,7 +664,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
>   * emulate this operation, return 1 to indicate this case.
>   */
>  static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
> -			 struct guest_walker *gw)
> +			 struct guest_walker *gw, unsigned long mmu_seq)
>  {
>  	struct kvm_mmu_page *sp = NULL;
>  	struct kvm_shadow_walk_iterator it;
> @@ -678,6 +678,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
>  	top_level = vcpu->arch.mmu->root_level;
>  	if (top_level == PT32E_ROOT_LEVEL)
>  		top_level = PT32_ROOT_LEVEL;
> +
> +again:
>  	/*
>  	 * Verify that the top-level gpte is still there.  Since the page
>  	 * is a root page, it is either write protected (and cannot be
> @@ -713,8 +715,28 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
>  		if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
>  			goto out_gpte_changed;
>  
> -		if (sp)
> +		if (sp) {
> +			/*
> +			 * We must synchronize the pagetable before link it
> +			 * because the guest doens't need to flush tlb when
> +			 * gpte is changed from non-present to present.
> +			 * Otherwise, the guest may use the wrong mapping.
> +			 *
> +			 * For PG_LEVEL_4K, kvm_mmu_get_page() has already
> +			 * synchronized it transiently via kvm_sync_page().
> +			 *
> +			 * For higher level pagetable, we synchronize it
> +			 * via slower mmu_sync_children().  If it once
> +			 * released the mmu_lock, we need to restart from
> +			 * the root since we don't have reference to @sp.
> +			 */
> +			if (sp->unsync_children && !mmu_sync_children(vcpu, sp, false)) {

I don't like dropping mmu_lock in the page fault path.  I agree that it's not
all that different than grabbing various things in kvm_mmu_do_page_fault() long
before acquiring mmu_lock, but (a) I'm not 100% convinced we don't have a latent
bug hiding somewhere in there :-), and (b) there's a possibility, however small,
that there's something in FNAME(fetch) that we're missing.  Case in point, this
technically needs to do make_mmu_pages_available().

And I believe kvm_mmu_get_page() already tries to handle this case by requesting
KVM_REQ_MMU_SYNC if it uses a sp with unsync_children; it just doesn't handle SMP
interaction, e.g. it can link a sp that's immediately available to other vCPUs before
the sync.
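
For reference, the current handling in kvm_mmu_get_page() (visible verbatim in the
hunks below) is just:

		if (sp->unsync_children)
			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);

i.e. the sync is only queued on the faulting vCPU, after the sp has already been
handed back to the caller for linking.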

Rather than force the sync here, what about kicking all vCPUs and retrying the
page fault?  The only gross part is that kvm_mmu_get_page() can now fail :-(

---
 arch/x86/include/asm/kvm_host.h | 3 ++-
 arch/x86/kvm/mmu/mmu.c          | 9 +++++++--
 arch/x86/kvm/mmu/paging_tmpl.h  | 4 ++++
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 09b256db394a..332b9fb3454c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -57,7 +57,8 @@
 #define KVM_REQ_MIGRATE_TIMER		KVM_ARCH_REQ(0)
 #define KVM_REQ_REPORT_TPR_ACCESS	KVM_ARCH_REQ(1)
 #define KVM_REQ_TRIPLE_FAULT		KVM_ARCH_REQ(2)
-#define KVM_REQ_MMU_SYNC		KVM_ARCH_REQ(3)
+#define KVM_REQ_MMU_SYNC \
+	KVM_ARCH_REQ_FLAGS(3, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQ_CLOCK_UPDATE		KVM_ARCH_REQ(4)
 #define KVM_REQ_LOAD_MMU_PGD		KVM_ARCH_REQ(5)
 #define KVM_REQ_EVENT			KVM_ARCH_REQ(6)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 4853c033e6ce..03293cd3c7ae 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2143,8 +2143,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 		}

-		if (sp->unsync_children)
-			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+		if (sp->unsync_children) {
+			kvm_make_all_cpus_request(KVM_REQ_MMU_SYNC, vcpu);
+			return NULL;
+		}

 		__clear_sp_write_flooding_count(sp);

@@ -2999,6 +3001,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)

 		sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
 				      it.level - 1, true, ACC_ALL);
+		BUG_ON(!sp);

 		link_shadow_page(vcpu, it.sptep, sp);
 		if (fault->is_tdp && fault->huge_page_disallowed &&
@@ -3383,6 +3386,8 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
 	struct kvm_mmu_page *sp;

 	sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
+	BUG_ON(!sp);
+
 	++sp->root_count;

 	return __pa(sp->spt);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 50ade6450ace..f573d45e2c6f 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -704,6 +704,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
 			access = gw->pt_access[it.level - 2];
 			sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr,
 					      it.level-1, false, access);
+			if (!sp)
+				return RET_PF_RETRY;
 		}

 		/*
@@ -742,6 +744,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
 		if (!is_shadow_present_pte(*it.sptep)) {
 			sp = kvm_mmu_get_page(vcpu, base_gfn, fault->addr,
 					      it.level - 1, true, direct_access);
+			BUG_ON(!sp);
+
 			link_shadow_page(vcpu, it.sptep, sp);
 			if (fault->huge_page_disallowed &&
 			    fault->req_level >= it.level)
--
Sean Christopherson Sept. 2, 2021, 11:54 p.m. UTC | #2
On Thu, Sep 02, 2021, Sean Christopherson wrote:
> On Tue, Aug 24, 2021, Lai Jiangshan wrote:
> Rather than force the sync here, what about kicking all vCPUs and retrying the
> page fault?  The only gross part is that kvm_mmu_get_page() can now fail :-(
> 
> ---
>  arch/x86/include/asm/kvm_host.h | 3 ++-
>  arch/x86/kvm/mmu/mmu.c          | 9 +++++++--
>  arch/x86/kvm/mmu/paging_tmpl.h  | 4 ++++
>  3 files changed, 13 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 09b256db394a..332b9fb3454c 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -57,7 +57,8 @@
>  #define KVM_REQ_MIGRATE_TIMER		KVM_ARCH_REQ(0)
>  #define KVM_REQ_REPORT_TPR_ACCESS	KVM_ARCH_REQ(1)
>  #define KVM_REQ_TRIPLE_FAULT		KVM_ARCH_REQ(2)
> -#define KVM_REQ_MMU_SYNC		KVM_ARCH_REQ(3)
> +#define KVM_REQ_MMU_SYNC \
> +	KVM_ARCH_REQ_FLAGS(3, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
>  #define KVM_REQ_CLOCK_UPDATE		KVM_ARCH_REQ(4)
>  #define KVM_REQ_LOAD_MMU_PGD		KVM_ARCH_REQ(5)
>  #define KVM_REQ_EVENT			KVM_ARCH_REQ(6)
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 4853c033e6ce..03293cd3c7ae 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -2143,8 +2143,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
>  			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
>  		}
> 
> -		if (sp->unsync_children)
> -			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
> +		if (sp->unsync_children) {
> +			kvm_make_all_cpus_request(KVM_REQ_MMU_SYNC, vcpu);
> +			return NULL;
> +		}
> 
>  		__clear_sp_write_flooding_count(sp);
> 
> @@ -2999,6 +3001,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> 
>  		sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
>  				      it.level - 1, true, ACC_ALL);
> +		BUG_ON(!sp);
> 
>  		link_shadow_page(vcpu, it.sptep, sp);
>  		if (fault->is_tdp && fault->huge_page_disallowed &&
> @@ -3383,6 +3386,8 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
>  	struct kvm_mmu_page *sp;
> 
>  	sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
> +	BUG_ON(!sp);

Gah, this is obviously wrong when allocating an indirect root.  On the happy side,
it points out a cleaner approach.  I think this is what we want?

---
 arch/x86/kvm/mmu/mmu.c         | 3 ---
 arch/x86/kvm/mmu/paging_tmpl.h | 4 ++++
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 4853c033e6ce..f24e8088192c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2143,9 +2143,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 		}

-		if (sp->unsync_children)
-			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
-
 		__clear_sp_write_flooding_count(sp);

 trace_get_page:
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 50ade6450ace..5b13918a55c2 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -704,6 +704,10 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
 			access = gw->pt_access[it.level - 2];
 			sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr,
 					      it.level-1, false, access);
+			if (sp->unsync_children) {
+				kvm_make_all_cpus_request(KVM_REQ_MMU_SYNC, vcpu);
+				return RET_PF_RETRY;
+			}
 		}

 		/*
--
Lai Jiangshan Sept. 3, 2021, 12:44 a.m. UTC | #3
On 2021/9/3 07:54, Sean Christopherson wrote:
> 
>   trace_get_page:
> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> index 50ade6450ace..5b13918a55c2 100644
> --- a/arch/x86/kvm/mmu/paging_tmpl.h
> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> @@ -704,6 +704,10 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
>   			access = gw->pt_access[it.level - 2];
>   			sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr,
>   					      it.level-1, false, access);
> +			if (sp->unsync_children) {
> +				kvm_make_all_cpus_request(KVM_REQ_MMU_SYNC, vcpu);
> +				return RET_PF_RETRY;

Making KVM_REQ_MMU_SYNC able to be requested remotely is a good idea.
But if the sp is not linked, the @sp might not be synced even if we
retry many times.  So we should continue to link it.

But if we continue to link it, KVM_REQ_MMU_SYNC should be extended to
sync all roots (current root and prev_roots).  And maybe add a
KVM_REQ_MMU_SYNC_CURRENT for current root syncing.

It is not going to be simple.  I have a new way to sync pages
that also fixes the problem, but it includes several non-fix patches.

We need to fix this problem in the simplest way.  In my patch
mmu_sync_children() has a @root argument.  I think we can disallow
releasing the lock when @root is false. Is it OK?



> +			}
>   		}
> 
>   		/*
> --
>
Lai Jiangshan Sept. 3, 2021, 12:51 a.m. UTC | #4
On 2021/9/3 07:40, Sean Christopherson wrote:
> On Tue, Aug 24, 2021, Lai Jiangshan wrote:
>> From: Lai Jiangshan <laijs@linux.alibaba.com>
>>
>> If gpte is changed from non-present to present, the guest doesn't need
>> to flush tlb per SDM.  So the host must synchronze sp before
>> link it.  Otherwise the guest might use a wrong mapping.
>>
>> For example: the guest first changes a level-1 pagetable, and then
>> links its parent to a new place where the original gpte is non-present.
>> Finally the guest can access the remapped area without flushing
>> the tlb.  The guest's behavior should be allowed per SDM, but the host
>> kvm mmu makes it wrong.
> 
> Ah, are you saying, given:
> 
> VA_x = PML4_A -> PDP_B -> PD_C -> PT_D
> 
> the guest can modify PT_D, then link it with
> 
> VA_y = PML4_A -> PDP_B -> PD_E -> PT_D
> 
> and access it via VA_y without flushing, and so KVM must sync PT_D.  Is that
> correct?

Yes, and another vCPU can then access it via VA_y without flushing.
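
Concretely, the sequence is something like the following (illustration only;
pt_d, pd_e, va_y and guest_pa_of() are invented names for this example, not
code from any real guest):

	typedef unsigned long pte_t;
	#define PTE_P		0x1UL

	extern pte_t pt_d[512];		/* level-1 table PT_D, shadowed and unsync */
	extern pte_t pd_e[512];		/* page directory PD_E, entry j non-present */
	extern unsigned long guest_pa_of(void *va);	/* hypothetical VA->PA helper */

	static void guest_remap(unsigned long new_pfn, int i, int j,
				volatile unsigned char *va_y)
	{
		/* 1. Modify a PTE in PT_D while PT_D is unsync in the shadow MMU. */
		pt_d[i] = (new_pfn << 12) | PTE_P;

		/* 2. Link PT_D under PD_E: a non-present -> present change,
		 *    for which the SDM requires no TLB flush.
		 */
		pd_e[j] = guest_pa_of(pt_d) | PTE_P;

		/* 3. Access the area via VA_y right away.  If KVM linked the
		 *    stale shadow of PT_D without syncing it, this access (on
		 *    this vCPU or another one) sees the old mapping.
		 */
		(void)*va_y;
	}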

> 
>> Fixes: 4731d4c7a077 ("KVM: MMU: out of sync shadow core")
>> Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
>> ---
> 
> ...
> 
>> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
>> index 50ade6450ace..48c7fe1b2d50 100644
>> --- a/arch/x86/kvm/mmu/paging_tmpl.h
>> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
>> @@ -664,7 +664,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
>>    * emulate this operation, return 1 to indicate this case.
>>    */
>>   static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
>> -			 struct guest_walker *gw)
>> +			 struct guest_walker *gw, unsigned long mmu_seq)
>>   {
>>   	struct kvm_mmu_page *sp = NULL;
>>   	struct kvm_shadow_walk_iterator it;
>> @@ -678,6 +678,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
>>   	top_level = vcpu->arch.mmu->root_level;
>>   	if (top_level == PT32E_ROOT_LEVEL)
>>   		top_level = PT32_ROOT_LEVEL;
>> +
>> +again:
>>   	/*
>>   	 * Verify that the top-level gpte is still there.  Since the page
>>   	 * is a root page, it is either write protected (and cannot be
>> @@ -713,8 +715,28 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
>>   		if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
>>   			goto out_gpte_changed;
>>   
>> -		if (sp)
>> +		if (sp) {
>> +			/*
>> +			 * We must synchronize the pagetable before link it
>> +			 * because the guest doens't need to flush tlb when
>> +			 * gpte is changed from non-present to present.
>> +			 * Otherwise, the guest may use the wrong mapping.
>> +			 *
>> +			 * For PG_LEVEL_4K, kvm_mmu_get_page() has already
>> +			 * synchronized it transiently via kvm_sync_page().
>> +			 *
>> +			 * For higher level pagetable, we synchronize it
>> +			 * via slower mmu_sync_children().  If it once
>> +			 * released the mmu_lock, we need to restart from
>> +			 * the root since we don't have reference to @sp.
>> +			 */
>> +			if (sp->unsync_children && !mmu_sync_children(vcpu, sp, false)) {
> 
> I don't like dropping mmu_lock in the page fault path.  I agree that it's not
> all that different than grabbing various things in kvm_mmu_do_page_fault() long
> before acquiring mmu_lock, but I'm not 100% convinced we don't have a latent
> bug hiding somehwere in there :-), and (b) there's a possibility, however small,
> that something in FNAME(fetch) that we're missing.  Case in point, this technically
> needs to do make_mmu_pages_available().
> 
> And I believe kvm_mmu_get_page() already tries to handle this case by requesting
> KVM_REQ_MMU_SYNC if it uses a sp with unsync_children, it just doesn't handle SMP
> interaction, e.g. can link a sp that's immediately available to other vCPUs before
> the sync.
> 
> Rather than force the sync here, what about kicking all vCPUs and retrying the
> page fault?  The only gross part is that kvm_mmu_get_page() can now fail :-(
> 
> ---
>   arch/x86/include/asm/kvm_host.h | 3 ++-
>   arch/x86/kvm/mmu/mmu.c          | 9 +++++++--
>   arch/x86/kvm/mmu/paging_tmpl.h  | 4 ++++
>   3 files changed, 13 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 09b256db394a..332b9fb3454c 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -57,7 +57,8 @@
>   #define KVM_REQ_MIGRATE_TIMER		KVM_ARCH_REQ(0)
>   #define KVM_REQ_REPORT_TPR_ACCESS	KVM_ARCH_REQ(1)
>   #define KVM_REQ_TRIPLE_FAULT		KVM_ARCH_REQ(2)
> -#define KVM_REQ_MMU_SYNC		KVM_ARCH_REQ(3)
> +#define KVM_REQ_MMU_SYNC \
> +	KVM_ARCH_REQ_FLAGS(3, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
>   #define KVM_REQ_CLOCK_UPDATE		KVM_ARCH_REQ(4)
>   #define KVM_REQ_LOAD_MMU_PGD		KVM_ARCH_REQ(5)
>   #define KVM_REQ_EVENT			KVM_ARCH_REQ(6)
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 4853c033e6ce..03293cd3c7ae 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -2143,8 +2143,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
>   			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
>   		}
> 
> -		if (sp->unsync_children)
> -			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
> +		if (sp->unsync_children) {
> +			kvm_make_all_cpus_request(KVM_REQ_MMU_SYNC, vcpu);
> +			return NULL;
> +		}
> 
>   		__clear_sp_write_flooding_count(sp);
> 
> @@ -2999,6 +3001,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> 
>   		sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
>   				      it.level - 1, true, ACC_ALL);
> +		BUG_ON(!sp);
> 
>   		link_shadow_page(vcpu, it.sptep, sp);
>   		if (fault->is_tdp && fault->huge_page_disallowed &&
> @@ -3383,6 +3386,8 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
>   	struct kvm_mmu_page *sp;
> 
>   	sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
> +	BUG_ON(!sp);
> +
>   	++sp->root_count;
> 
>   	return __pa(sp->spt);
> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> index 50ade6450ace..f573d45e2c6f 100644
> --- a/arch/x86/kvm/mmu/paging_tmpl.h
> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> @@ -704,6 +704,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
>   			access = gw->pt_access[it.level - 2];
>   			sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr,
>   					      it.level-1, false, access);
> +			if (!sp)
> +				return RET_PF_RETRY;
>   		}
> 
>   		/*
> @@ -742,6 +744,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
>   		if (!is_shadow_present_pte(*it.sptep)) {
>   			sp = kvm_mmu_get_page(vcpu, base_gfn, fault->addr,
>   					      it.level - 1, true, direct_access);
> +			BUG_ON(!sp);
> +
>   			link_shadow_page(vcpu, it.sptep, sp);
>   			if (fault->huge_page_disallowed &&
>   			    fault->req_level >= it.level)
> --
>
Sean Christopherson Sept. 3, 2021, 4:06 p.m. UTC | #5
On Fri, Sep 03, 2021, Lai Jiangshan wrote:
> 
> On 2021/9/3 07:54, Sean Christopherson wrote:
> > 
> >   trace_get_page:
> > diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> > index 50ade6450ace..5b13918a55c2 100644
> > --- a/arch/x86/kvm/mmu/paging_tmpl.h
> > +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> > @@ -704,6 +704,10 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
> >   			access = gw->pt_access[it.level - 2];
> >   			sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr,
> >   					      it.level-1, false, access);
> > +			if (sp->unsync_children) {
> > +				kvm_make_all_cpus_request(KVM_REQ_MMU_SYNC, vcpu);
> > +				return RET_PF_RETRY;
> 
> Making KVM_REQ_MMU_SYNC be able remotely is good idea.
> But if the sp is not linked, the @sp might not be synced even we
> tried many times. So we should continue to link it.

Hrm, yeah.  The sp has to be linked in at least one mmu, but it may not be linked
in the current mmu, so KVM would have to sync all roots across all current and
previous mmus in order to guarantee the target page is linked.  Eww.
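
Roughly, "sync all roots" would have to look something like the sketch below
(not proposed code, just to show why it gets ugly; kvm_mmu_sync_all_roots_sketch()
is invented and a kvm_mmu_sync_roots() variant for an arbitrary root doesn't exist):

	static void kvm_mmu_sync_all_roots_sketch(struct kvm_vcpu *vcpu)
	{
		struct kvm_mmu *mmu = vcpu->arch.mmu;
		int i;

		/* The currently loaded root: the existing helper covers this. */
		kvm_mmu_sync_roots(vcpu);

		/* Previous roots may still link the unsync sp somewhere. */
		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
			if (!VALID_PAGE(mmu->prev_roots[i].hpa))
				continue;
			/* Would need to walk this root and sync its unsync
			 * children too -- no such helper exists today.
			 */
		}
	}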

> But if we continue to link it, KVM_REQ_MMU_SYNC should be extended to
> sync all roots (current root and prev_roots).  And maybe add a
> KVM_REQ_MMU_SYNC_CURRENT for current root syncing.
> 
> It is not going to be a simple.  I have a new way to sync pages
> and also fix the problem,  but that include several non-fix patches.
> 
> We need to fix this problem in the simplest way.  In my patch
> mmu_sync_children() has a @root argument.  I think we can disallow
> releasing the lock when @root is false. Is it OK?

With a caveat, it should work.  I was exploring that option before the remote
sync idea.

The danger is inducing a stall in the host (RCU, etc...) if sp is an upper level
entry, e.g. with 5-level paging it can even be a PML4.  My thought for that is to
skip the yield if there are fewer than N unsync children remaining, and then bail
out if the caller doesn't allow yielding.  If mmu_sync_children() fails, restart
the guest and go through the entire page fault path.  Worst case scenario, it will
take a "few" rounds for the vCPU to finally resolve the page fault.

Regarding params, please use "can_yield" instead of "root" to match similar logic
in the TDP MMU, and return an int instead of a bool.

Thanks!

---
 arch/x86/kvm/mmu/mmu.c         | 18 ++++++++++++------
 arch/x86/kvm/mmu/paging_tmpl.h |  3 +++
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 4853c033e6ce..5be990cdb2be 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2024,8 +2024,8 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents)
 	} while (!sp->unsync_children);
 }

-static void mmu_sync_children(struct kvm_vcpu *vcpu,
-			      struct kvm_mmu_page *parent)
+static int mmu_sync_children(struct kvm_vcpu *vcpu,
+			     struct kvm_mmu_page *parent, bool can_yield)
 {
 	int i;
 	struct kvm_mmu_page *sp;
@@ -2050,7 +2050,15 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
 			flush |= kvm_sync_page(vcpu, sp, &invalid_list);
 			mmu_pages_clear_parents(&parents);
 		}
-		if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
+		/*
+		 * Don't yield if there are fewer than <N> unsync children
+		 * remaining, just finish up and get out.
+		 */
+		if (parent->unsync_children > SOME_ARBITRARY_THRESHOLD &&
+		    (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock))) {
+			if (!can_yield)
+				return -EINTR;
+
 			kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
 			cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
 			flush = false;
@@ -2058,6 +2066,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
 	}

 	kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+	return 0;
 }

 static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
@@ -2143,9 +2152,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 		}

-		if (sp->unsync_children)
-			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
-
 		__clear_sp_write_flooding_count(sp);

 trace_get_page:
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 50ade6450ace..2ff123ec0d64 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -704,6 +704,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
 			access = gw->pt_access[it.level - 2];
 			sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr,
 					      it.level-1, false, access);
+			if (sp->unsync_children &&
+			    mmu_sync_children(vcpu, sp, false))
+				return RET_PF_RETRY;
 		}

 		/*
--
Lai Jiangshan Sept. 3, 2021, 4:25 p.m. UTC | #6
On 2021/9/4 00:06, Sean Christopherson wrote:

> 
>   trace_get_page:
> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> index 50ade6450ace..2ff123ec0d64 100644
> --- a/arch/x86/kvm/mmu/paging_tmpl.h
> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> @@ -704,6 +704,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
>   			access = gw->pt_access[it.level - 2];
>   			sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr,
>   					      it.level-1, false, access);
> +			if (sp->unsync_children &&
> +			    mmu_sync_children(vcpu, sp, false))
> +				return RET_PF_RETRY;

It was like my first (unsent) fix: just return RET_PF_RETRY when we have to break.

And then I thought that it'd be better to retry the fetch directly rather than
re-enter the guest when the conditions are still valid/unchanged, to avoid
repeating the guest page walk and GUP().  Although the code does not check all
conditions, such as a pending interrupt event (we can add that too).

I think it is a good design to allow breaking mmu_lock when the mmu is handling
heavy work.
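
For reference, "retry fetching directly" is what the posted patch (at the bottom
of this page) does with the again: label; condensed:

	again:
		/* ... re-verify the top-level gpte and redo the shadow walk ... */
		if (sp) {
			if (sp->unsync_children && !mmu_sync_children(vcpu, sp, false)) {
				/*
				 * mmu_lock was dropped inside mmu_sync_children():
				 * if the conditions still hold, redo the fetch
				 * directly instead of returning to the guest.
				 */
				if (mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva))
					goto out_gpte_changed;
				goto again;
			}
			link_shadow_page(vcpu, it.sptep, sp);
		}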


>   		}
> 
>   		/*
> --
>
Lai Jiangshan Sept. 3, 2021, 4:33 p.m. UTC | #7
On 2021/9/4 00:06, Sean Christopherson wrote:

> -static void mmu_sync_children(struct kvm_vcpu *vcpu,
> -			      struct kvm_mmu_page *parent)
> +static int mmu_sync_children(struct kvm_vcpu *vcpu,
> +			     struct kvm_mmu_page *parent, bool can_yield)
>   {
>   	int i;
>   	struct kvm_mmu_page *sp;
> @@ -2050,7 +2050,15 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
>   			flush |= kvm_sync_page(vcpu, sp, &invalid_list);
>   			mmu_pages_clear_parents(&parents);
>   		}
> -		if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
> +		/*
> +		 * Don't yield if there are fewer than <N> unsync children
> +		 * remaining, just finish up and get out.
> +		 */
> +		if (parent->unsync_children > SOME_ARBITRARY_THRESHOLD &&
> +		    (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock))) {
> +			if (!can_yield)
> +				return -EINTR;
> +


Another thought about this function.

It is a courtesy to break when rwlock_needbreak(), but the price is quite high,
since the remote flushing interrupts several pCPUs.  I think we should only break
when need_resched().
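
In code, that suggestion amounts to something like this (sketch of the idea
only, against the hunk quoted above):

		/*
		 * Yield only for need_resched(); skip the rwlock_needbreak()
		 * check so lock waiters don't force the remote-flush cost of
		 * dropping mmu_lock on every contention.
		 */
		if (need_resched()) {
			kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
			cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
			flush = false;
		}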
Sean Christopherson Sept. 3, 2021, 4:40 p.m. UTC | #8
On Sat, Sep 04, 2021, Lai Jiangshan wrote:
> 
> On 2021/9/4 00:06, Sean Christopherson wrote:
> 
> > diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> > index 50ade6450ace..2ff123ec0d64 100644
> > --- a/arch/x86/kvm/mmu/paging_tmpl.h
> > +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> > @@ -704,6 +704,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
> >   			access = gw->pt_access[it.level - 2];
> >   			sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr,
> >   					      it.level-1, false, access);
> > +			if (sp->unsync_children &&
> > +			    mmu_sync_children(vcpu, sp, false))
> > +				return RET_PF_RETRY;
> 
> It was like my first (unsent) fix.  Just return RET_PF_RETRY when break.
> 
> And then I thought that it'd be better to retry fetching directly rather than
> retry guest when the conditions are still valid/unchanged to avoid all the
> next guest page walking and GUP().  Although the code does not check all
> conditions such as interrupt event pending. (we can add that too)

But not in a bug fix that needs to go to stable branches.  
 
> I think it is a good design to allow break mmu_lock when mmu is handling
> heavy work.

I don't disagree in principle, but I question the relevance/need.  I doubt this
code is relevant to nested TDP performance as hypervisors generally don't do the
type of PTE manipulations that would lead to linking an existing unsync sp.  And
for legacy shadow paging, my preference would be to put it into maintenance-only
mode as much as possible.  I'm not dead set against new features/functionality
for shadow paging, but for something like dropping mmu_lock in the page fault path,
IMO there need to be performance numbers to justify such a change.
Lai Jiangshan Sept. 3, 2021, 5 p.m. UTC | #9
On 2021/9/4 00:40, Sean Christopherson wrote:
> On Sat, Sep 04, 2021, Lai Jiangshan wrote:
>>
>> On 2021/9/4 00:06, Sean Christopherson wrote:
>>
>>> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
>>> index 50ade6450ace..2ff123ec0d64 100644
>>> --- a/arch/x86/kvm/mmu/paging_tmpl.h
>>> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
>>> @@ -704,6 +704,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
>>>    			access = gw->pt_access[it.level - 2];
>>>    			sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr,
>>>    					      it.level-1, false, access);
>>> +			if (sp->unsync_children &&
>>> +			    mmu_sync_children(vcpu, sp, false))
>>> +				return RET_PF_RETRY;
>>
>> It was like my first (unsent) fix.  Just return RET_PF_RETRY when break.
>>
>> And then I thought that it'd be better to retry fetching directly rather than
>> retry guest when the conditions are still valid/unchanged to avoid all the
>> next guest page walking and GUP().  Although the code does not check all
>> conditions such as interrupt event pending. (we can add that too)
> 
> But not in a bug fix that needs to go to stable branches.

Good point, it is too complicated for a fix; I accept just "return RET_PF_RETRY"
(and we don't need "SOME_ARBITRARY_THRESHOLD").

Is that OK?  I will update the patch accordingly.

>   
>> I think it is a good design to allow break mmu_lock when mmu is handling
>> heavy work.
> 
> I don't disagree in principle, but I question the relevance/need.  I doubt this
> code is relevant to nested TDP performance as hypervisors generally don't do the
> type of PTE manipulations that would lead to linking an existing unsync sp.  And
> for legacy shadow paging, my preference would be to put it into maintenance-only
> mode as much as possible.  I'm not dead set against new features/functionality
> for shadow paging, but for something like dropping mmu_lock in the page fault path,
> IMO there needs to be performance numbers to justify such a change.
> 

I understood the concern and the relevance/need.
Maxim Levitsky Sept. 13, 2021, 11:30 a.m. UTC | #10
On Thu, 2021-09-02 at 23:40 +0000, Sean Christopherson wrote:
> On Tue, Aug 24, 2021, Lai Jiangshan wrote:
> > From: Lai Jiangshan <laijs@linux.alibaba.com>
> > 
> > If gpte is changed from non-present to present, the guest doesn't need
> > to flush tlb per SDM.  So the host must synchronze sp before
> > link it.  Otherwise the guest might use a wrong mapping.
> > 
> > For example: the guest first changes a level-1 pagetable, and then
> > links its parent to a new place where the original gpte is non-present.
> > Finally the guest can access the remapped area without flushing
> > the tlb.  The guest's behavior should be allowed per SDM, but the host
> > kvm mmu makes it wrong.
> 
> Ah, are you saying, given:
> 
> VA_x = PML4_A -> PDP_B -> PD_C -> PT_D
> 
> the guest can modify PT_D, then link it with
> 
> VA_y = PML4_A -> PDP_B -> PD_E -> PT_D
> 
> and access it via VA_y without flushing, and so KVM must sync PT_D.  Is that
> correct?

Looks like this. However 


> 
> > Fixes: 4731d4c7a077 ("KVM: MMU: out of sync shadow core")
> > Signed-off-by: Lai Jiangshan <laijs@linux.alibaba.com>
> > ---
> 
> ...
> 
> > diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> > index 50ade6450ace..48c7fe1b2d50 100644
> > --- a/arch/x86/kvm/mmu/paging_tmpl.h
> > +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> > @@ -664,7 +664,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
> >   * emulate this operation, return 1 to indicate this case.
> >   */
> >  static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
> > -			 struct guest_walker *gw)
> > +			 struct guest_walker *gw, unsigned long mmu_seq)
> >  {
> >  	struct kvm_mmu_page *sp = NULL;
> >  	struct kvm_shadow_walk_iterator it;
> > @@ -678,6 +678,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
> >  	top_level = vcpu->arch.mmu->root_level;
> >  	if (top_level == PT32E_ROOT_LEVEL)
> >  		top_level = PT32_ROOT_LEVEL;
> > +
> > +again:
> >  	/*
> >  	 * Verify that the top-level gpte is still there.  Since the page
> >  	 * is a root page, it is either write protected (and cannot be
> > @@ -713,8 +715,28 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
> >  		if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
> >  			goto out_gpte_changed;
> >  
> > -		if (sp)
> > +		if (sp) {
> > +			/*
> > +			 * We must synchronize the pagetable before link it
> > +			 * because the guest doens't need to flush tlb when
> > +			 * gpte is changed from non-present to present.
> > +			 * Otherwise, the guest may use the wrong mapping.
> > +			 *
> > +			 * For PG_LEVEL_4K, kvm_mmu_get_page() has already
> > +			 * synchronized it transiently via kvm_sync_page().
> > +			 *
> > +			 * For higher level pagetable, we synchronize it
> > +			 * via slower mmu_sync_children().  If it once
> > +			 * released the mmu_lock, we need to restart from
> > +			 * the root since we don't have reference to @sp.
> > +			 */
> > +			if (sp->unsync_children && !mmu_sync_children(vcpu, sp, false)) {
> 
> I don't like dropping mmu_lock in the page fault path.  I agree that it's not
> all that different than grabbing various things in kvm_mmu_do_page_fault() long
> before acquiring mmu_lock, but I'm not 100% convinced we don't have a latent
> bug hiding somehwere in there :-), and (b) there's a possibility, however small,
> that something in FNAME(fetch) that we're missing.  Case in point, this technically
> needs to do make_mmu_pages_available().
> 
> And I believe kvm_mmu_get_page() already tries to handle this case by requesting
> KVM_REQ_MMU_SYNC if it uses a sp with unsync_children, it just doesn't handle SMP
> interaction, e.g. can link a sp that's immediately available to other vCPUs before
> the sync.
> 
> Rather than force the sync here, what about kicking all vCPUs and retrying the
> page fault?  The only gross part is that kvm_mmu_get_page() can now fail :-(
> 
> ---
>  arch/x86/include/asm/kvm_host.h | 3 ++-
>  arch/x86/kvm/mmu/mmu.c          | 9 +++++++--
>  arch/x86/kvm/mmu/paging_tmpl.h  | 4 ++++
>  3 files changed, 13 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 09b256db394a..332b9fb3454c 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -57,7 +57,8 @@
>  #define KVM_REQ_MIGRATE_TIMER		KVM_ARCH_REQ(0)
>  #define KVM_REQ_REPORT_TPR_ACCESS	KVM_ARCH_REQ(1)
>  #define KVM_REQ_TRIPLE_FAULT		KVM_ARCH_REQ(2)
> -#define KVM_REQ_MMU_SYNC		KVM_ARCH_REQ(3)
> +#define KVM_REQ_MMU_SYNC \
> +	KVM_ARCH_REQ_FLAGS(3, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
>  #define KVM_REQ_CLOCK_UPDATE		KVM_ARCH_REQ(4)
>  #define KVM_REQ_LOAD_MMU_PGD		KVM_ARCH_REQ(5)
>  #define KVM_REQ_EVENT			KVM_ARCH_REQ(6)
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 4853c033e6ce..03293cd3c7ae 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -2143,8 +2143,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
>  			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
>  		}
> 
> -		if (sp->unsync_children)
> -			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
> +		if (sp->unsync_children) {
> +			kvm_make_all_cpus_request(KVM_REQ_MMU_SYNC, vcpu);

I don't know the KVM mmu well, so I am most likely missing something here,
but why switch to kvm_make_all_cpus_request()?

The MMU is shared by all vCPUs, and the process of syncing it should also do
remote TLB flushes when needed?


Another thing I don't fully understand is why this patch is needed. If we link an SP which has unsync
children, we raise KVM_REQ_MMU_SYNC, which I think means that *this* vCPU will sync the whole MMU on next
guest entry, including these unsync child SPs. Could you explain this?

I am just curious, and I would like to understand KVM's mmu better.

Best regards,
	Maxim Levitsky


> +			return NULL;
> +		}
> 
>  		__clear_sp_write_flooding_count(sp);
> 
> @@ -2999,6 +3001,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> 
>  		sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
>  				      it.level - 1, true, ACC_ALL);
> +		BUG_ON(!sp);
> 
>  		link_shadow_page(vcpu, it.sptep, sp);
>  		if (fault->is_tdp && fault->huge_page_disallowed &&
> @@ -3383,6 +3386,8 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
>  	struct kvm_mmu_page *sp;
> 
>  	sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
> +	BUG_ON(!sp);
> +
>  	++sp->root_count;
> 
>  	return __pa(sp->spt);
> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> index 50ade6450ace..f573d45e2c6f 100644
> --- a/arch/x86/kvm/mmu/paging_tmpl.h
> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> @@ -704,6 +704,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
>  			access = gw->pt_access[it.level - 2];
>  			sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr,
>  					      it.level-1, false, access);
> +			if (!sp)
> +				return RET_PF_RETRY;
>  		}
> 
>  		/*
> @@ -742,6 +744,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
>  		if (!is_shadow_present_pte(*it.sptep)) {
>  			sp = kvm_mmu_get_page(vcpu, base_gfn, fault->addr,
>  					      it.level - 1, true, direct_access);
> +			BUG_ON(!sp);
> +
>  			link_shadow_page(vcpu, it.sptep, sp);
>  			if (fault->huge_page_disallowed &&
>  			    fault->req_level >= it.level)
> --
>
Sean Christopherson Sept. 13, 2021, 8:49 p.m. UTC | #11
On Mon, Sep 13, 2021, Maxim Levitsky wrote:
> On Thu, 2021-09-02 at 23:40 +0000, Sean Christopherson wrote:
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index 4853c033e6ce..03293cd3c7ae 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -2143,8 +2143,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> >  			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
> >  		}
> > 
> > -		if (sp->unsync_children)
> > -			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
> > +		if (sp->unsync_children) {
> > +			kvm_make_all_cpus_request(KVM_REQ_MMU_SYNC, vcpu);
> 
> I don't know the KVM mmu well so I miss something here most likely,
> but why to switch to kvm_make_all_cpus_request?
> 
> MMU is shared by all VCPUs, and the process of its syncing should also do
> remote TLB flushes when needed?
> 
> Another thing I don't fully understand is why this patch is needed. If we
> link an SP which has unsync children, we raise KVM_REQ_MMU_SYNC, which I
> think means that *this* vCPU will sync the whole MMU on next guest entry,
> including these unsync child SPs. Could you explain this?

Answering all three questions at once, the problem is that KVM links in a new SP
that points at unsync'd SPs _before_ servicing KVM_REQ_MMU_SYNC.  While the vCPU
is guaranteed to service KVM_REQ_MMU_SYNC before entering the guest, that doesn't
hold true for other vCPUs.  As a result, there's a window where a different vCPU
can consume the stale, unsync SP via the new SP.
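
In timeline form (illustrative ordering, using the VA_y example from earlier in
the thread):

	vCPU0 (page fault path)                vCPU1
	-----------------------                -----
	kvm_mmu_get_page() returns an sp
	  with unsync_children and queues
	  KVM_REQ_MMU_SYNC on vCPU0 only
	link_shadow_page() makes the sp
	  visible to all vCPUs
	                                       walks the new sp, reaches the
	                                       stale, unsync child and uses
	                                       the old mapping via VA_y
	vCPU0 services KVM_REQ_MMU_SYNC
	  and syncs -- too late for vCPU1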
Maxim Levitsky Sept. 13, 2021, 10:31 p.m. UTC | #12
On Mon, 2021-09-13 at 20:49 +0000, Sean Christopherson wrote:
> On Mon, Sep 13, 2021, Maxim Levitsky wrote:
> > On Thu, 2021-09-02 at 23:40 +0000, Sean Christopherson wrote:
> > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > index 4853c033e6ce..03293cd3c7ae 100644
> > > --- a/arch/x86/kvm/mmu/mmu.c
> > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > @@ -2143,8 +2143,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> > >  			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
> > >  		}
> > > 
> > > -		if (sp->unsync_children)
> > > -			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
> > > +		if (sp->unsync_children) {
> > > +			kvm_make_all_cpus_request(KVM_REQ_MMU_SYNC, vcpu);
> > 
> > I don't know the KVM mmu well so I miss something here most likely,
> > but why to switch to kvm_make_all_cpus_request?
> > 
> > MMU is shared by all VCPUs, and the process of its syncing should also do
> > remote TLB flushes when needed?
> > 
> > Another thing I don't fully understand is why this patch is needed. If we
> > link an SP which has unsync children, we raise KVM_REQ_MMU_SYNC, which I
> > think means that *this* vCPU will sync the whole MMU on next guest entry,
> > including these unsync child SPs. Could you explain this?
> 
> Answering all three questions at once, the problem is that KVM links in a new SP
> that points at unsync'd SPs _before_ servicing KVM_REQ_MMU_SYNC.  While the vCPU
> is guaranteed to service KVM_REQ_MMU_SYNC before entering the guest, that doesn't
> hold true for other vCPUs.  As a result, there's a window where a different vCPU
> can consume the stale, unsync SP via the new SP.
> 
Thank you, now I understand!

Best regards,
	Maxim Levitsky

Patch

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 313918df1a10..987953a901d2 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2032,8 +2032,9 @@  static void mmu_pages_clear_parents(struct mmu_page_path *parents)
 	} while (!sp->unsync_children);
 }
 
-static void mmu_sync_children(struct kvm_vcpu *vcpu,
-			      struct kvm_mmu_page *parent)
+static bool mmu_sync_children(struct kvm_vcpu *vcpu,
+			      struct kvm_mmu_page *parent,
+			      bool root)
 {
 	int i;
 	struct kvm_mmu_page *sp;
@@ -2061,11 +2062,20 @@  static void mmu_sync_children(struct kvm_vcpu *vcpu,
 		if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
 			kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
 			cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
+			/*
+			 * If @parent is not a root, the caller doesn't hold
+			 * any reference to it, so we cannot access @parent
+			 * and continue synchronizing after the mmu_lock
+			 * has been released.
+			 */
+			if (!root)
+				return false;
 			flush = false;
 		}
 	}
 
 	kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+	return true;
 }
 
 static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
@@ -2151,9 +2161,6 @@  static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
 		}
 
-		if (sp->unsync_children)
-			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
-
 		__clear_sp_write_flooding_count(sp);
 
 trace_get_page:
@@ -3650,7 +3657,7 @@  void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 		write_lock(&vcpu->kvm->mmu_lock);
 		kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
 
-		mmu_sync_children(vcpu, sp);
+		mmu_sync_children(vcpu, sp, true);
 
 		kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
 		write_unlock(&vcpu->kvm->mmu_lock);
@@ -3666,7 +3673,7 @@  void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 		if (IS_VALID_PAE_ROOT(root)) {
 			root &= PT64_BASE_ADDR_MASK;
 			sp = to_shadow_page(root);
-			mmu_sync_children(vcpu, sp);
+			mmu_sync_children(vcpu, sp, true);
 		}
 	}
 
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 50ade6450ace..48c7fe1b2d50 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -664,7 +664,7 @@  static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
  * emulate this operation, return 1 to indicate this case.
  */
 static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
-			 struct guest_walker *gw)
+			 struct guest_walker *gw, unsigned long mmu_seq)
 {
 	struct kvm_mmu_page *sp = NULL;
 	struct kvm_shadow_walk_iterator it;
@@ -678,6 +678,8 @@  static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
 	top_level = vcpu->arch.mmu->root_level;
 	if (top_level == PT32E_ROOT_LEVEL)
 		top_level = PT32_ROOT_LEVEL;
+
+again:
 	/*
 	 * Verify that the top-level gpte is still there.  Since the page
 	 * is a root page, it is either write protected (and cannot be
@@ -713,8 +715,28 @@  static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
 		if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
 			goto out_gpte_changed;
 
-		if (sp)
+		if (sp) {
+			/*
+			 * We must synchronize the pagetable before linking it
+			 * because the guest doesn't need to flush the TLB when
+			 * a gpte is changed from non-present to present.
+			 * Otherwise, the guest may use the wrong mapping.
+			 *
+			 * For PG_LEVEL_4K, kvm_mmu_get_page() has already
+			 * synchronized it transiently via kvm_sync_page().
+			 *
+			 * For a higher level pagetable, we synchronize it
+			 * via the slower mmu_sync_children().  If it had to
+			 * release the mmu_lock, we need to restart from
+			 * the root since we don't hold a reference to @sp.
+			 */
+			if (sp->unsync_children && !mmu_sync_children(vcpu, sp, false)) {
+				if (mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva))
+					goto out_gpte_changed;
+				goto again;
+			}
 			link_shadow_page(vcpu, it.sptep, sp);
+		}
 	}
 
 	kvm_mmu_hugepage_adjust(vcpu, fault);
@@ -905,7 +927,7 @@  static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	r = make_mmu_pages_available(vcpu);
 	if (r)
 		goto out_unlock;
-	r = FNAME(fetch)(vcpu, fault, &walker);
+	r = FNAME(fetch)(vcpu, fault, &walker, mmu_seq);
 	kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
 
 out_unlock: