diff mbox

[2/5] KVM: MMU: simplify mmu_set_spte

Message ID 5097ACA0.7080408@linux.vnet.ibm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Xiao Guangrong Nov. 5, 2012, 12:10 p.m. UTC
In order to detect spte remapping, we can simply check whether the
spte already points to the pfn. This works even if the spte is not the
last-level spte, because a middle spte points to a kernel pfn, which can
not be mapped to userspace.

Also, update slot and stat.lpages iff the spte is not remapped

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 arch/x86/kvm/mmu.c |   40 +++++++++++++---------------------------
 1 files changed, 13 insertions(+), 27 deletions(-)

Comments

Marcelo Tosatti Nov. 12, 2012, 11:12 p.m. UTC | #1
On Mon, Nov 05, 2012 at 08:10:08PM +0800, Xiao Guangrong wrote:
> In order to detecting spte remapping, we can simply check whether the
> spte has already been pointing to the pfn even if the spte is not the
> last spte for middle spte is pointing to the kernel pfn which can not
> be mapped to userspace
> 
> Also, update slot and stat.lpages iff the spte is not remapped
> 
> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> ---
>  arch/x86/kvm/mmu.c |   40 +++++++++++++---------------------------
>  1 files changed, 13 insertions(+), 27 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 692ebb1..4ea731e 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -2420,8 +2420,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>  			 pfn_t pfn, bool speculative,
>  			 bool host_writable)
>  {
> -	int was_rmapped = 0;
> -	int rmap_count;
> +	bool was_rmapped = false;
> 
>  	pgprintk("%s: spte %llx access %x write_fault %d"
>  		 " user_fault %d gfn %llx\n",
> @@ -2429,25 +2428,13 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>  		 write_fault, user_fault, gfn);
> 
>  	if (is_rmap_spte(*sptep)) {
> -		/*
> -		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
> -		 * the parent of the now unreachable PTE.
> -		 */
> -		if (level > PT_PAGE_TABLE_LEVEL &&
> -		    !is_large_pte(*sptep)) {
> -			struct kvm_mmu_page *child;
> -			u64 pte = *sptep;
> +		if (pfn != spte_to_pfn(*sptep)) {
> +			struct kvm_mmu_page *sp = page_header(__pa(sptep));
> 
> -			child = page_header(pte & PT64_BASE_ADDR_MASK);
> -			drop_parent_pte(child, sptep);
> -			kvm_flush_remote_tlbs(vcpu->kvm);

How come it's safe to drop this case?

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiao Guangrong Nov. 13, 2012, 8:39 a.m. UTC | #2
On 11/13/2012 07:12 AM, Marcelo Tosatti wrote:
> On Mon, Nov 05, 2012 at 08:10:08PM +0800, Xiao Guangrong wrote:
>> In order to detecting spte remapping, we can simply check whether the
>> spte has already been pointing to the pfn even if the spte is not the
>> last spte for middle spte is pointing to the kernel pfn which can not
>> be mapped to userspace
>>
>> Also, update slot and stat.lpages iff the spte is not remapped
>>
>> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
>> ---
>>  arch/x86/kvm/mmu.c |   40 +++++++++++++---------------------------
>>  1 files changed, 13 insertions(+), 27 deletions(-)
>>
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index 692ebb1..4ea731e 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -2420,8 +2420,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>>  			 pfn_t pfn, bool speculative,
>>  			 bool host_writable)
>>  {
>> -	int was_rmapped = 0;
>> -	int rmap_count;
>> +	bool was_rmapped = false;
>>
>>  	pgprintk("%s: spte %llx access %x write_fault %d"
>>  		 " user_fault %d gfn %llx\n",
>> @@ -2429,25 +2428,13 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>>  		 write_fault, user_fault, gfn);
>>
>>  	if (is_rmap_spte(*sptep)) {
>> -		/*
>> -		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
>> -		 * the parent of the now unreachable PTE.
>> -		 */
>> -		if (level > PT_PAGE_TABLE_LEVEL &&
>> -		    !is_large_pte(*sptep)) {
>> -			struct kvm_mmu_page *child;
>> -			u64 pte = *sptep;
>> +		if (pfn != spte_to_pfn(*sptep)) {
>> +			struct kvm_mmu_page *sp = page_header(__pa(sptep));
>>
>> -			child = page_header(pte & PT64_BASE_ADDR_MASK);
>> -			drop_parent_pte(child, sptep);
>> -			kvm_flush_remote_tlbs(vcpu->kvm);
> 
> How come its safe to drop this case?

We use "if (pfn != spte_to_pfn(*sptep))" to simplify things.
There are two cases:
1) the sptep is not the last mapping.
   In this case, sptep must point to a shadow page table, which means
   spte_to_pfn(*sptep) is used by the KVM module, while 'pfn' is used by userspace.
   So the 'if' condition must be satisfied, and the sptep will be dropped.

   Actually, this is the original case:
  | if (level > PT_PAGE_TABLE_LEVEL &&
  |	    !is_large_pte(*sptep))

2) the sptep is the last mapping.
   In this case, the level of the spte (sp.level) must equal the 'level' that
   we pass to mmu_set_spte. If they point to the same pfn, it is a 'remap'; otherwise
   we drop it.

I think this is safe. :)

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Nov. 20, 2012, 10:18 p.m. UTC | #3
On Tue, Nov 13, 2012 at 04:39:44PM +0800, Xiao Guangrong wrote:
> On 11/13/2012 07:12 AM, Marcelo Tosatti wrote:
> > On Mon, Nov 05, 2012 at 08:10:08PM +0800, Xiao Guangrong wrote:
> >> In order to detecting spte remapping, we can simply check whether the
> >> spte has already been pointing to the pfn even if the spte is not the
> >> last spte for middle spte is pointing to the kernel pfn which can not
> >> be mapped to userspace
> >>
> >> Also, update slot and stat.lpages iff the spte is not remapped
> >>
> >> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> >> ---
> >>  arch/x86/kvm/mmu.c |   40 +++++++++++++---------------------------
> >>  1 files changed, 13 insertions(+), 27 deletions(-)
> >>
> >> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> >> index 692ebb1..4ea731e 100644
> >> --- a/arch/x86/kvm/mmu.c
> >> +++ b/arch/x86/kvm/mmu.c
> >> @@ -2420,8 +2420,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
> >>  			 pfn_t pfn, bool speculative,
> >>  			 bool host_writable)
> >>  {
> >> -	int was_rmapped = 0;
> >> -	int rmap_count;
> >> +	bool was_rmapped = false;
> >>
> >>  	pgprintk("%s: spte %llx access %x write_fault %d"
> >>  		 " user_fault %d gfn %llx\n",
> >> @@ -2429,25 +2428,13 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
> >>  		 write_fault, user_fault, gfn);
> >>
> >>  	if (is_rmap_spte(*sptep)) {
> >> -		/*
> >> -		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
> >> -		 * the parent of the now unreachable PTE.
> >> -		 */
> >> -		if (level > PT_PAGE_TABLE_LEVEL &&
> >> -		    !is_large_pte(*sptep)) {
> >> -			struct kvm_mmu_page *child;
> >> -			u64 pte = *sptep;
> >> +		if (pfn != spte_to_pfn(*sptep)) {
> >> +			struct kvm_mmu_page *sp = page_header(__pa(sptep));
> >>
> >> -			child = page_header(pte & PT64_BASE_ADDR_MASK);
> >> -			drop_parent_pte(child, sptep);
> >> -			kvm_flush_remote_tlbs(vcpu->kvm);
> > 
> > How come its safe to drop this case?
> 
> We use "if (pfn != spte_to_pfn(*sptep))" to simplify the thing.
> There are two cases:
> 1) the sptep is not the last mapping.
>    under this case, sptep must point to a shadow page table, that means
>    spte_to_pfn(*sptep)) is used by KVM module, and 'pfn' is used by userspace.
>    so, 'if' condition must be satisfied, the sptep will be dropped.
> 
>    Actually, This is the origin case:
>   | if (level > PT_PAGE_TABLE_LEVEL &&
>   |	    !is_large_pte(*sptep))"
> 
> 2) the sptep is the last mapping.
>    under this case, the level of spte (sp.level) must equal the 'level' which
>    we pass to mmu_set_spte. If they point to the same pfn, it is 'remap', otherwise
>    we drop it.
> 
> I think this is safe. :)

mmu_page_zap_pte takes care of it, OK.

What if was_rmapped=true but gfn is different? Say the spte comes
from an unsync shadow page, the guest modifies that shadow page (but
does not invalidate it with invlpg), then faults. The new gfn can still
point to the same pfn, but in that case, with your patch,
page_header_update_slot is not called.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiao Guangrong Nov. 20, 2012, 11:23 p.m. UTC | #4
On 11/21/2012 06:18 AM, Marcelo Tosatti wrote:

>>>> -			child = page_header(pte & PT64_BASE_ADDR_MASK);
>>>> -			drop_parent_pte(child, sptep);
>>>> -			kvm_flush_remote_tlbs(vcpu->kvm);
>>>
>>> How come its safe to drop this case?
>>
>> We use "if (pfn != spte_to_pfn(*sptep))" to simplify the thing.
>> There are two cases:
>> 1) the sptep is not the last mapping.
>>    under this case, sptep must point to a shadow page table, that means
>>    spte_to_pfn(*sptep)) is used by KVM module, and 'pfn' is used by userspace.
>>    so, 'if' condition must be satisfied, the sptep will be dropped.
>>
>>    Actually, This is the origin case:
>>   | if (level > PT_PAGE_TABLE_LEVEL &&
>>   |	    !is_large_pte(*sptep))"
>>
>> 2) the sptep is the last mapping.
>>    under this case, the level of spte (sp.level) must equal the 'level' which
>>    we pass to mmu_set_spte. If they point to the same pfn, it is 'remap', otherwise
>>    we drop it.
>>
>> I think this is safe. :)
> 
> mmu_page_zap_pte takes care of it, OK.
> 
> What if was_rmapped=true but gfn is different? Say if the spte comes
> from an unsync shadow page, the guest modifies that shadow page (but
> does not invalidate it with invlpg), then faults. gfn can still point
> to the same gfn (but in that case, with your patch,
> page_header_update_slot is not called.

Marcelo,

The page fault path and the other sync/prefetch paths will reread the guest page
table, and then they get a different target pfn.

The scenario is like this:

gfn1 = pfn1, gfn2 = pfn2
gpte = pfn1, spte is shadowed by gpte and it is a unsync spte

Guest                               Host
                                     spte = (gfn1, pfn1)

modify gpte to let it point to gfn2
                                    spte = (gfn1, pfn1)
page-fault on gpte
                                    intercept the page-fault, then
                                    want to update spte to (gfn2, pfn2)

                                    in mmu_set_spte, we can detect
                                    pfn2 != pfn1, then drop it.

Hmm, the interesting question is what happens if different gfns map to the same pfn.
For example, spte1 is shadowed by gfn1 and spte2 is shadowed by gfn2, and both
gfn1 and gfn2 map to pfn; the code (including the current code) will set
spte1 to gfn2's rmap and spte2 to gfn1's rmap. But I think it is ok.




--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Nov. 20, 2012, 11:51 p.m. UTC | #5
On Wed, Nov 21, 2012 at 07:23:26AM +0800, Xiao Guangrong wrote:
> On 11/21/2012 06:18 AM, Marcelo Tosatti wrote:
> 
> >>>> -			child = page_header(pte & PT64_BASE_ADDR_MASK);
> >>>> -			drop_parent_pte(child, sptep);
> >>>> -			kvm_flush_remote_tlbs(vcpu->kvm);
> >>>
> >>> How come its safe to drop this case?
> >>
> >> We use "if (pfn != spte_to_pfn(*sptep))" to simplify the thing.
> >> There are two cases:
> >> 1) the sptep is not the last mapping.
> >>    under this case, sptep must point to a shadow page table, that means
> >>    spte_to_pfn(*sptep)) is used by KVM module, and 'pfn' is used by userspace.
> >>    so, 'if' condition must be satisfied, the sptep will be dropped.
> >>
> >>    Actually, This is the origin case:
> >>   | if (level > PT_PAGE_TABLE_LEVEL &&
> >>   |	    !is_large_pte(*sptep))"
> >>
> >> 2) the sptep is the last mapping.
> >>    under this case, the level of spte (sp.level) must equal the 'level' which
> >>    we pass to mmu_set_spte. If they point to the same pfn, it is 'remap', otherwise
> >>    we drop it.
> >>
> >> I think this is safe. :)
> > 
> > mmu_page_zap_pte takes care of it, OK.
> > 
> > What if was_rmapped=true but gfn is different? Say if the spte comes
> > from an unsync shadow page, the guest modifies that shadow page (but
> > does not invalidate it with invlpg), then faults. gfn can still point
> > to the same gfn (but in that case, with your patch,
> > page_header_update_slot is not called.
> 
> Marcelo,
> 
> Page fault path and other sync/prefetch paths will reread guest page table,
> then it get a different target pfn.
> 
> The scenario is like this:
> 
> gfn1 = pfn1, gfn2 = pfn2
> gpte = pfn1, spte is shadowed by gpte and it is a unsync spte
> 
> Guest                               Host
>                                      spte = (gfn1, pfn1)
> 
> modify gpte to let it point to gfn2
>                                     spte = (gfn1, pfn1)
> page-fault on gpte
>                                     intercept the page-fault, then
>                                     want to update spte to (gfn2, pfn2)
> 
>                                     in mmu_set_spte, we can detect
>                                     pfn2 != pfn1, then drop it.
> 
> Hmm, the interesting thing is what if different gfns map to the same pfn.
> For example, spte1 is shadowed by gfn1 and spte2 is shadowed by pfn2, both
> gfn1 and gfn2 map to pfn, the code (including the current code) will set
> spte1 to the gfn2's rmap and spte2 to the gfn1's rmap. But i think it is ok.

Current code updates gfn properly in set_spte by
page_header_update_slot. 

Better keep state properly.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiao Guangrong Nov. 21, 2012, 3:19 a.m. UTC | #6
On 11/21/2012 07:51 AM, Marcelo Tosatti wrote:
> On Wed, Nov 21, 2012 at 07:23:26AM +0800, Xiao Guangrong wrote:
>> On 11/21/2012 06:18 AM, Marcelo Tosatti wrote:
>>
>>>>>> -			child = page_header(pte & PT64_BASE_ADDR_MASK);
>>>>>> -			drop_parent_pte(child, sptep);
>>>>>> -			kvm_flush_remote_tlbs(vcpu->kvm);
>>>>>
>>>>> How come its safe to drop this case?
>>>>
>>>> We use "if (pfn != spte_to_pfn(*sptep))" to simplify the thing.
>>>> There are two cases:
>>>> 1) the sptep is not the last mapping.
>>>>    under this case, sptep must point to a shadow page table, that means
>>>>    spte_to_pfn(*sptep)) is used by KVM module, and 'pfn' is used by userspace.
>>>>    so, 'if' condition must be satisfied, the sptep will be dropped.
>>>>
>>>>    Actually, This is the origin case:
>>>>   | if (level > PT_PAGE_TABLE_LEVEL &&
>>>>   |	    !is_large_pte(*sptep))"
>>>>
>>>> 2) the sptep is the last mapping.
>>>>    under this case, the level of spte (sp.level) must equal the 'level' which
>>>>    we pass to mmu_set_spte. If they point to the same pfn, it is 'remap', otherwise
>>>>    we drop it.
>>>>
>>>> I think this is safe. :)
>>>
>>> mmu_page_zap_pte takes care of it, OK.
>>>
>>> What if was_rmapped=true but gfn is different? Say if the spte comes
>>> from an unsync shadow page, the guest modifies that shadow page (but
>>> does not invalidate it with invlpg), then faults. gfn can still point
>>> to the same gfn (but in that case, with your patch,
>>> page_header_update_slot is not called.
>>
>> Marcelo,
>>
>> Page fault path and other sync/prefetch paths will reread guest page table,
>> then it get a different target pfn.
>>
>> The scenario is like this:
>>
>> gfn1 = pfn1, gfn2 = pfn2
>> gpte = pfn1, spte is shadowed by gpte and it is a unsync spte
>>
>> Guest                               Host
>>                                      spte = (gfn1, pfn1)
>>
>> modify gpte to let it point to gfn2
>>                                     spte = (gfn1, pfn1)
>> page-fault on gpte
>>                                     intercept the page-fault, then
>>                                     want to update spte to (gfn2, pfn2)
>>
>>                                     in mmu_set_spte, we can detect
>>                                     pfn2 != pfn1, then drop it.
>>
>> Hmm, the interesting thing is what if different gfns map to the same pfn.
>> For example, spte1 is shadowed by gfn1 and spte2 is shadowed by pfn2, both
>> gfn1 and gfn2 map to pfn, the code (including the current code) will set
>> spte1 to the gfn2's rmap and spte2 to the gfn1's rmap. But i think it is ok.
> 
> Current code updates gfn properly in set_spte by
> page_header_update_slot. 
> 
> Better keep state properly.

Okay, I will not change the position of page_header_update_slot in the
next version. Thank you, Marcelo!


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 692ebb1..4ea731e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2420,8 +2420,7 @@  static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 			 pfn_t pfn, bool speculative,
 			 bool host_writable)
 {
-	int was_rmapped = 0;
-	int rmap_count;
+	bool was_rmapped = false;

 	pgprintk("%s: spte %llx access %x write_fault %d"
 		 " user_fault %d gfn %llx\n",
@@ -2429,25 +2428,13 @@  static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		 write_fault, user_fault, gfn);

 	if (is_rmap_spte(*sptep)) {
-		/*
-		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
-		 * the parent of the now unreachable PTE.
-		 */
-		if (level > PT_PAGE_TABLE_LEVEL &&
-		    !is_large_pte(*sptep)) {
-			struct kvm_mmu_page *child;
-			u64 pte = *sptep;
+		if (pfn != spte_to_pfn(*sptep)) {
+			struct kvm_mmu_page *sp = page_header(__pa(sptep));

-			child = page_header(pte & PT64_BASE_ADDR_MASK);
-			drop_parent_pte(child, sptep);
-			kvm_flush_remote_tlbs(vcpu->kvm);
-		} else if (pfn != spte_to_pfn(*sptep)) {
-			pgprintk("hfn old %llx new %llx\n",
-				 spte_to_pfn(*sptep), pfn);
-			drop_spte(vcpu->kvm, sptep);
-			kvm_flush_remote_tlbs(vcpu->kvm);
+			if (mmu_page_zap_pte(vcpu->kvm, sp, sptep))
+				kvm_flush_remote_tlbs(vcpu->kvm);
 		} else
-			was_rmapped = 1;
+			was_rmapped = true;
 	}

 	if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
@@ -2466,16 +2453,15 @@  static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		 is_large_pte(*sptep)? "2MB" : "4kB",
 		 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
 		 *sptep, sptep);
-	if (!was_rmapped && is_large_pte(*sptep))
-		++vcpu->kvm->stat.lpages;

-	if (is_shadow_present_pte(*sptep)) {
+	if (is_shadow_present_pte(*sptep) && !was_rmapped) {
+		if (is_large_pte(*sptep))
+			++vcpu->kvm->stat.lpages;
+
 		page_header_update_slot(vcpu->kvm, sptep, gfn);
-		if (!was_rmapped) {
-			rmap_count = rmap_add(vcpu, sptep, gfn);
-			if (rmap_count > RMAP_RECYCLE_THRESHOLD)
-				rmap_recycle(vcpu, sptep, gfn);
-		}
+
+		if (rmap_add(vcpu, sptep, gfn) > RMAP_RECYCLE_THRESHOLD)
+			rmap_recycle(vcpu, sptep, gfn);
 	}

 	kvm_release_pfn_clean(pfn);