KVM: Defer remote tlb flushes on invlpg (v2)

Message ID 1237201670-5572-1-git-send-email-avi@redhat.com (mailing list archive)
State Not Applicable

Commit Message

Avi Kivity March 16, 2009, 11:07 a.m. UTC
KVM flushes tlbs on remote cpus for two purposes: to protect guest pages
that it needs to collect information about, and to prevent stale tlb entries
from pointing to pages that no longer belong to the guest.

We can defer the latter flushes to the point when we actually free the pages,
which is during an mmu notifier invocation.  To this end, we add a new state
remote_tlbs_dirty which marks whether the guest tlb might be inconsistent
with the shadow page tables.  Whenever we do a conditional flush of
remote tlbs, we check this state, and if the remote tlbs are dirty we flush
them to ensure no inconsistency.

[v2: add helper kvm_flush_remote_tlbs_cond() to remove the need for callers
     to care about the new logic]

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |    1 -
 arch/x86/kvm/mmu.c              |   14 ++++++++------
 arch/x86/kvm/paging_tmpl.h      |    6 ++++--
 include/linux/kvm_host.h        |    2 ++
 virt/kvm/kvm_main.c             |   18 ++++++++++++++++--
 5 files changed, 30 insertions(+), 11 deletions(-)
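
For orientation, the caller-side pattern from the kvm_sync_page() hunk in the
patch below shows how the new helper is meant to be used: callers pass the
result of rmap_write_protect(), and the helper flushes either when that is
true or when a deferred invlpg has left kvm->remote_tlbs_dirty set.

	/* before: flush only when write protection actually changed sptes */
	if (rmap_write_protect(vcpu->kvm, sp->gfn))
		kvm_flush_remote_tlbs(vcpu->kvm);

	/* after: also flush when a deferred guest invlpg is still pending */
	need_flush = rmap_write_protect(vcpu->kvm, sp->gfn);
	kvm_flush_remote_tlbs_cond(vcpu->kvm, need_flush);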

Comments

Marcelo Tosatti March 17, 2009, 7:47 p.m. UTC | #1
On Mon, Mar 16, 2009 at 01:07:50PM +0200, Avi Kivity wrote:
> KVM flushes tlbs on remote cpus for two purposes: to protect guest pages
> that it needs to collect information about, and to prevent stale tlb entries
> from pointing to pages that no longer belong to the guest.
> 
> We can defer the latter flushes to the point when we actually free the pages,
> which is during an mmu notifier invocation.  To this end, we add a new state
> remote_tlbs_dirty which marks whether the guest tlb might be inconsistent
> with the shadow page tables.  Whenever we do a conditional flush of
> remote tlbs, we check this state, and if the remote tlbs are dirty we flush
> them to ensure no inconsistency.
> 
> [v2: add helper kvm_flush_remote_tlbs_cond() to remove the need for callers
>      to care about the new logic]

Looks good, but Andrea should ack.


Andi Kleen March 18, 2009, 12:41 a.m. UTC | #2
Avi Kivity <avi@redhat.com> writes:

> KVM flushes tlbs on remote cpus for two purposes: to protect guest pages
> that it needs to collect information about, and to prevent stale tlb entries
> from pointing to pages that no longer belong to the guest.

I assume you have numbers from some benchmark showing how much this
improves performance. It would be nice to add them to the
commit log.

-Andi
Avi Kivity March 18, 2009, 6:23 a.m. UTC | #3
Andi Kleen wrote:
>> KVM flushes tlbs on remote cpus for two purposes: to protect guest pages
>> that it needs to collect information about, and to prevent stale tlb entries
>> from pointing to pages that no longer belong to the guest.
>>     
>
> I assume you have numbers from some benchmark showing how much this
> improves performance. It would be nice to add them to the
> commit log.
>   

Your assumption is incorrect; I did not benchmark this patch.
Andrea Arcangeli March 18, 2009, 10:53 p.m. UTC | #4
On Mon, Mar 16, 2009 at 01:07:50PM +0200, Avi Kivity wrote:
> KVM flushes tlbs on remote cpus for two purposes: to protect guest pages
> that it needs to collect information about, and to prevent stale tlb entries
> from pointing to pages that no longer belong to the guest.
> 
> We can defer the latter flushes to the point when we actually free the pages,
> which is during an mmu notifier invocation.  To this end, we add a new state
> remote_tlbs_dirty which marks whether the guest tlb might be inconsistent
> with the shadow page tables.  Whenever we do a conditional flush of
> remote tlbs, we check this state, and if the remote tlbs are dirty we flush
> them to ensure no inconsistency.
> 
> [v2: add helper kvm_flush_remote_tlbs_cond() to remove the need for callers
>      to care about the new logic]

The idea of using the mmu notifier to avoid an smp tlb flush for a
guest-local tlb flush is very cute indeed.

>  static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
>  {
> +	bool need_flush;
> +
>  	if (sp->role.glevels != vcpu->arch.mmu.root_level) {
>  		kvm_mmu_zap_page(vcpu->kvm, sp);
>  		return 1;
>  	}
>  
> -	if (rmap_write_protect(vcpu->kvm, sp->gfn))
> -		kvm_flush_remote_tlbs(vcpu->kvm);
> +	need_flush = rmap_write_protect(vcpu->kvm, sp->gfn);
> +	kvm_flush_remote_tlbs_cond(vcpu->kvm, need_flush);

Why exactly do we need to flush remote and local tlbs here (in the cr3
overwrite path, which is a local flush) if no change happened to sptes
related to the current process? How is it relevant that a previous
invlpg has run and we only did a local flush at the time the invlpg ran?

> @@ -1184,8 +1186,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
>  		for_each_sp(pages, sp, parents, i)
>  			protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
>  
> -		if (protected)
> -			kvm_flush_remote_tlbs(vcpu->kvm);
> +		kvm_flush_remote_tlbs_cond(vcpu->kvm, protected);
>  
>  		for_each_sp(pages, sp, parents, i) {
>  			kvm_sync_page(vcpu, sp);

Same here.

> @@ -1251,8 +1253,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
>  	sp->global = role.cr4_pge;
>  	hlist_add_head(&sp->hash_link, bucket);
>  	if (!direct) {
> -		if (rmap_write_protect(vcpu->kvm, gfn))
> -			kvm_flush_remote_tlbs(vcpu->kvm);
> +		need_flush = rmap_write_protect(vcpu->kvm, gfn);
> +		kvm_flush_remote_tlbs_cond(vcpu->kvm, need_flush);
>  		account_shadowed(vcpu->kvm, gfn);
>  	}
>  	if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)

Same here.

> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index 855eb71..18abdf9 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -475,8 +475,10 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
>  			break;
>  	}
>  
> -	if (need_flush)
> -		kvm_flush_remote_tlbs(vcpu->kvm);
> +	if (need_flush) {
> +		vcpu->kvm->remote_tlbs_dirty = true;
> +		kvm_x86_ops->tlb_flush(vcpu);
> +	}
>  	spin_unlock(&vcpu->kvm->mmu_lock);
>  
>  	if (pte_gpa == -1)

I think it'd be more consistent to set_bit(KVM_REQ_TLB_FLUSH,
&vcpu->requests) instead of invoking tlb_flush from a different place.
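
A sketch of that variant of the FNAME(invlpg) tail (not the posted patch; the
deferral mark is unchanged, only the local flush is queued through
vcpu->requests):

	if (need_flush) {
		vcpu->kvm->remote_tlbs_dirty = true;
		set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
	}
	spin_unlock(&vcpu->kvm->mmu_lock);

Avi agrees below that going through the request bit can also fold an IPI.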

> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 68b217e..2ee6a6d 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -758,10 +758,22 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
>  
>  void kvm_flush_remote_tlbs(struct kvm *kvm)
>  {
> +	kvm->remote_tlbs_dirty = false;
> +	wmb();
>  	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
>  		++kvm->stat.remote_tlb_flush;
>  }
>  
> +void kvm_flush_remote_tlbs_cond(struct kvm *kvm, bool cond)
> +{
> +	if (!cond) {
> +		rmb();
> +		cond = kvm->remote_tlbs_dirty;
> +	}
> +	if (cond)
> +		kvm_flush_remote_tlbs(kvm);
> +}
> +
>  void kvm_reload_remote_mmus(struct kvm *kvm)
>  {
>  	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
> @@ -840,8 +852,9 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
>  	need_tlb_flush = kvm_unmap_hva(kvm, address);
>  	spin_unlock(&kvm->mmu_lock);
>  
> +	rmb();
>  	/* we've to flush the tlb before the pages can be freed */
> -	if (need_tlb_flush)
> +	if (need_tlb_flush || kvm->remote_tlbs_dirty)
>  		kvm_flush_remote_tlbs(kvm);
>  
>  }
> @@ -865,8 +878,9 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
>  		need_tlb_flush |= kvm_unmap_hva(kvm, start);
>  	spin_unlock(&kvm->mmu_lock);
>  
> +	rmb();
>  	/* we've to flush the tlb before the pages can be freed */
> -	if (need_tlb_flush)
> +	if (need_tlb_flush || kvm->remote_tlbs_dirty)
>  		kvm_flush_remote_tlbs(kvm);
>  }

I think the only memory barrier required is a smp_mb() between setting
remote_tlbs_dirty true and make_all_cpus_request.

All other memory barriers seem unnecessary. It can't be an invlpg
relative to the page the mmu notifier is working on that sets
remote_tlbs_dirty after mmu_lock is released in the mmu notifier
method, so as long as the remote_tlbs_dirty bit isn't lost we're
ok. We basically have remote_tlbs_dirty being set at arbitrary points
from a mmu_lock protected section, and the mmu notifier only has to
make sure that after clearing it, it always flushes (so the only race
is if we flush before clearing the flag).

However the setting must be done with set_bit and the clearing with
test_and_clear_bit, otherwise when one cpu sets it inside a mmu_lock
protected section and another cpu clears it without any lock held from
the mmu notifier, the result is undefined. AFAIK, without a lock prefix
on the mov instruction, what can happen is something like this:

cpu0		     	 cpu1
--------	     	 ---------
remote_tlbs_dirty = 0
			 remote_tlbs_dirty = 1
remote_tlbs_dirty == 0

I don't think this only applies to bitops; it would apply more
obviously to bitops, but I think it also applies to a plain mov.
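
A sketch of the atomic-bit scheme described above (illustrative only, not the
posted patch; the tlbs_dirty_bits field and the KVM_TLBS_DIRTY bit index are
hypothetical names):

	/* invlpg path, under mmu_lock: mark remote tlbs stale */
	set_bit(KVM_TLBS_DIRTY, &kvm->tlbs_dirty_bits);

	/* mmu notifier path, after dropping mmu_lock: consume the mark atomically */
	if (need_tlb_flush ||
	    test_and_clear_bit(KVM_TLBS_DIRTY, &kvm->tlbs_dirty_bits))
		kvm_flush_remote_tlbs(kvm);

Because both sides use locked read-modify-write operations, the lock-free
clear cannot overwrite a concurrent set done under mmu_lock, which is exactly
the lost update shown in the diagram above.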
Avi Kivity March 19, 2009, 10:27 a.m. UTC | #5
Andrea Arcangeli wrote:
>>  static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
>>  {
>> +	bool need_flush;
>> +
>>  	if (sp->role.glevels != vcpu->arch.mmu.root_level) {
>>  		kvm_mmu_zap_page(vcpu->kvm, sp);
>>  		return 1;
>>  	}
>>  
>> -	if (rmap_write_protect(vcpu->kvm, sp->gfn))
>> -		kvm_flush_remote_tlbs(vcpu->kvm);
>> +	need_flush = rmap_write_protect(vcpu->kvm, sp->gfn);
>> +	kvm_flush_remote_tlbs_cond(vcpu->kvm, need_flush);
>>     
>
> Why exactly do we need to flush remote and local tlbs here (in the cr3
> overwrite path, which is a local flush) if no change happened to sptes
> related to the current process? How is it relevant that a previous
> invlpg has run and we only did a local flush at the time the invlpg ran?
>   

An invlpg on a remote cpu may not have done a local tlb flush because
the spte was not present. So we have a poisoned tlb remotely, which
needs to be flushed when protecting pages.

However a better fix for this is to make the local tlb flush on invlpg 
unconditional.
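
A sketch of that alternative for the FNAME(invlpg) tail (not part of this
patch): the local flush no longer depends on need_flush, while the remote
deferral mark still does.

	if (need_flush)
		vcpu->kvm->remote_tlbs_dirty = true;
	kvm_x86_ops->tlb_flush(vcpu);	/* local flush, now unconditional */
	spin_unlock(&vcpu->kvm->mmu_lock);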

(historical note - the kvm mmu used to depend on the shadow page tables 
and guest page tables being consistent, so we were pretty paranoid about 
guest tlb management bugs (or exploits) killing the host. but with the 
gfn tables maintained in shadow pages, that's no longer the case)

> Same here.
>   
> Same here.
>
>   

Same explanations apply.

>> @@ -475,8 +475,10 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
>>  			break;
>>  	}
>>  
>> -	if (need_flush)
>> -		kvm_flush_remote_tlbs(vcpu->kvm);
>> +	if (need_flush) {
>> +		vcpu->kvm->remote_tlbs_dirty = true;
>> +		kvm_x86_ops->tlb_flush(vcpu);
>> +	}
>>  	spin_unlock(&vcpu->kvm->mmu_lock);
>>  
>>  	if (pte_gpa == -1)
>>     
>
> I think it'd be more consistent to set_bit(KVM_REQ_TLB_FLUSH,
> &vcpu->requests) instead of invoking tlb_flush from a different place.
>   

Agreed; it can also fold an IPI, as the remote cpu will notice the
request bit is set.

>> @@ -758,10 +758,22 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
>>  
>>  void kvm_flush_remote_tlbs(struct kvm *kvm)
>>  {
>> +	kvm->remote_tlbs_dirty = false;
>> +	wmb();
>>  	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
>>  		++kvm->stat.remote_tlb_flush;
>>  }
>>  
>> +void kvm_flush_remote_tlbs_cond(struct kvm *kvm, bool cond)
>> +{
>> +	if (!cond) {
>> +		rmb();
>> +		cond = kvm->remote_tlbs_dirty;
>> +	}
>> +	if (cond)
>> +		kvm_flush_remote_tlbs(kvm);
>> +}
>> +
>>  void kvm_reload_remote_mmus(struct kvm *kvm)
>>  {
>>  	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
>> @@ -840,8 +852,9 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
>>  	need_tlb_flush = kvm_unmap_hva(kvm, address);
>>  	spin_unlock(&kvm->mmu_lock);
>>  
>> +	rmb();
>>  	/* we've to flush the tlb before the pages can be freed */
>> -	if (need_tlb_flush)
>> +	if (need_tlb_flush || kvm->remote_tlbs_dirty)
>>  		kvm_flush_remote_tlbs(kvm);
>>  
>>  }
>> @@ -865,8 +878,9 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
>>  		need_tlb_flush |= kvm_unmap_hva(kvm, start);
>>  	spin_unlock(&kvm->mmu_lock);
>>  
>> +	rmb();
>>  	/* we've to flush the tlb before the pages can be freed */
>> -	if (need_tlb_flush)
>> +	if (need_tlb_flush || kvm->remote_tlbs_dirty)
>>  		kvm_flush_remote_tlbs(kvm);
>>  }
>>     
>
> I think the only memory barrier required is a smp_mb() between setting
> remote_tlbs_dirty true and make_all_cpus_request.
>
> All other memory barriers seem unnecessary. It can't be an invlpg
> relative to the page the mmu notifier is working on that sets
> remote_tlbs_dirty after mmu_lock is released in the mmu notifier
> method, so as long as the remote_tlbs_dirty bit isn't lost we're
> ok. We basically have remote_tlbs_dirty being set at arbitrary points
> from a mmu_lock protected section, and the mmu notifier only has to
> make sure that after clearing it, it always flushes (so the only race
> is if we flush before clearing the flag).
>
> However the setting must be done with set_bit and the clearing with
> test_and_clear_bit, otherwise when one cpu sets it inside a mmu_lock
> protected section and another cpu clears it without any lock held from
> the mmu notifier, the result is undefined. AFAIK, without a lock prefix
> on the mov instruction, what can happen is something like this:
>
> cpu0		     	 cpu1
> --------	     	 ---------
> remote_tlbs_dirty = 0
> 			 remote_tlbs_dirty = 1
> remote_tlbs_dirty == 0
>
> I don't think this only applies to bitops; it would apply more
> obviously to bitops, but I think it also applies to a plain mov.
>   

I think we no longer need remote_tlbs_dirty. I'll send a new version.
Avi Kivity March 19, 2009, 10:29 a.m. UTC | #6
Avi Kivity wrote:
>
> I think we no longer need remote_tlbs_dirty. I'll send a new version.
>

Oh, we do, for the main use case on mmu notifiers.

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4627627..eb1ab29 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -383,7 +383,6 @@  struct kvm_mem_alias {
 struct kvm_arch{
 	int naliases;
 	struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
-
 	unsigned int n_free_mmu_pages;
 	unsigned int n_requested_mmu_pages;
 	unsigned int n_alloc_mmu_pages;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2a36f7f..18bcee5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1093,13 +1093,15 @@  static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 
 static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
+	bool need_flush;
+
 	if (sp->role.glevels != vcpu->arch.mmu.root_level) {
 		kvm_mmu_zap_page(vcpu->kvm, sp);
 		return 1;
 	}
 
-	if (rmap_write_protect(vcpu->kvm, sp->gfn))
-		kvm_flush_remote_tlbs(vcpu->kvm);
+	need_flush = rmap_write_protect(vcpu->kvm, sp->gfn);
+	kvm_flush_remote_tlbs_cond(vcpu->kvm, need_flush);
 	kvm_unlink_unsync_page(vcpu->kvm, sp);
 	if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
 		kvm_mmu_zap_page(vcpu->kvm, sp);
@@ -1184,8 +1186,7 @@  static void mmu_sync_children(struct kvm_vcpu *vcpu,
 		for_each_sp(pages, sp, parents, i)
 			protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
 
-		if (protected)
-			kvm_flush_remote_tlbs(vcpu->kvm);
+		kvm_flush_remote_tlbs_cond(vcpu->kvm, protected);
 
 		for_each_sp(pages, sp, parents, i) {
 			kvm_sync_page(vcpu, sp);
@@ -1210,6 +1211,7 @@  static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	struct hlist_head *bucket;
 	struct kvm_mmu_page *sp;
 	struct hlist_node *node, *tmp;
+	bool need_flush;
 
 	role = vcpu->arch.mmu.base_role;
 	role.level = level;
@@ -1251,8 +1253,8 @@  static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	sp->global = role.cr4_pge;
 	hlist_add_head(&sp->hash_link, bucket);
 	if (!direct) {
-		if (rmap_write_protect(vcpu->kvm, gfn))
-			kvm_flush_remote_tlbs(vcpu->kvm);
+		need_flush = rmap_write_protect(vcpu->kvm, gfn);
+		kvm_flush_remote_tlbs_cond(vcpu->kvm, need_flush);
 		account_shadowed(vcpu->kvm, gfn);
 	}
 	if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 855eb71..18abdf9 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -475,8 +475,10 @@  static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 			break;
 	}
 
-	if (need_flush)
-		kvm_flush_remote_tlbs(vcpu->kvm);
+	if (need_flush) {
+		vcpu->kvm->remote_tlbs_dirty = true;
+		kvm_x86_ops->tlb_flush(vcpu);
+	}
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	if (pte_gpa == -1)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 11eb702..b779c57 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -125,6 +125,7 @@  struct kvm_kernel_irq_routing_entry {
 struct kvm {
 	struct mutex lock; /* protects the vcpus array and APIC accesses */
 	spinlock_t mmu_lock;
+	bool remote_tlbs_dirty;
 	struct rw_semaphore slots_lock;
 	struct mm_struct *mm; /* userspace tied to this vm */
 	int nmemslots;
@@ -235,6 +236,7 @@  void kvm_resched(struct kvm_vcpu *vcpu);
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
 void kvm_flush_remote_tlbs(struct kvm *kvm);
+void kvm_flush_remote_tlbs_cond(struct kvm *kvm, bool cond);
 void kvm_reload_remote_mmus(struct kvm *kvm);
 
 long kvm_arch_dev_ioctl(struct file *filp,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 68b217e..2ee6a6d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -758,10 +758,22 @@  static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 
 void kvm_flush_remote_tlbs(struct kvm *kvm)
 {
+	kvm->remote_tlbs_dirty = false;
+	wmb();
 	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
 		++kvm->stat.remote_tlb_flush;
 }
 
+void kvm_flush_remote_tlbs_cond(struct kvm *kvm, bool cond)
+{
+	if (!cond) {
+		rmb();
+		cond = kvm->remote_tlbs_dirty;
+	}
+	if (cond)
+		kvm_flush_remote_tlbs(kvm);
+}
+
 void kvm_reload_remote_mmus(struct kvm *kvm)
 {
 	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
@@ -840,8 +852,9 @@  static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
 	need_tlb_flush = kvm_unmap_hva(kvm, address);
 	spin_unlock(&kvm->mmu_lock);
 
+	rmb();
 	/* we've to flush the tlb before the pages can be freed */
-	if (need_tlb_flush)
+	if (need_tlb_flush || kvm->remote_tlbs_dirty)
 		kvm_flush_remote_tlbs(kvm);
 
 }
@@ -865,8 +878,9 @@  static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 		need_tlb_flush |= kvm_unmap_hva(kvm, start);
 	spin_unlock(&kvm->mmu_lock);
 
+	rmb();
 	/* we've to flush the tlb before the pages can be freed */
-	if (need_tlb_flush)
+	if (need_tlb_flush || kvm->remote_tlbs_dirty)
 		kvm_flush_remote_tlbs(kvm);
 }