
KVM: Defer remote tlb flushes on invlpg (v3)

Message ID 1237468666-30700-1-git-send-email-avi@redhat.com (mailing list archive)
State New, archived

Commit Message

Avi Kivity March 19, 2009, 1:17 p.m. UTC
KVM currently flushes the tlbs on all cpus when emulating invlpg.  This
is because at the time of invlpg we lose track of the page, and leaving
stale tlb entries could cause the guest to access the page when it is
later freed (say after being swapped out).

However, we have a second chance to flush the tlbs: when an mmu notifier is
called to let us know the host pte has been invalidated.  We can safely
defer the flush to this point, which occurs much less frequently.  Of course,
we still do a local tlb flush when emulating invlpg.

Signed-off-by: Avi Kivity <avi@redhat.com>
---

Changes from v2:
- dropped remote flushes from guest pagetable write protect paths
- fixed up memory barriers
- use existing local tlb flush in invlpg, no need to add another one

 arch/x86/kvm/mmu.c         |    3 +--
 arch/x86/kvm/paging_tmpl.h |    5 +----
 include/linux/kvm_host.h   |    2 ++
 virt/kvm/kvm_main.c        |   17 +++++++++++------
 4 files changed, 15 insertions(+), 12 deletions(-)
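
For orientation, here is a condensed sketch of the deferred-flush protocol
the patch implements, pulled together from the hunks below.  invlpg_sketch
is an illustrative stand-in for the FNAME(invlpg) path; locking details and
unrelated code are elided:

static void invlpg_sketch(struct kvm_vcpu *vcpu, gva_t gva)
{
	spin_lock(&vcpu->kvm->mmu_lock);
	/* ... zap the shadow pte backing gva ... */
	vcpu->kvm->remote_tlbs_dirty = true;	/* defer the remote flush */
	spin_unlock(&vcpu->kvm->mmu_lock);
	/* the local tlb is flushed via the existing invlpg path */
}

void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	kvm->remote_tlbs_dirty = false;	/* this flush covers any deferred work */
	smp_wmb();			/* clear the flag before the flush request */
	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
		++kvm->stat.remote_tlb_flush;
}

/* write-protect and mmu notifier paths: flush if needed here or deferred earlier */
void kvm_flush_remote_tlbs_cond(struct kvm *kvm, bool cond)
{
	if (cond || kvm->remote_tlbs_dirty)
		kvm_flush_remote_tlbs(kvm);
}

The net effect is that invlpg only pays for a local flush, while the paths
that must flush anyway before a host page can be freed pick up any deferred
remote flush through the dirty flag.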

Comments

Andrew Theurer March 19, 2009, 1:46 p.m. UTC | #1
Avi Kivity wrote:
> KVM currently flushes the tlbs on all cpus when emulating invlpg.  This
> is because at the time of invlpg we lose track of the page, and leaving
> stale tlb entries could cause the guest to access the page when it is
> later freed (say after being swapped out).
>
> However, we have a second chance to flush the tlbs: when an mmu notifier is
> called to let us know the host pte has been invalidated.  We can safely
> defer the flush to this point, which occurs much less frequently.  Of course,
> we still do a local tlb flush when emulating invlpg.
>   
I should be able to run some performance comparisons with this in the 
next day or two.

-Andrew
> Signed-off-by: Avi Kivity <avi@redhat.com>
> ---
>
> Changes from v2:
> - dropped remote flushes from guest pagetable write protect paths
> - fixed up memory barriers
> - use existing local tlb flush in invlpg, no need to add another one
>
>  arch/x86/kvm/mmu.c         |    3 +--
>  arch/x86/kvm/paging_tmpl.h |    5 +----
>  include/linux/kvm_host.h   |    2 ++
>  virt/kvm/kvm_main.c        |   17 +++++++++++------
>  4 files changed, 15 insertions(+), 12 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 2a36f7f..f0ea56c 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1184,8 +1184,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
>  		for_each_sp(pages, sp, parents, i)
>  			protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
>
> -		if (protected)
> -			kvm_flush_remote_tlbs(vcpu->kvm);
> +		kvm_flush_remote_tlbs_cond(vcpu->kvm, protected);
>
>  		for_each_sp(pages, sp, parents, i) {
>  			kvm_sync_page(vcpu, sp);
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index 855eb71..2273b26 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -445,7 +445,6 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
>  	gpa_t pte_gpa = -1;
>  	int level;
>  	u64 *sptep;
> -	int need_flush = 0;
>
>  	spin_lock(&vcpu->kvm->mmu_lock);
>
> @@ -465,7 +464,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
>  				rmap_remove(vcpu->kvm, sptep);
>  				if (is_large_pte(*sptep))
>  					--vcpu->kvm->stat.lpages;
> -				need_flush = 1;
> +				vcpu->kvm->remote_tlbs_dirty = true;
>  			}
>  			set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
>  			break;
> @@ -475,8 +474,6 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
>  			break;
>  	}
>
> -	if (need_flush)
> -		kvm_flush_remote_tlbs(vcpu->kvm);
>  	spin_unlock(&vcpu->kvm->mmu_lock);
>
>  	if (pte_gpa == -1)
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 11eb702..b779c57 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -125,6 +125,7 @@ struct kvm_kernel_irq_routing_entry {
>  struct kvm {
>  	struct mutex lock; /* protects the vcpus array and APIC accesses */
>  	spinlock_t mmu_lock;
> +	bool remote_tlbs_dirty;
>  	struct rw_semaphore slots_lock;
>  	struct mm_struct *mm; /* userspace tied to this vm */
>  	int nmemslots;
> @@ -235,6 +236,7 @@ void kvm_resched(struct kvm_vcpu *vcpu);
>  void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
>  void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
>  void kvm_flush_remote_tlbs(struct kvm *kvm);
> +void kvm_flush_remote_tlbs_cond(struct kvm *kvm, bool cond);
>  void kvm_reload_remote_mmus(struct kvm *kvm);
>
>  long kvm_arch_dev_ioctl(struct file *filp,
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 68b217e..12afa50 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -758,10 +758,18 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
>
>  void kvm_flush_remote_tlbs(struct kvm *kvm)
>  {
> +	kvm->remote_tlbs_dirty = false;
> +	smp_wmb();
>  	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
>  		++kvm->stat.remote_tlb_flush;
>  }
>
> +void kvm_flush_remote_tlbs_cond(struct kvm *kvm, bool cond)
> +{
> +	if (cond || kvm->remote_tlbs_dirty)
> +		kvm_flush_remote_tlbs(kvm);
> +}
> +
>  void kvm_reload_remote_mmus(struct kvm *kvm)
>  {
>  	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
> @@ -841,8 +849,7 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
>  	spin_unlock(&kvm->mmu_lock);
>
>  	/* we've to flush the tlb before the pages can be freed */
> -	if (need_tlb_flush)
> -		kvm_flush_remote_tlbs(kvm);
> +	kvm_flush_remote_tlbs_cond(kvm, need_tlb_flush);
>
>  }
>
> @@ -866,8 +873,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
>  	spin_unlock(&kvm->mmu_lock);
>
>  	/* we've to flush the tlb before the pages can be freed */
> -	if (need_tlb_flush)
> -		kvm_flush_remote_tlbs(kvm);
> +	kvm_flush_remote_tlbs_cond(kvm, need_tlb_flush);
>  }
>
>  static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
> @@ -907,8 +913,7 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
>  	young = kvm_age_hva(kvm, address);
>  	spin_unlock(&kvm->mmu_lock);
>
> -	if (young)
> -		kvm_flush_remote_tlbs(kvm);
> +	kvm_flush_remote_tlbs_cond(kvm, young);
>
>  	return young;
>  }
>   


Avi Kivity March 19, 2009, 2:03 p.m. UTC | #2
Andrew Theurer wrote:
> Avi Kivity wrote:
>> KVM currently flushes the tlbs on all cpus when emulating invlpg.  This
>> is because at the time of invlpg we lose track of the page, and leaving
>> stale tlb entries could cause the guest to access the page when it is
>> later freed (say after being swapped out).
>>
>> However, we have a second chance to flush the tlbs: when an mmu
>> notifier is called to let us know the host pte has been invalidated.
>> We can safely defer the flush to this point, which occurs much less
>> frequently.  Of course, we still do a local tlb flush when emulating
>> invlpg.
>>   
> I should be able to run some performance comparisons with this in the 
> next day or two.

Excellent.  Note that this does not improve performance relative to
released versions of kvm; rather, it undoes a performance regression
caused by 967f61 ("KVM: Fix missing smp tlb flush in invlpg"), which
fixes a memory corruption problem.

The workloads which will exercise this are mmu-intensive smp workloads
with CONFIG_HIGHMEM (or CONFIG_HIGHMEM64G) guests; 32-bit RHEL 3 is a
pretty bad offender.
Avi Kivity March 29, 2009, 10:36 a.m. UTC | #3
Marcelo, Andrea?

Avi Kivity wrote:
> KVM currently flushes the tlbs on all cpus when emulating invlpg.  This
> is because at the time of invlpg we lose track of the page, and leaving
> stale tlb entries could cause the guest to access the page when it is
> later freed (say after being swapped out).
>
> However, we have a second chance to flush the tlbs: when an mmu notifier is
> called to let us know the host pte has been invalidated.  We can safely
> defer the flush to this point, which occurs much less frequently.  Of course,
> we still do a local tlb flush when emulating invlpg.
>
> Signed-off-by: Avi Kivity <avi@redhat.com>
> ---
>
> Changes from v2:
> - dropped remote flushes from guest pagetable write protect paths
> - fixed up memory barriers
> - use existing local tlb flush in invlpg, no need to add another one
>
>  arch/x86/kvm/mmu.c         |    3 +--
>  arch/x86/kvm/paging_tmpl.h |    5 +----
>  include/linux/kvm_host.h   |    2 ++
>  virt/kvm/kvm_main.c        |   17 +++++++++++------
>  4 files changed, 15 insertions(+), 12 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 2a36f7f..f0ea56c 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1184,8 +1184,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
>  		for_each_sp(pages, sp, parents, i)
>  			protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
>  
> -		if (protected)
> -			kvm_flush_remote_tlbs(vcpu->kvm);
> +		kvm_flush_remote_tlbs_cond(vcpu->kvm, protected);
>  
>  		for_each_sp(pages, sp, parents, i) {
>  			kvm_sync_page(vcpu, sp);
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index 855eb71..2273b26 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -445,7 +445,6 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
>  	gpa_t pte_gpa = -1;
>  	int level;
>  	u64 *sptep;
> -	int need_flush = 0;
>  
>  	spin_lock(&vcpu->kvm->mmu_lock);
>  
> @@ -465,7 +464,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
>  				rmap_remove(vcpu->kvm, sptep);
>  				if (is_large_pte(*sptep))
>  					--vcpu->kvm->stat.lpages;
> -				need_flush = 1;
> +				vcpu->kvm->remote_tlbs_dirty = true;
>  			}
>  			set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
>  			break;
> @@ -475,8 +474,6 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
>  			break;
>  	}
>  
> -	if (need_flush)
> -		kvm_flush_remote_tlbs(vcpu->kvm);
>  	spin_unlock(&vcpu->kvm->mmu_lock);
>  
>  	if (pte_gpa == -1)
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 11eb702..b779c57 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -125,6 +125,7 @@ struct kvm_kernel_irq_routing_entry {
>  struct kvm {
>  	struct mutex lock; /* protects the vcpus array and APIC accesses */
>  	spinlock_t mmu_lock;
> +	bool remote_tlbs_dirty;
>  	struct rw_semaphore slots_lock;
>  	struct mm_struct *mm; /* userspace tied to this vm */
>  	int nmemslots;
> @@ -235,6 +236,7 @@ void kvm_resched(struct kvm_vcpu *vcpu);
>  void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
>  void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
>  void kvm_flush_remote_tlbs(struct kvm *kvm);
> +void kvm_flush_remote_tlbs_cond(struct kvm *kvm, bool cond);
>  void kvm_reload_remote_mmus(struct kvm *kvm);
>  
>  long kvm_arch_dev_ioctl(struct file *filp,
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 68b217e..12afa50 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -758,10 +758,18 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
>  
>  void kvm_flush_remote_tlbs(struct kvm *kvm)
>  {
> +	kvm->remote_tlbs_dirty = false;
> +	smp_wmb();
>  	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
>  		++kvm->stat.remote_tlb_flush;
>  }
>  
> +void kvm_flush_remote_tlbs_cond(struct kvm *kvm, bool cond)
> +{
> +	if (cond || kvm->remote_tlbs_dirty)
> +		kvm_flush_remote_tlbs(kvm);
> +}
> +
>  void kvm_reload_remote_mmus(struct kvm *kvm)
>  {
>  	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
> @@ -841,8 +849,7 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
>  	spin_unlock(&kvm->mmu_lock);
>  
>  	/* we've to flush the tlb before the pages can be freed */
> -	if (need_tlb_flush)
> -		kvm_flush_remote_tlbs(kvm);
> +	kvm_flush_remote_tlbs_cond(kvm, need_tlb_flush);
>  
>  }
>  
> @@ -866,8 +873,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
>  	spin_unlock(&kvm->mmu_lock);
>  
>  	/* we've to flush the tlb before the pages can be freed */
> -	if (need_tlb_flush)
> -		kvm_flush_remote_tlbs(kvm);
> +	kvm_flush_remote_tlbs_cond(kvm, need_tlb_flush);
>  }
>  
>  static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
> @@ -907,8 +913,7 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
>  	young = kvm_age_hva(kvm, address);
>  	spin_unlock(&kvm->mmu_lock);
>  
> -	if (young)
> -		kvm_flush_remote_tlbs(kvm);
> +	kvm_flush_remote_tlbs_cond(kvm, young);
>  
>  	return young;
>  }
>

Patch

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2a36f7f..f0ea56c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1184,8 +1184,7 @@  static void mmu_sync_children(struct kvm_vcpu *vcpu,
 		for_each_sp(pages, sp, parents, i)
 			protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
 
-		if (protected)
-			kvm_flush_remote_tlbs(vcpu->kvm);
+		kvm_flush_remote_tlbs_cond(vcpu->kvm, protected);
 
 		for_each_sp(pages, sp, parents, i) {
 			kvm_sync_page(vcpu, sp);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 855eb71..2273b26 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -445,7 +445,6 @@  static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 	gpa_t pte_gpa = -1;
 	int level;
 	u64 *sptep;
-	int need_flush = 0;
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 
@@ -465,7 +464,7 @@  static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 				rmap_remove(vcpu->kvm, sptep);
 				if (is_large_pte(*sptep))
 					--vcpu->kvm->stat.lpages;
-				need_flush = 1;
+				vcpu->kvm->remote_tlbs_dirty = true;
 			}
 			set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
 			break;
@@ -475,8 +474,6 @@  static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 			break;
 	}
 
-	if (need_flush)
-		kvm_flush_remote_tlbs(vcpu->kvm);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	if (pte_gpa == -1)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 11eb702..b779c57 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -125,6 +125,7 @@  struct kvm_kernel_irq_routing_entry {
 struct kvm {
 	struct mutex lock; /* protects the vcpus array and APIC accesses */
 	spinlock_t mmu_lock;
+	bool remote_tlbs_dirty;
 	struct rw_semaphore slots_lock;
 	struct mm_struct *mm; /* userspace tied to this vm */
 	int nmemslots;
@@ -235,6 +236,7 @@  void kvm_resched(struct kvm_vcpu *vcpu);
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
 void kvm_flush_remote_tlbs(struct kvm *kvm);
+void kvm_flush_remote_tlbs_cond(struct kvm *kvm, bool cond);
 void kvm_reload_remote_mmus(struct kvm *kvm);
 
 long kvm_arch_dev_ioctl(struct file *filp,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 68b217e..12afa50 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -758,10 +758,18 @@  static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 
 void kvm_flush_remote_tlbs(struct kvm *kvm)
 {
+	kvm->remote_tlbs_dirty = false;
+	smp_wmb();
 	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
 		++kvm->stat.remote_tlb_flush;
 }
 
+void kvm_flush_remote_tlbs_cond(struct kvm *kvm, bool cond)
+{
+	if (cond || kvm->remote_tlbs_dirty)
+		kvm_flush_remote_tlbs(kvm);
+}
+
 void kvm_reload_remote_mmus(struct kvm *kvm)
 {
 	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
@@ -841,8 +849,7 @@  static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
 	spin_unlock(&kvm->mmu_lock);
 
 	/* we've to flush the tlb before the pages can be freed */
-	if (need_tlb_flush)
-		kvm_flush_remote_tlbs(kvm);
+	kvm_flush_remote_tlbs_cond(kvm, need_tlb_flush);
 
 }
 
@@ -866,8 +873,7 @@  static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	spin_unlock(&kvm->mmu_lock);
 
 	/* we've to flush the tlb before the pages can be freed */
-	if (need_tlb_flush)
-		kvm_flush_remote_tlbs(kvm);
+	kvm_flush_remote_tlbs_cond(kvm, need_tlb_flush);
 }
 
 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
@@ -907,8 +913,7 @@  static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 	young = kvm_age_hva(kvm, address);
 	spin_unlock(&kvm->mmu_lock);
 
-	if (young)
-		kvm_flush_remote_tlbs(kvm);
+	kvm_flush_remote_tlbs_cond(kvm, young);
 
 	return young;
 }