
[3/3] add support for change_pte mmu notifiers

Message ID 1253731638-24575-4-git-send-email-ieidus@redhat.com (mailing list archive)
State New, archived

Commit Message

Izik Eidus Sept. 23, 2009, 6:47 p.m. UTC
This is needed by kvm if it wants ksm to directly map pages into its
shadow page tables.

Signed-off-by: Izik Eidus <ieidus@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |    1 +
 arch/x86/kvm/mmu.c              |   62 +++++++++++++++++++++++++++++++++-----
 virt/kvm/kvm_main.c             |   14 +++++++++
 3 files changed, 68 insertions(+), 9 deletions(-)
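
For context, a rough sketch of the path this hooks into, assuming the
set_pte_at_notify()/mmu_notifier_change_pte() hooks from the earlier
patches in this series: when ksm (or do_wp_page) replaces a page and
rewrites the pte, the change_pte notifier fires and kvm updates the
matching sptes in place instead of zapping them and faulting them back
in later.

    ksm page replacement / do_wp_page()
      -> set_pte_at_notify(mm, addr, ptep, newpte)
           -> mmu_notifier_change_pte(mm, addr, newpte)
                -> kvm_mmu_notifier_change_pte()              (virt/kvm/kvm_main.c)
                     -> kvm_set_spte_hva(kvm, addr, newpte)   (arch/x86/kvm/mmu.c)
                          -> kvm_handle_hva(kvm, hva, (u64)&pte, kvm_set_pte_rmapp)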

Comments

Marcelo Tosatti Sept. 24, 2009, 2:55 p.m. UTC | #1
On Wed, Sep 23, 2009 at 09:47:18PM +0300, Izik Eidus wrote:
> This is needed by kvm if it wants ksm to directly map pages into its
> shadow page tables.
> 
> Signed-off-by: Izik Eidus <ieidus@redhat.com>
> ---
>  arch/x86/include/asm/kvm_host.h |    1 +
>  arch/x86/kvm/mmu.c              |   62 +++++++++++++++++++++++++++++++++-----
>  virt/kvm/kvm_main.c             |   14 +++++++++
>  3 files changed, 68 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3be0004..d838922 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -796,6 +796,7 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
>  #define KVM_ARCH_WANT_MMU_NOTIFIER
>  int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
>  int kvm_age_hva(struct kvm *kvm, unsigned long hva);
> +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
>  int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
>  int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
>  int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 5cd8b4e..ceec065 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -748,7 +748,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
>  	return write_protected;
>  }
>  
> -static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
> +static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data)
>  {
>  	u64 *spte;
>  	int need_tlb_flush = 0;
> @@ -763,8 +763,45 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
>  	return need_tlb_flush;
>  }
>  
> -static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
> -			  int (*handler)(struct kvm *kvm, unsigned long *rmapp))
> +static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data)
> +{
> +	int need_flush = 0;
> +	u64 *spte, new_spte;
> +	pte_t *ptep = (pte_t *)data;
> +	pfn_t new_pfn;
> +
> +	WARN_ON(pte_huge(*ptep));
> +	new_pfn = pte_pfn(*ptep);
> +	spte = rmap_next(kvm, rmapp, NULL);
> +	while (spte) {
> +		BUG_ON(!is_shadow_present_pte(*spte));
> +		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
> +		need_flush = 1;
> +		if (pte_write(*ptep)) {
> +			rmap_remove(kvm, spte);
> +			__set_spte(spte, shadow_trap_nonpresent_pte);
> +			spte = rmap_next(kvm, rmapp, NULL);
> +		} else {
> +			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
> +			new_spte |= new_pfn << PAGE_SHIFT;

		        new_spte |= (u64)new_pfn << PAGE_SHIFT;

Otherwise looks good to me.

> +			new_spte &= ~PT_WRITABLE_MASK;
> +			new_spte &= ~SPTE_HOST_WRITEABLE;
> +			if (is_writeble_pte(*spte))
> +				kvm_set_pfn_dirty(spte_to_pfn(*spte));
> +			__set_spte(spte, new_spte);
> +			spte = rmap_next(kvm, rmapp, spte);
> +		}
> +	}
> +	if (need_flush)
> +		kvm_flush_remote_tlbs(kvm);
> +
> +	return 0;
> +}
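
A minimal user-space sketch of the truncation the suggested (u64) cast
avoids, assuming a pfn type narrower than 64 bits (e.g. on a 32-bit
host): the shift is evaluated in the width of new_pfn, so physical
address bits above 4GB are lost unless the pfn is widened to u64 before
shifting.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12

int main(void)
{
	uint32_t pfn = 0x150000;	/* page frame above the 4GB boundary */

	uint64_t wrong = pfn << PAGE_SHIFT;		/* shift done in 32 bits, high bits lost */
	uint64_t right = (uint64_t)pfn << PAGE_SHIFT;	/* widened before the shift */

	printf("wrong: 0x%llx\n", (unsigned long long)wrong);	/* 0x50000000 */
	printf("right: 0x%llx\n", (unsigned long long)right);	/* 0x150000000 */
	return 0;
}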
Andrea Arcangeli Sept. 24, 2009, 5:11 p.m. UTC | #2
On Wed, Sep 23, 2009 at 09:47:18PM +0300, Izik Eidus wrote:
> +	if (need_flush)
> +		kvm_flush_remote_tlbs(kvm);

need_flush could be returned to kvm_mmu_notifier_change_pte to defer the
tlb flush until after the spin lock is dropped, I think. In normal kvm
context we are forced to flush the tlb inside the spin_lock, because that
is what stops the VM from freeing the page (it hangs on the mmu_lock
taken by kvm invalidate_page/change_pte), so we can unmap tons of sptes
and do a single kvm tlb flush that covers them all (by keeping both
actions under the mmu_lock). But in mmu notifier context the pages can't
be freed from under the guest, so we can do the tlb flush before the page
becomes freeable: both the old and the new page in do_wp_page are still
pinned and can't be freed and reused from under us, even if we release
the mmu_lock before the tlb flush.
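
A rough sketch of what that deferral could look like, assuming
kvm_set_spte_hva() (and kvm_set_pte_rmapp() underneath it) were changed
to return need_flush instead of flushing under the lock; this is not
what the posted patch does:

static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long address,
					pte_t pte)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int need_flush;

	spin_lock(&kvm->mmu_lock);
	kvm->mmu_notifier_seq++;
	need_flush = kvm_set_spte_hva(kvm, address, pte);
	spin_unlock(&kvm->mmu_lock);

	/* Flushing outside mmu_lock is safe here: the old and new pages
	 * in do_wp_page are still pinned, so neither can be freed and
	 * reused before the flush completes. */
	if (need_flush)
		kvm_flush_remote_tlbs(kvm);
}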

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3be0004..d838922 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -796,6 +796,7 @@  asmlinkage void kvm_handle_fault_on_reboot(void);
 #define KVM_ARCH_WANT_MMU_NOTIFIER
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5cd8b4e..ceec065 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -748,7 +748,7 @@  static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 	return write_protected;
 }
 
-static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data)
 {
 	u64 *spte;
 	int need_tlb_flush = 0;
@@ -763,8 +763,45 @@  static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
 	return need_tlb_flush;
 }
 
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
-			  int (*handler)(struct kvm *kvm, unsigned long *rmapp))
+static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data)
+{
+	int need_flush = 0;
+	u64 *spte, new_spte;
+	pte_t *ptep = (pte_t *)data;
+	pfn_t new_pfn;
+
+	WARN_ON(pte_huge(*ptep));
+	new_pfn = pte_pfn(*ptep);
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		BUG_ON(!is_shadow_present_pte(*spte));
+		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
+		need_flush = 1;
+		if (pte_write(*ptep)) {
+			rmap_remove(kvm, spte);
+			__set_spte(spte, shadow_trap_nonpresent_pte);
+			spte = rmap_next(kvm, rmapp, NULL);
+		} else {
+			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
+			new_spte |= new_pfn << PAGE_SHIFT;
+
+			new_spte &= ~PT_WRITABLE_MASK;
+			new_spte &= ~SPTE_HOST_WRITEABLE;
+			if (is_writeble_pte(*spte))
+				kvm_set_pfn_dirty(spte_to_pfn(*spte));
+			__set_spte(spte, new_spte);
+			spte = rmap_next(kvm, rmapp, spte);
+		}
+	}
+	if (need_flush)
+		kvm_flush_remote_tlbs(kvm);
+
+	return 0;
+}
+
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, u64 data,
+			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+					 u64 data))
 {
 	int i, j;
 	int retval = 0;
@@ -786,13 +823,15 @@  static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
 		if (hva >= start && hva < end) {
 			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
 
-			retval |= handler(kvm, &memslot->rmap[gfn_offset]);
+			retval |= handler(kvm, &memslot->rmap[gfn_offset],
+					  data);
 
 			for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
 				int idx = gfn_offset;
 				idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
 				retval |= handler(kvm,
-					&memslot->lpage_info[j][idx].rmap_pde);
+					&memslot->lpage_info[j][idx].rmap_pde,
+					data);
 			}
 		}
 	}
@@ -802,10 +841,15 @@  static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
 
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
 {
-	return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+	return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
+}
+
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+	kvm_handle_hva(kvm, hva, (u64)&pte, kvm_set_pte_rmapp);
 }
 
-static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 data)
 {
 	u64 *spte;
 	int young = 0;
@@ -841,13 +885,13 @@  static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 	gfn = unalias_gfn(vcpu->kvm, gfn);
 	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
 
-	kvm_unmap_rmapp(vcpu->kvm, rmapp);
+	kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
 	kvm_flush_remote_tlbs(vcpu->kvm);
 }
 
 int kvm_age_hva(struct kvm *kvm, unsigned long hva)
 {
-	return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
+	return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
 }
 
 #ifdef MMU_DEBUG
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 897bff3..4cbd396 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -851,6 +851,19 @@  static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
 
 }
 
+static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
+					struct mm_struct *mm,
+					unsigned long address,
+					pte_t pte)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+
+	spin_lock(&kvm->mmu_lock);
+	kvm->mmu_notifier_seq++;
+	kvm_set_spte_hva(kvm, address, pte);
+	spin_unlock(&kvm->mmu_lock);
+}
+
 static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 						    struct mm_struct *mm,
 						    unsigned long start,
@@ -930,6 +943,7 @@  static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
 	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
+	.change_pte		= kvm_mmu_notifier_change_pte,
 	.release		= kvm_mmu_notifier_release,
 };
 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */