[RFC] KVM: MMU: check host MMIO pfn with host cache type

Message ID 20171019093543.3lfdpdmkxjluaxqi@hz-desktop

Commit Message

Haozhong Zhang Oct. 19, 2017, 9:35 a.m. UTC
Hi Paolo and Guangrong,

I'm fixing an EPT memory type misconfiguration that occurs when a host
NVDIMM is mapped to a guest, and I'm not sure whether I'm approaching it
in the right way, so I'm looking for your input. The issue and my fix
are described below.

When QEMU uses a DAX device (e.g. /dev/dax0.0 for NVDIMM) as the
backend of a vNVDIMM, e.g. with the following QEMU options:

  -machine pc,accel=kvm,nvdimm \
  -m 4G,slots=4,maxmem=32G \
  -object memory-backend-file,id=mem0,share,mem-path=/dev/dax0.0,size=16G \
  -device nvdimm,id=nv0,memdev=mem0

KVM maps the backing host NVDIMM pages to the guest in EPT (assuming
EPT is used) with memory type UC in the EPT entries, whereas WB is
expected. This memory type misconfiguration results in poor performance
when the guest accesses the vNVDIMM.

The root cause of the misconfiguration is that the host NVDIMM pages are
reserved by the driver, so
 1. first, kvm_is_mmio_pfn(pfn) treats them as MMIO,
 2. then, based on the result of step 1, vmx_get_mt_mask() decides that
    the UC memory type should be used (a simplified sketch follows this
    list),
 3. and finally, set_spte() writes the wrong memory type into the EPT
    entries.
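
For reference, the VMX side of step 2 looks roughly like the sketch
below. This is a hand-simplified version of vmx_get_mt_mask(), with the
guest MTRR and IPAT handling elided, not the literal upstream code:

    /* Simplified sketch; guest MTRR and IPAT handling omitted. */
    static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
    {
        u8 cache;

        /*
         * Every pfn that kvm_is_mmio_pfn() flags as MMIO is mapped UC,
         * which is the branch the reserved NVDIMM pages end up in.
         */
        if (is_mmio)
            cache = MTRR_TYPE_UNCACHABLE;
        else
            cache = MTRR_TYPE_WRBACK;

        return cache << VMX_EPT_MT_EPTE_SHIFT;
    }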

Dan suggested that I refer to
    void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn)
and its usage in __vm_insert_mixed() to infer the correct EPT memory
type (a simplified sketch of that usage follows).
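
The pattern there is roughly the following hand-simplified sketch of
what __vm_insert_mixed() does (not the literal mm/memory.c code; the
helper name is made up):

    /*
     * Rough sketch: start from the vma's protection and let PAT override
     * the cache-mode bits for 'pfn' via track_pfn_insert().
     */
    static int insert_mixed_sketch(struct vm_area_struct *vma,
                                   unsigned long addr, pfn_t pfn)
    {
        pgprot_t pgprot = vma->vm_page_prot;

        if (addr < vma->vm_start || addr >= vma->vm_end)
            return -EFAULT;

        /* Rewrites the cache bits in 'pgprot' to the tracked memtype. */
        track_pfn_insert(vma, &pgprot, pfn);

        /* ... the pfn is then inserted into the page table with 'pgprot' ... */
        return 0;
    }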

Accordingly, I made the draft patch attached below.

A function
     bool kvm_vcpu_gfn_to_pgprot(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn, pgprot_t *prot)
is added; it calls track_pfn_insert() to get the protection information
'prot' (including the page table cache type) of 'pfn', which is later
used by kvm_is_mmio_pfn() to determine the EPT memory type for
'pfn'. As vcpu->kvm->mm->mmap_sem is taken in this function, I have
to move its call site to before vcpu->kvm->mmu_lock is acquired in
tdp_page_fault().

However, because of the code path
    tdp_page_fault()
       __direct_map(..., pfn, ..., pfn_prot)  // w/ vcpu->kvm->mmu_lock acquired
           direct_pte_prefetch()              // w/ vcpu->kvm->mmu_lock acquired
pages other than 'pfn' might be mapped into EPT by
direct_pte_prefetch(), but their host page table cache types are not
looked up, so they still end up mapped with the UC memory type.

I considered bypassing direct_pte_prefetch() in __direct_map() when a
non-NULL pfn_prot is passed to __direct_map(), but that also disables
prefetch for normal RAM pages, which would hurt VM memory performance
IIUC.

Perhaps I'm fixing this in the wrong way. Do you have any suggestions?

Thanks,
Haozhong

Patch

From 4f7263b9dd12aad3c0ab3eab832933963e570dc5 Mon Sep 17 00:00:00 2001
From: Haozhong Zhang <haozhong.zhang@intel.com>
Date: Thu, 19 Oct 2017 08:58:27 +0800
Subject: [PATCH] KVM: MMU: check MMIO pages with host cache type

Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
---
 arch/x86/kvm/mmu.c       | 59 +++++++++++++++++++++++++++++++++++-------------
 arch/x86/mm/pat.c        |  1 +
 include/linux/kvm_host.h |  4 ++++
 virt/kvm/kvm_main.c      | 31 +++++++++++++++++++++++++
 4 files changed, 79 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0b481cc9c725..331ae74cbf08 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2705,18 +2705,20 @@  static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 	return false;
 }
 
-static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
+static bool kvm_is_mmio_pfn(kvm_pfn_t pfn, pgprot_t *prot)
 {
 	if (pfn_valid(pfn))
-		return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn));
+		return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
+		       (!prot || pgprot2cachemode(*prot) == _PAGE_CACHE_MODE_UC);
 
 	return true;
 }
 
-static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
+static int set_spte_prot(struct kvm_vcpu *vcpu, u64 *sptep,
 		    unsigned pte_access, int level,
 		    gfn_t gfn, kvm_pfn_t pfn, bool speculative,
-		    bool can_unsync, bool host_writable)
+		    bool can_unsync, bool host_writable,
+		    pgprot_t *pfn_prot)
 {
 	u64 spte = 0;
 	int ret = 0;
@@ -2751,7 +2753,7 @@  static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		spte |= PT_PAGE_SIZE_MASK;
 	if (tdp_enabled)
 		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
-			kvm_is_mmio_pfn(pfn));
+			kvm_is_mmio_pfn(pfn, pfn_prot));
 
 	if (host_writable)
 		spte |= SPTE_HOST_WRITEABLE;
@@ -2808,9 +2810,18 @@  static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	return ret;
 }
 
-static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
-			int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
-		       	bool speculative, bool host_writable)
+static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
+		    unsigned pte_access, int level,
+		    gfn_t gfn, kvm_pfn_t pfn, bool speculative,
+		    bool can_unsync, bool host_writable)
+{
+	return set_spte_prot(vcpu, sptep, pte_access, level, gfn, pfn,
+			     speculative, can_unsync, host_writable, NULL);
+}
+
+static int mmu_set_spte_prot(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
+			     int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
+			     bool speculative, bool host_writable, pgprot_t *pfn_prot)
 {
 	int was_rmapped = 0;
 	int rmap_count;
@@ -2841,8 +2852,8 @@  static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
 			was_rmapped = 1;
 	}
 
-	if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
-	      true, host_writable)) {
+	if (set_spte_prot(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
+			  true, host_writable, pfn_prot)) {
 		if (write_fault)
 			ret = RET_PF_EMULATE;
 		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
@@ -2872,6 +2883,14 @@  static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
 	return ret;
 }
 
+static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
+			int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
+			bool speculative, bool host_writable)
+{
+	return mmu_set_spte_prot(vcpu, sptep, pte_access, write_fault, level,
+				 gfn, pfn, speculative, host_writable, NULL);
+}
+
 static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 				     bool no_dirty_log)
 {
@@ -2954,7 +2973,8 @@  static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
 }
 
 static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable,
-			int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
+			int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault,
+			pgprot_t *pfn_prot)
 {
 	struct kvm_shadow_walk_iterator iterator;
 	struct kvm_mmu_page *sp;
@@ -2966,10 +2986,11 @@  static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable,
 
 	for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
 		if (iterator.level == level) {
-			emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
+			emulate = mmu_set_spte_prot(vcpu, iterator.sptep, ACC_ALL,
 					       write, level, gfn, pfn, prefault,
-					       map_writable);
-			direct_pte_prefetch(vcpu, iterator.sptep);
+					       map_writable, pfn_prot);
+			if (!pfn_prot)
+				direct_pte_prefetch(vcpu, iterator.sptep);
 			++vcpu->stat.pf_fixed;
 			break;
 		}
@@ -3317,7 +3338,8 @@  static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
 		goto out_unlock;
 	if (likely(!force_pt_level))
 		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
-	r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
+	r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault,
+			 NULL);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	return r;
@@ -3868,6 +3890,8 @@  static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	unsigned long mmu_seq;
 	int write = error_code & PFERR_WRITE_MASK;
 	bool map_writable;
+	pgprot_t prot;
+	bool prot_valid;
 
 	MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
@@ -3900,6 +3924,8 @@  static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
 		return r;
 
+	prot_valid = kvm_vcpu_gfn_to_pgprot(vcpu, gfn, pfn, &prot);
+
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
 		goto out_unlock;
@@ -3907,7 +3933,8 @@  static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 		goto out_unlock;
 	if (likely(!force_pt_level))
 		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
-	r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
+	r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault,
+			 prot_valid ? &prot : NULL);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	return r;
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index fe7d57a8fb60..cab593ea8956 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -998,6 +998,7 @@  void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn)
 	*prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
 			 cachemode2protval(pcm));
 }
+EXPORT_SYMBOL_GPL(track_pfn_insert);
 
 /*
  * untrack_pfn is called while unmapping a pfnmap for a region.
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6882538eda32..dffc34c87914 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -67,6 +67,8 @@ 
 #define KVM_PFN_ERR_HWPOISON	(KVM_PFN_ERR_MASK + 1)
 #define KVM_PFN_ERR_RO_FAULT	(KVM_PFN_ERR_MASK + 2)
 
+#define kvm_pfn_to_pfn(x) ((pfn_t){ .val = (x)})
+
 /*
  * error pfns indicate that the gfn is in slot but faild to
  * translate it to pfn on host.
@@ -702,6 +704,8 @@  kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn);
 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn);
 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable);
+bool kvm_vcpu_gfn_to_pgprot(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn,
+			    pgprot_t *prot);
 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset,
 			     int len);
 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, void *data,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 3d73299e05f2..1b2581fd310c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1315,6 +1315,37 @@  unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *w
 	return gfn_to_hva_memslot_prot(slot, gfn, writable);
 }
 
+bool kvm_vcpu_gfn_to_pgprot(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn,
+			    pgprot_t *prot)
+{
+	kvm_pfn_t _pfn;
+	unsigned long addr;
+	struct vm_area_struct *vma;
+	bool prot_valid = false;
+
+	_pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
+	if (_pfn != pfn)
+		return false;
+
+	addr = kvm_vcpu_gfn_to_hva(vcpu, gfn);
+	if (kvm_is_error_hva(addr))
+		return false;
+
+	down_read(&vcpu->kvm->mm->mmap_sem);
+	vma = find_vma(vcpu->kvm->mm, addr);
+	if (!vma)
+		goto out;
+
+	*prot = vma->vm_page_prot;
+	track_pfn_insert(vma, prot, kvm_pfn_to_pfn(pfn));
+	prot_valid = true;
+
+ out:
+	up_read(&vcpu->kvm->mm->mmap_sem);
+
+	return prot_valid;
+}
+
 static int get_user_page_nowait(unsigned long start, int write,
 		struct page **page)
 {
-- 
2.11.0