[06/11] KVM: MMU: let page fault handler be aware tracked page
diff mbox

Message ID 1448907973-36066-7-git-send-email-guangrong.xiao@linux.intel.com
State New
Headers show

Commit Message

Xiao Guangrong Nov. 30, 2015, 6:26 p.m. UTC
The page fault caused by write access on a write-tracked page cannot
be fixed; it always needs to be emulated. page_fault_handle_page_track()
is the fast path we introduce here to skip holding the mmu-lock and
walking the shadow page table

However, if the page table entry is not present, it is worth making it
present and read-only so that read accesses still work

mmu_need_write_protect() needs to be adjusted to avoid the page becoming
writable when making a page table entry present or when syncing/prefetching
shadow page table entries

Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
---
 arch/x86/include/asm/kvm_page_track.h |  2 ++
 arch/x86/kvm/mmu.c                    | 44 +++++++++++++++++++++++++++++------
 arch/x86/kvm/page_track.c             | 14 +++++++++++
 arch/x86/kvm/paging_tmpl.h            |  3 +++
 4 files changed, 56 insertions(+), 7 deletions(-)

Comments

Kai Huang Dec. 15, 2015, 8:11 a.m. UTC | #1
On 12/01/2015 02:26 AM, Xiao Guangrong wrote:
> The page fault caused by write access on the write tracked page can not
> be fixed, it always need to be emulated. page_fault_handle_page_track()
> is the fast path we introduce here to skip holding mmu-lock and shadow
> page table walking
Why can it be outside of the mmu-lock? Is it OK if some other thread
removes tracking of this page simultaneously? Shall we assume the
emulation code should handle this case?

Even if it works for write protection, is it OK for other tracking purposes
(as your tracking framework can be extended for other purposes)?
>
> However, if the page table is not present, it is worth making the page
> table entry present and readonly to make the read access happy
It's fine for tracking writes from the guest. But what if I want to track
every read from the guest? Probably I am exaggerating :)

Thanks,
-Kai
>
> mmu_need_write_protect() need to be cooked to avoid page becoming writable
> when making page table present or sync/prefetch shadow page table entries
>
> Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
> ---
>   arch/x86/include/asm/kvm_page_track.h |  2 ++
>   arch/x86/kvm/mmu.c                    | 44 +++++++++++++++++++++++++++++------
>   arch/x86/kvm/page_track.c             | 14 +++++++++++
>   arch/x86/kvm/paging_tmpl.h            |  3 +++
>   4 files changed, 56 insertions(+), 7 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h
> index 9cc17c6..f223201 100644
> --- a/arch/x86/include/asm/kvm_page_track.h
> +++ b/arch/x86/include/asm/kvm_page_track.h
> @@ -15,4 +15,6 @@ void kvm_page_track_add_page(struct kvm *kvm, gfn_t gfn,
>   			     enum kvm_page_track_mode mode);
>   void kvm_page_track_remove_page(struct kvm *kvm, gfn_t gfn,
>   				enum kvm_page_track_mode mode);
> +bool kvm_page_track_check_mode(struct kvm_vcpu *vcpu, gfn_t gfn,
> +			       enum kvm_page_track_mode mode);
>   #endif
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 39809b8..b23f9fc 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -41,6 +41,7 @@
>   #include <asm/cmpxchg.h>
>   #include <asm/io.h>
>   #include <asm/vmx.h>
> +#include <asm/kvm_page_track.h>
>   
>   /*
>    * When setting this variable to true it enables Two-Dimensional-Paging
> @@ -2456,25 +2457,29 @@ static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
>   	}
>   }
>   
> -static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
> -				  bool can_unsync)
> +static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
> +				   bool can_unsync)
>   {
>   	struct kvm_mmu_page *s;
>   	bool need_unsync = false;
>   
> +	if (kvm_page_track_check_mode(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
> +		return true;
> +
>   	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
>   		if (!can_unsync)
> -			return 1;
> +			return true;
>   
>   		if (s->role.level != PT_PAGE_TABLE_LEVEL)
> -			return 1;
> +			return true;
>   
>   		if (!s->unsync)
>   			need_unsync = true;
>   	}
>   	if (need_unsync)
>   		kvm_unsync_pages(vcpu, gfn);
> -	return 0;
> +
> +	return false;
>   }
>   
>   static bool kvm_is_mmio_pfn(pfn_t pfn)
> @@ -3388,10 +3393,30 @@ int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
>   }
>   EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
>   
> +static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
> +					 u32 error_code, gfn_t gfn)
> +{
> +	if (unlikely(error_code & PFERR_RSVD_MASK))
> +		return false;
> +
> +	if (!(error_code & PFERR_PRESENT_MASK) ||
> +	      !(error_code & PFERR_WRITE_MASK))
> +		return false;
> +
> +	/*
> +	 * guest is writing the page which is write tracked which can
> +	 * not be fixed by page fault handler.
> +	 */
> +	if (kvm_page_track_check_mode(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
> +		return true;
> +
> +	return false;
> +}
> +
>   static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
>   				u32 error_code, bool prefault)
>   {
> -	gfn_t gfn;
> +	gfn_t gfn = gva >> PAGE_SHIFT;
>   	int r;
>   
>   	pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
> @@ -3403,13 +3428,15 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
>   			return r;
>   	}
>   
> +	if (page_fault_handle_page_track(vcpu, error_code, gfn))
> +		return 1;
> +
>   	r = mmu_topup_memory_caches(vcpu);
>   	if (r)
>   		return r;
>   
>   	MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
>   
> -	gfn = gva >> PAGE_SHIFT;
>   
>   	return nonpaging_map(vcpu, gva & PAGE_MASK,
>   			     error_code, gfn, prefault);
> @@ -3493,6 +3520,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
>   			return r;
>   	}
>   
> +	if (page_fault_handle_page_track(vcpu, error_code, gfn))
> +		return 1;
> +
>   	r = mmu_topup_memory_caches(vcpu);
>   	if (r)
>   		return r;
> diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c
> index ad510db..dc2da12 100644
> --- a/arch/x86/kvm/page_track.c
> +++ b/arch/x86/kvm/page_track.c
> @@ -151,3 +151,17 @@ void kvm_page_track_remove_page(struct kvm *kvm, gfn_t gfn,
>   		spin_unlock(&kvm->mmu_lock);
>   	}
>   }
> +
> +/*
> + * check if the corresponding access on the specified guest page is tracked.
> + */
> +bool kvm_page_track_check_mode(struct kvm_vcpu *vcpu, gfn_t gfn,
> +			       enum kvm_page_track_mode mode)
> +{
> +	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
> +	int index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
> +
> +	WARN_ON(!check_mode(mode));
> +
> +	return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]);
> +}
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index 91e939b..ac85682 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -735,6 +735,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
>   		return 0;
>   	}
>   
> +	if (page_fault_handle_page_track(vcpu, error_code, walker.gfn))
> +		return 1;
> +
>   	vcpu->arch.write_fault_to_shadow_pgtable = false;
>   
>   	is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiao Guangrong Dec. 15, 2015, 9:03 a.m. UTC | #2
On 12/15/2015 04:11 PM, Kai Huang wrote:
>
>
> On 12/01/2015 02:26 AM, Xiao Guangrong wrote:
>> The page fault caused by write access on the write tracked page can not
>> be fixed, it always need to be emulated. page_fault_handle_page_track()
>> is the fast path we introduce here to skip holding mmu-lock and shadow
>> page table walking
> Why can it be out side of mmu-lock? Is it OK that some other thread removes tracking of this page
> simultaneously? Shall we assuming the emulation code should handle this case?
>

What you mentioned is the worst case; if that happens, the vcpu will spend
a longer time emulating the access rather than retrying it. That is bad, but
it is a rare contention. It is worth speeding up the common case.

> Even it works for write protection, is it OK for other purpose tracking (as your tracking framework
> can be extended for other purpose)?

We only need to make sure that no track event is lost, i.e., we cannot
skip the case where the index changes from 0 to 1.

If we see the index is 0, the vcpu can hold the mmu-lock and go to the slow
path anyway, so no track event will be lost.

>>
>> However, if the page table is not present, it is worth making the page
>> table entry present and readonly to make the read access happy
> It's  fine for tracking write from guest. But what if I want to track every read from guest?
> Probably I am exaggerating :)
>

Then we do not go to the real page fault handler; we just keep the shadow
page entry non-present.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kai Huang Dec. 16, 2015, 7:31 a.m. UTC | #3
On 12/15/2015 05:03 PM, Xiao Guangrong wrote:
>
>
> On 12/15/2015 04:11 PM, Kai Huang wrote:
>>
>>
>> On 12/01/2015 02:26 AM, Xiao Guangrong wrote:
>>> The page fault caused by write access on the write tracked page can not
>>> be fixed, it always need to be emulated. page_fault_handle_page_track()
>>> is the fast path we introduce here to skip holding mmu-lock and shadow
>>> page table walking
>> Why can it be out side of mmu-lock? Is it OK that some other thread 
>> removes tracking of this page
>> simultaneously? Shall we assuming the emulation code should handle 
>> this case?
>>
>
> What your mentioned is the worst case, if that happen the vcpu will spend
> longer time to emulate the access rather then retry it. It is bad but 
> it is
> the rare contention. It is worth speeding up the common / most case.
My concern is whether, when this case happens, emulating the access is
still the right behavior — you know, after another thread has removed the
GFN from tracking.
And as the notifier's track_write callback will be called in the
emulation code, won't there be a problem if the GFN has been removed from
tracking by another thread?

Thanks,
-Kai
>
>> Even it works for write protection, is it OK for other purpose 
>> tracking (as your tracking framework
>> can be extended for other purpose)?
>
> We only need to make sure that no track event is lost, i.e, we can not
> skip the case that the index is changed from 0 to 1.
>
> If we see index is 0, the vcpu can hold the mmu-lock and go to slow path
> anyway so no track event will be lost.
>
>>>
>>> However, if the page table is not present, it is worth making the page
>>> table entry present and readonly to make the read access happy
>> It's  fine for tracking write from guest. But what if I want to track 
>> every read from guest?
>> Probably I am exaggerating :)
>>
>
> Then we do not go to the real page fault handler, just keep the shadow
> page entry non-present.
> -- 
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiao Guangrong Dec. 16, 2015, 8:23 a.m. UTC | #4
On 12/16/2015 03:31 PM, Kai Huang wrote:
>
>
> On 12/15/2015 05:03 PM, Xiao Guangrong wrote:
>>
>>
>> On 12/15/2015 04:11 PM, Kai Huang wrote:
>>>
>>>
>>> On 12/01/2015 02:26 AM, Xiao Guangrong wrote:
>>>> The page fault caused by write access on the write tracked page can not
>>>> be fixed, it always need to be emulated. page_fault_handle_page_track()
>>>> is the fast path we introduce here to skip holding mmu-lock and shadow
>>>> page table walking
>>> Why can it be out side of mmu-lock? Is it OK that some other thread removes tracking of this page
>>> simultaneously? Shall we assuming the emulation code should handle this case?
>>>
>>
>> What your mentioned is the worst case, if that happen the vcpu will spend
>> longer time to emulate the access rather then retry it. It is bad but it is
>> the rare contention. It is worth speeding up the common / most case.
> My concern is when this case happen, whether emulating the access is still the right behavior, you
> know, after other thread removed the GFN from tracking..
> And as the notifier's  track_write call back will be called in the emulating code, won't there be
> problem if the GFN has been removed from tracking by other thread?
>

When the notify callback is called, the tracker should itself check whether
the gfn is one it is interested in. If it sees the gfn is no longer tracked,
it can skip the callback.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch
diff mbox

diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h
index 9cc17c6..f223201 100644
--- a/arch/x86/include/asm/kvm_page_track.h
+++ b/arch/x86/include/asm/kvm_page_track.h
@@ -15,4 +15,6 @@  void kvm_page_track_add_page(struct kvm *kvm, gfn_t gfn,
 			     enum kvm_page_track_mode mode);
 void kvm_page_track_remove_page(struct kvm *kvm, gfn_t gfn,
 				enum kvm_page_track_mode mode);
+bool kvm_page_track_check_mode(struct kvm_vcpu *vcpu, gfn_t gfn,
+			       enum kvm_page_track_mode mode);
 #endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 39809b8..b23f9fc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -41,6 +41,7 @@ 
 #include <asm/cmpxchg.h>
 #include <asm/io.h>
 #include <asm/vmx.h>
+#include <asm/kvm_page_track.h>
 
 /*
  * When setting this variable to true it enables Two-Dimensional-Paging
@@ -2456,25 +2457,29 @@  static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
 	}
 }
 
-static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
-				  bool can_unsync)
+static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
+				   bool can_unsync)
 {
 	struct kvm_mmu_page *s;
 	bool need_unsync = false;
 
+	if (kvm_page_track_check_mode(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+		return true;
+
 	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
 		if (!can_unsync)
-			return 1;
+			return true;
 
 		if (s->role.level != PT_PAGE_TABLE_LEVEL)
-			return 1;
+			return true;
 
 		if (!s->unsync)
 			need_unsync = true;
 	}
 	if (need_unsync)
 		kvm_unsync_pages(vcpu, gfn);
-	return 0;
+
+	return false;
 }
 
 static bool kvm_is_mmio_pfn(pfn_t pfn)
@@ -3388,10 +3393,30 @@  int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
 }
 EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
 
+static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
+					 u32 error_code, gfn_t gfn)
+{
+	if (unlikely(error_code & PFERR_RSVD_MASK))
+		return false;
+
+	if (!(error_code & PFERR_PRESENT_MASK) ||
+	      !(error_code & PFERR_WRITE_MASK))
+		return false;
+
+	/*
+	 * guest is writing the page which is write tracked which can
+	 * not be fixed by page fault handler.
+	 */
+	if (kvm_page_track_check_mode(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+		return true;
+
+	return false;
+}
+
 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 				u32 error_code, bool prefault)
 {
-	gfn_t gfn;
+	gfn_t gfn = gva >> PAGE_SHIFT;
 	int r;
 
 	pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
@@ -3403,13 +3428,15 @@  static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 			return r;
 	}
 
+	if (page_fault_handle_page_track(vcpu, error_code, gfn))
+		return 1;
+
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
 		return r;
 
 	MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
-	gfn = gva >> PAGE_SHIFT;
 
 	return nonpaging_map(vcpu, gva & PAGE_MASK,
 			     error_code, gfn, prefault);
@@ -3493,6 +3520,9 @@  static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 			return r;
 	}
 
+	if (page_fault_handle_page_track(vcpu, error_code, gfn))
+		return 1;
+
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
 		return r;
diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c
index ad510db..dc2da12 100644
--- a/arch/x86/kvm/page_track.c
+++ b/arch/x86/kvm/page_track.c
@@ -151,3 +151,17 @@  void kvm_page_track_remove_page(struct kvm *kvm, gfn_t gfn,
 		spin_unlock(&kvm->mmu_lock);
 	}
 }
+
+/*
+ * check if the corresponding access on the specified guest page is tracked.
+ */
+bool kvm_page_track_check_mode(struct kvm_vcpu *vcpu, gfn_t gfn,
+			       enum kvm_page_track_mode mode)
+{
+	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+	int index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
+
+	WARN_ON(!check_mode(mode));
+
+	return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]);
+}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 91e939b..ac85682 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -735,6 +735,9 @@  static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 		return 0;
 	}
 
+	if (page_fault_handle_page_track(vcpu, error_code, walker.gfn))
+		return 1;
+
 	vcpu->arch.write_fault_to_shadow_pgtable = false;
 
 	is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,