[4/5] KVM: MMU: Optimize pte permission checks

Message ID 1347460194-11807-5-git-send-email-avi@redhat.com (mailing list archive)
State New, archived

Commit Message

Avi Kivity Sept. 12, 2012, 2:29 p.m. UTC
walk_addr_generic() permission checks are a maze of branchy code, which is
performed four times per lookup.  It depends on the type of access, efer.nxe,
cr0.wp, cr4.smep, and in the near future, cr4.smap.

Optimize this away by precalculating all variants and storing them in a
bitmap.  The bitmap is recalculated when rarely-changing variables change
(cr0, cr4) and is indexed by the often-changing variables (page fault error
code, pte access permissions).

The result is short, branch-free code.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  7 +++++++
 arch/x86/kvm/mmu.c              | 38 ++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/mmu.h              | 13 -------------
 arch/x86/kvm/paging_tmpl.h      | 22 ++++------------------
 arch/x86/kvm/x86.c              | 12 +++++-------
 5 files changed, 54 insertions(+), 38 deletions(-)
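
In short, the walker's per-level branch maze collapses into a single table
lookup.  A minimal sketch of that lookup (illustrative only; the helper name
is not from the patch -- the real code indexes mmu->permissions inline, see
the diff below):

static inline bool walk_would_fault(struct kvm_mmu *mmu, unsigned pfec,
                                    unsigned pte_access)
{
        /* pfec bits [4:1] select the byte, the pte's ACC_* bits select the
         * bit; a set bit means "permission fault" for that combination. */
        return (mmu->permissions[pfec >> 1] >> pte_access) & 1;
}

The table itself is rebuilt by update_permission_bitmask(), hooked into the
mmu context init paths, so the branchy evaluation runs only when cr0/cr4/efer
state changes, not on every walk.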

Comments

Xiao Guangrong Sept. 13, 2012, 12:09 p.m. UTC | #1
On 09/12/2012 10:29 PM, Avi Kivity wrote:
> walk_addr_generic() permission checks are a maze of branchy code, which is
> performed four times per lookup.  It depends on the type of access, efer.nxe,
> cr0.wp, cr4.smep, and in the near future, cr4.smap.
> 
> Optimize this away by precalculating all variants and storing them in a
> bitmap.  The bitmap is recalculated when rarely-changing variables change
> (cr0, cr4) and is indexed by the often-changing variables (page fault error
> code, pte access permissions).

Really elegant!

> 
> The result is short, branch-free code.
> 
> Signed-off-by: Avi Kivity <avi@redhat.com>

> +static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
> +{
> +	unsigned bit, byte, pfec;
> +	u8 map;
> +	bool fault, x, w, u, wf, uf, ff, smep;
> +
> +	smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
> +	for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
> +		pfec = byte << 1;
> +		map = 0;
> +		wf = pfec & PFERR_WRITE_MASK;
> +		uf = pfec & PFERR_USER_MASK;
> +		ff = pfec & PFERR_FETCH_MASK;
> +		for (bit = 0; bit < 8; ++bit) {
> +			x = bit & ACC_EXEC_MASK;
> +			w = bit & ACC_WRITE_MASK;
> +			u = bit & ACC_USER_MASK;
> +
> +			/* Not really needed: !nx will cause pte.nx to fault */
> +			x |= !mmu->nx;
> +			/* Allow supervisor writes if !cr0.wp */
> +			w |= !is_write_protection(vcpu) && !uf;
> +			/* Disallow supervisor fetches if cr4.smep */
> +			x &= !(smep && !uf);

In the smep case, supervisor mode can still fetch from the page if pte.u == 0,
so should it be x &= !(smep && !uf && u)?
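
To spell out the difference (a hypothetical side-by-side, not kernel code;
both model x &= ... starting from x == 1, for a supervisor fetch, i.e.
uf == 0, with cr4.smep enabled):

#include <stdbool.h>

static bool x_as_posted(bool smep, bool uf, bool u)
{
        return !(smep && !uf);          /* ignores pte.u entirely */
}

static bool x_as_suggested(bool smep, bool uf, bool u)
{
        return !(smep && !uf && u);     /* faults only when pte.u == 1 */
}

/* With smep = true, uf = false:
 *   u == true  (user page):       both return false -> the fetch faults,
 *                                 as SMEP requires;
 *   u == false (supervisor page): x_as_posted returns false (spurious fault),
 *                                 x_as_suggested returns true (fetch allowed).
 */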

> @@ -3672,20 +3672,18 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
>  				gpa_t *gpa, struct x86_exception *exception,
>  				bool write)
>  {
> -	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
> +	u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
> +		| (write ? PFERR_WRITE_MASK : 0);
> +	u8 bit = vcpu->arch.access;
> 
> -	if (vcpu_match_mmio_gva(vcpu, gva) &&
> -		  check_write_user_access(vcpu, write, access,
> -		  vcpu->arch.access)) {
> +	if (vcpu_match_mmio_gva(vcpu, gva)
> +	    && ((vcpu->arch.walk_mmu->permissions[access >> 1] >> bit) & 1)) {

Shouldn't this be !((vcpu->arch.walk_mmu->permissions[access >> 1] >> bit) & 1)?

Would it be better to introduce a function to do the permission check?
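
Something like this, perhaps (a sketch only; the name permission_fault is
hypothetical, not something introduced by this patch):

static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access,
                                    unsigned pfec)
{
        return (mmu->permissions[pfec >> 1] >> pte_access) & 1;
}

Both walk_addr_generic() and vcpu_mmio_gva_to_gpa() could then share it, and
the latter would naturally read !permission_fault(...), which also covers the
missing negation above.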

Avi Kivity Sept. 13, 2012, 12:15 p.m. UTC | #2
On 09/13/2012 03:09 PM, Xiao Guangrong wrote:
>> 
>> The result is short, branch-free code.
>> 
>> Signed-off-by: Avi Kivity <avi@redhat.com>
> 
>> +static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
>> +{
>> +	unsigned bit, byte, pfec;
>> +	u8 map;
>> +	bool fault, x, w, u, wf, uf, ff, smep;
>> +
>> +	smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
>> +	for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
>> +		pfec = byte << 1;
>> +		map = 0;
>> +		wf = pfec & PFERR_WRITE_MASK;
>> +		uf = pfec & PFERR_USER_MASK;
>> +		ff = pfec & PFERR_FETCH_MASK;
>> +		for (bit = 0; bit < 8; ++bit) {
>> +			x = bit & ACC_EXEC_MASK;
>> +			w = bit & ACC_WRITE_MASK;
>> +			u = bit & ACC_USER_MASK;
>> +
>> +			/* Not really needed: !nx will cause pte.nx to fault */
>> +			x |= !mmu->nx;
>> +			/* Allow supervisor writes if !cr0.wp */
>> +			w |= !is_write_protection(vcpu) && !uf;
>> +			/* Disallow supervisor fetches if cr4.smep */
>> +			x &= !(smep && !uf);
> 
> In the smep case, supervisor mode can still fetch from the page if pte.u == 0,
> so should it be x &= !(smep && !uf && u)?

Good catch, will fix.

> 
>> @@ -3672,20 +3672,18 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
>>  				gpa_t *gpa, struct x86_exception *exception,
>>  				bool write)
>>  {
>> -	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
>> +	u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
>> +		| (write ? PFERR_WRITE_MASK : 0);
>> +	u8 bit = vcpu->arch.access;
>> 
>> -	if (vcpu_match_mmio_gva(vcpu, gva) &&
>> -		  check_write_user_access(vcpu, write, access,
>> -		  vcpu->arch.access)) {
>> +	if (vcpu_match_mmio_gva(vcpu, gva)
>> +	    && ((vcpu->arch.walk_mmu->permissions[access >> 1] >> bit) & 1)) {
> 
> Shouldn't this be !((vcpu->arch.walk_mmu->permissions[access >> 1] >> bit) & 1)?
> 
> Would it be better to introduce a function to do the permission check?
> 

Probably, I'll rethink it.
Xiao Guangrong Sept. 13, 2012, 12:41 p.m. UTC | #3
On 09/12/2012 10:29 PM, Avi Kivity wrote:

> +		pte_access = pt_access & gpte_access(vcpu, pte);
> +		eperm |= (mmu->permissions[access >> 1] >> pte_access) & 1;
> 
>  		last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte);
> -		if (last_gpte) {
> -			pte_access = pt_access & gpte_access(vcpu, pte);
> -			/* check if the kernel is fetching from user page */
> -			if (unlikely(pte_access & PT_USER_MASK) &&
> -			    kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
> -				if (fetch_fault && !user_fault)
> -					eperm = true;
> -		}

I see this in the SDM:

If CR4.SMEP = 1, instructions may be fetched from any linear
address with a valid translation for which the U/S flag (bit 2) is 0 in at
least one of the paging-structure entries controlling the translation.

This patch checks smep at every level, which breaks this rule
(the current code checks smep only at the last level).
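
A concrete two-level case shows the difference (a toy illustration, not
kernel code; ACC_USER_MASK = 4 matches the kernel's ACC_* bit layout that the
patch uses as its bit index):

#include <stdbool.h>

#define ACC_USER_MASK 4

/* supervisor instruction fetch with cr4.smep = 1: it should fault only if
 * the final, cumulative translation is user-accessible */
static bool smep_blocks_fetch(unsigned cumulative_access)
{
        return cumulative_access & ACC_USER_MASK;
}

int main(void)
{
        unsigned pde_access = 7;        /* PDE.U = 1 (plus w, x) */
        unsigned pte_access = 3;        /* PTE.U = 0             */

        /* per-level: the check at the PDE already sees ACC_USER_MASK set */
        bool per_level = smep_blocks_fetch(pde_access);

        /* end of walk: PTE.U = 0 clears it, so the fetch is allowed, as
         * the SDM text above requires */
        bool end_of_walk = smep_blocks_fetch(pde_access & pte_access);

        return per_level != end_of_walk;        /* 1: the two disagree */
}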

Avi Kivity Sept. 13, 2012, 1:35 p.m. UTC | #4
On 09/13/2012 03:41 PM, Xiao Guangrong wrote:
> On 09/12/2012 10:29 PM, Avi Kivity wrote:
> 
>> +		pte_access = pt_access & gpte_access(vcpu, pte);
>> +		eperm |= (mmu->permissions[access >> 1] >> pte_access) & 1;
>> 
>>  		last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte);
>> -		if (last_gpte) {
>> -			pte_access = pt_access & gpte_access(vcpu, pte);
>> -			/* check if the kernel is fetching from user page */
>> -			if (unlikely(pte_access & PT_USER_MASK) &&
>> -			    kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
>> -				if (fetch_fault && !user_fault)
>> -					eperm = true;
>> -		}
> 
> I see this in the SDM:
> 
> If CR4.SMEP = 1, instructions may be fetched from any linear
> address with a valid translation for which the U/S flag (bit 2) is 0 in at
> least one of the paging-structure entries controlling the translation.

Another good catch.

> 
> This patch checks smep at every level, which breaks this rule
> (the current code checks smep only at the last level).
> 

We can just move the permission check to the end of the loop.  We used
to terminate the loop on a permission error, but now we do the whole
thing anyway.

It does mean that we'll need to set accessed bits after the loop is
complete.
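
Roughly this shape (a sketch of the direction only, not the actual respin;
the toy signature and names are illustrative):

static bool toy_walk_permitted(const u8 permissions[16], unsigned pfec,
                               const unsigned *gpte_access, int levels)
{
        unsigned pt_access = ACC_ALL;   /* exec | write | user */
        int level;

        /* accumulate the effective ACC_* access over the whole walk first */
        for (level = 0; level < levels; level++)
                pt_access &= gpte_access[level];

        /* one bitmap lookup at the end of the walk; a set bit still means
         * "permission fault" */
        return !((permissions[pfec >> 1] >> pt_access) & 1);
}

In walk_addr_generic() terms, both the eperm computation and the
PT_ACCESSED_MASK updates would move below the walker loop, once pt_access
holds its final cumulative value -- e.g. as a second pass over the saved
ptes.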

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 64adb61..3318bde 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -287,6 +287,13 @@  struct kvm_mmu {
 	union kvm_mmu_page_role base_role;
 	bool direct_map;
 
+	/*
+	 * Bitmap; bit set = permission fault
+	 * Byte index: page fault error code [4:1]
+	 * Bit index: pte permissions in ACC_* format
+	 */
+	u8 permissions[16];
+
 	u64 *pae_root;
 	u64 *lm_root;
 	u64 rsvd_bits_mask[2][4];
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f297a2c..ce78408 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3516,6 +3516,38 @@  static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
 	}
 }
 
+static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+{
+	unsigned bit, byte, pfec;
+	u8 map;
+	bool fault, x, w, u, wf, uf, ff, smep;
+
+	smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
+	for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
+		pfec = byte << 1;
+		map = 0;
+		wf = pfec & PFERR_WRITE_MASK;
+		uf = pfec & PFERR_USER_MASK;
+		ff = pfec & PFERR_FETCH_MASK;
+		for (bit = 0; bit < 8; ++bit) {
+			x = bit & ACC_EXEC_MASK;
+			w = bit & ACC_WRITE_MASK;
+			u = bit & ACC_USER_MASK;
+
+			/* Not really needed: !nx will cause pte.nx to fault */
+			x |= !mmu->nx;
+			/* Allow supervisor writes if !cr0.wp */
+			w |= !is_write_protection(vcpu) && !uf;
+			/* Disallow supervisor fetches if cr4.smep */
+			x &= !(smep && !uf);
+
+			fault = (ff && !x) || (uf && !u) || (wf && !w);
+			map |= fault << bit;
+		}
+		mmu->permissions[byte] = map;
+	}
+}
+
 static int paging64_init_context_common(struct kvm_vcpu *vcpu,
 					struct kvm_mmu *context,
 					int level)
@@ -3524,6 +3556,7 @@  static int paging64_init_context_common(struct kvm_vcpu *vcpu,
 	context->root_level = level;
 
 	reset_rsvds_bits_mask(vcpu, context);
+	update_permission_bitmask(vcpu, context);
 
 	ASSERT(is_pae(vcpu));
 	context->new_cr3 = paging_new_cr3;
@@ -3552,6 +3585,7 @@  static int paging32_init_context(struct kvm_vcpu *vcpu,
 	context->root_level = PT32_ROOT_LEVEL;
 
 	reset_rsvds_bits_mask(vcpu, context);
+	update_permission_bitmask(vcpu, context);
 
 	context->new_cr3 = paging_new_cr3;
 	context->page_fault = paging32_page_fault;
@@ -3612,6 +3646,8 @@  static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 		context->gva_to_gpa = paging32_gva_to_gpa;
 	}
 
+	update_permission_bitmask(vcpu, context);
+
 	return 0;
 }
 
@@ -3687,6 +3723,8 @@  static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 		g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
 	}
 
+	update_permission_bitmask(vcpu, g_context);
+
 	return 0;
 }
 
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 2832081..143ee70 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -89,17 +89,4 @@  static inline bool is_write_protection(struct kvm_vcpu *vcpu)
 	return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
 }
 
-static inline bool check_write_user_access(struct kvm_vcpu *vcpu,
-					   bool write_fault, bool user_fault,
-					   unsigned long pte)
-{
-	if (unlikely(write_fault && !is_writable_pte(pte)
-	      && (user_fault || is_write_protection(vcpu))))
-		return false;
-
-	if (unlikely(user_fault && !(pte & PT_USER_MASK)))
-		return false;
-
-	return true;
-}
 #endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 1cbf576..59c4319 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -129,7 +129,7 @@  static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	pt_element_t pte;
 	pt_element_t __user *uninitialized_var(ptep_user);
 	gfn_t table_gfn;
-	unsigned index, pt_access, uninitialized_var(pte_access);
+	unsigned index, pt_access, pte_access;
 	gpa_t pte_gpa;
 	bool eperm, last_gpte;
 	int offset;
@@ -195,24 +195,10 @@  retry_walk:
 			goto error;
 		}
 
-		if (!check_write_user_access(vcpu, write_fault, user_fault,
-					  pte))
-			eperm = true;
-
-#if PTTYPE == 64
-		if (unlikely(fetch_fault && (pte & PT64_NX_MASK)))
-			eperm = true;
-#endif
+		pte_access = pt_access & gpte_access(vcpu, pte);
+		eperm |= (mmu->permissions[access >> 1] >> pte_access) & 1;
 
 		last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte);
-		if (last_gpte) {
-			pte_access = pt_access & gpte_access(vcpu, pte);
-			/* check if the kernel is fetching from user page */
-			if (unlikely(pte_access & PT_USER_MASK) &&
-			    kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
-				if (fetch_fault && !user_fault)
-					eperm = true;
-		}
 
 		if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
 			int ret;
@@ -257,7 +243,7 @@  retry_walk:
 			break;
 		}
 
-		pt_access &= gpte_access(vcpu, pte);
+		pt_access &= pte_access;
 		--walker->level;
 	}
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c4d451e..e59b9fe 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3672,20 +3672,18 @@  static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
 				gpa_t *gpa, struct x86_exception *exception,
 				bool write)
 {
-	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+	u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
+		| (write ? PFERR_WRITE_MASK : 0);
+	u8 bit = vcpu->arch.access;
 
-	if (vcpu_match_mmio_gva(vcpu, gva) &&
-		  check_write_user_access(vcpu, write, access,
-		  vcpu->arch.access)) {
+	if (vcpu_match_mmio_gva(vcpu, gva)
+	    && ((vcpu->arch.walk_mmu->permissions[access >> 1] >> bit) & 1)) {
 		*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
 					(gva & (PAGE_SIZE - 1));
 		trace_vcpu_match_mmio(gva, *gpa, write, false);
 		return 1;
 	}
 
-	if (write)
-		access |= PFERR_WRITE_MASK;
-
 	*gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 
 	if (*gpa == UNMAPPED_GVA)