
RFC: Add reserved bits check

Message ID 9832F13BD22FB94A829F798DA4A8280501A2106E6A@pdsmsx503.ccr.corp.intel.com (mailing list archive)
State New, archived

Commit Message

Dong, Eddie March 27, 2009, 4:19 a.m. UTC
Current KVM doesn't check the reserved bits of guest page table entries, while it may use reserved bits to bypass guest #PF in VMX.

This patch adds this check and leaves the shadow pte unconstructed if a guest reserved bit is set (RSVD=1).
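For reference, a small worked example of the mask arithmetic involved; the rsvd_bits() helper is the one from the patch below, and MAXPHYADDR=40 is only illustrative:

#include <stdint.h>
#include <stdio.h>

/* Helper from the patch: a mask with bits s..e (inclusive) set. */
static inline uint64_t rsvd_bits(int s, int e)
{
        return ((1ULL << (e - s + 1)) - 1) << s;
}

int main(void)
{
        /* With an illustrative guest MAXPHYADDR of 40, bits 40..51 of a
         * long-mode 4K PTE are reserved: mask == 0x000fff0000000000. */
        uint64_t mask = rsvd_bits(40, 51);
        uint64_t gpte = 1ULL << 45;     /* guest PTE with a reserved bit set */

        printf("mask %#llx, rsvd fault: %d\n",
               (unsigned long long)mask, (gpte & mask) != 0);
        return 0;
}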

 

Comments?

Thx, eddie

Comments

Avi Kivity March 27, 2009, 9:34 a.m. UTC | #1
Dong, Eddie wrote:
>
> Current KVM doesn't check the reserved bits of guest page table entries, while it may use reserved bits to bypass guest #PF in VMX.
>
> This patch adds this check and leaves the shadow pte unconstructed if a guest reserved bit is set (RSVD=1).
>


> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -261,6 +261,8 @@ struct kvm_mmu {
>   union kvm_mmu_page_role base_role;
>  
>   u64 *pae_root;
> + u64 rsvd_bits_mask[4];
> + u64 large_page_rsvd_mask;
>  }; 

Make large_page_rsvd_mask an array too, in preparation for 1GB pages?

Perhaps u64 rsvd_bits_mask[2][4].  First index is bit 7 of the pte, 
second index is the level.  Makes for faster run time too.
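A rough sketch of how paging64_init_context() might fill such a table (field name and values follow the patch; the two-dimensional layout itself is only the suggestion above, not existing code):

        struct kvm_mmu *context = &vcpu->arch.mmu;
        int maxphyaddr = cpuid_maxphyaddr(vcpu);
        u64 high = rsvd_bits(maxphyaddr, 51);

        /* Row 0: bit 7 of the gpte clear. */
        context->rsvd_bits_mask[0][3] = high | rsvd_bits(7, 8);        /* PML4E */
        context->rsvd_bits_mask[0][2] = high | rsvd_bits(7, 8);        /* PDPTE */
        context->rsvd_bits_mask[0][1] = high;                          /* PDE */
        context->rsvd_bits_mask[0][0] = high;                          /* PTE */
        /* Row 1: bit 7 set; only the 2MB PDE differs, other levels mirror row 0. */
        context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
        context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2];
        context->rsvd_bits_mask[1][1] = high | rsvd_bits(13, 20);      /* 2MB PDE */
        context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];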

>  #define PT_DIRECTORY_LEVEL 2
> @@ -179,6 +180,13 @@ static u64 __read_mostly shadow_user_mask;
>  static u64 __read_mostly shadow_accessed_mask;
>  static u64 __read_mostly shadow_dirty_mask;
>  static u64 __read_mostly shadow_mt_mask;
> +extern struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(
> + struct kvm_vcpu *vcpu, u32 function, u32 index);
>   

This needs to be in a header file, so we don't get random breakage when 
the signature changes.

>  
> +static int is_rsvd_bits_set(struct kvm_vcpu *vcpu, unsigned long pte, int level)
>   

u64 pte... (and bool for return type)

s/pte/gpte/ to make it clear.
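With those changes (and the two-dimensional table suggested above), the check itself could collapse to something like this; a sketch of the suggestion only, not code from the patch:

static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
{
        int ps = !!(gpte & PT_PAGE_SIZE_MASK);  /* bit 7 of the gpte */

        return (gpte & vcpu->arch.mmu.rsvd_bits_mask[ps][level - 1]) != 0;
}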

> @@ -2184,6 +2215,18 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
>  
>  static int paging64_init_context(struct kvm_vcpu *vcpu)
>  {
> + struct kvm_mmu *context = &vcpu->arch.mmu;
> + int maxphyaddr = cpuid_maxphyaddr(vcpu);
> +
> + context->rsvd_bits_mask[3] =
> +  rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
> + context->rsvd_bits_mask[2] =
> +  rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
> + context->rsvd_bits_mask[1] =
> +  rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
> + context->rsvd_bits_mask[0] = rsvd_bits(maxphyaddr, 51);
> + context->large_page_rsvd_mask =  /* 2MB PDE */
> +  rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 20);
>   return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
>  }
>   

Isn't bit 63 reserved if NX is disabled?
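If so, the masks could take EFER.NX into account along these lines; a sketch only, assuming a helper such as is_nx() that tests the guest's EFER.NX:

        u64 exb_bit_rsvd = 0;

        if (!is_nx(vcpu))               /* assumed helper: guest EFER.NX set? */
                exb_bit_rsvd = rsvd_bits(63, 63);

        context->rsvd_bits_mask[0] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51);
        /* ... and OR exb_bit_rsvd into the other masks the same way. */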

>  
> @@ -2206,6 +2258,18 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
>  
>  static int paging32E_init_context(struct kvm_vcpu *vcpu)
>  {
> + struct kvm_mmu *context = &vcpu->arch.mmu;
> + int maxphyaddr = cpuid_maxphyaddr(vcpu);
> +
> + /* 3 levels */
> + context->rsvd_bits_mask[2] = rsvd_bits(maxphyaddr, 63) |
> +  rsvd_bits(7, 8) | rsvd_bits(1,2); /* PDPTE */
>   

Will never be used; PDPTEs are loaded by set_cr3(), not walk_addr().

>  
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index 7314c09..844efe9 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -123,6 +123,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
>   gfn_t table_gfn;
>   unsigned index, pt_access, pte_access;
>   gpa_t pte_gpa;
> + int rsvd_fault;
>  
>   pgprintk("%s: addr %lx\n", __func__, addr);
>  walk:
> @@ -153,10 +154,13 @@ walk:
>      walker->level - 1, table_gfn);
>  
>    kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
> +  rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
>   

Does a not-present pte set PFERR_RSVD?

>  
>    if (!is_present_pte(pte))
>     goto not_present;
>  
> +  if (rsvd_fault)
> +   goto access_error;
>    if (write_fault && !is_writeble_pte(pte))
>     if (user_fault || is_write_protection(vcpu))
>      goto access_error;
> @@ -233,6 +237,8 @@ err:
>    walker->error_code |= PFERR_USER_MASK;
>   if (fetch_fault)
>    walker->error_code |= PFERR_FETCH_MASK;
> + if (rsvd_fault)
> +  walker->error_code |= PFERR_RSVD_MASK;
>   return 0;
>  }
>  
>
>
>
Dong, Eddie March 27, 2009, 1:46 p.m. UTC | #2
>> + context->rsvd_bits_mask[0] = rsvd_bits(maxphyaddr, 51);
>> + context->large_page_rsvd_mask =  /* 2MB PDE */
>> +  rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 20);
>>   return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);  }
>> 
> 
> Isn't bit 63 reserved if NX is disabled?

Sure.

> 
>> 
>> @@ -2206,6 +2258,18 @@ static int paging32_init_context(struct
>> kvm_vcpu *vcpu) 
>> 
>>  static int paging32E_init_context(struct kvm_vcpu *vcpu)  {
>> + struct kvm_mmu *context = &vcpu->arch.mmu;
>> + int maxphyaddr = cpuid_maxphyaddr(vcpu);
>> +
>> + /* 3 levels */
>> + context->rsvd_bits_mask[2] = rsvd_bits(maxphyaddr, 63) |
>> +  rsvd_bits(7, 8) | rsvd_bits(1,2); /* PDPTE */
>> 
> 
> Will never be used; PDPTEs are loaded by set_cr3(), not walk_addr().
> 

I see. Then how about replacing the CR3_PAE_RESERVED_BITS check at cr3 load with
rsvd_bits_mask[2]? The current code seems to lack sufficient reserved bits checks too.
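Something along these lines in load_pdptrs(), sketched only; the [2] index and the present-bit test are assumptions, not code from this patch:

        for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
                if (!(pdpte[i] & 1))    /* not present, nothing to check */
                        continue;
                if (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[2]) {
                        ret = 0;        /* reserved bit set, refuse the load */
                        goto out;
                }
        }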

>> @@ -153,10 +154,13 @@ walk:
>>      walker->level - 1, table_gfn);
>> 
>>    kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
>> +  rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
>> 
> 
> Does a not present pte set PFERR_RSVD?

Yes, though most commercial OSes don't use it.
I plan to post a follow-up patch to fix the potential RSVD fault error code mismatch when bypass_guest_pf=1.
Avi Kivity March 27, 2009, 2:28 p.m. UTC | #3
Dong, Eddie wrote:
>> Will never be used; PDPTEs are loaded by set_cr3(), not walk_addr().
>>
>>     
>
> I see. Then how about replacing the CR3_PAE_RESERVED_BITS check at cr3 load with
> rsvd_bits_mask[2]? The current code seems to lack sufficient reserved bits checks too.
>
>   

Need to make sure rsvd_bits_mask[] is maintained for EPT and NPT too, then.
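i.e., roughly (a sketch; placing this in the tdp/EPT/NPT context setup path is an assumption, and the helper name is hypothetical):

        /* In the tdp (EPT/NPT) context setup, fill the same masks that the
         * shadow-walk contexts fill, so a cr3-load check can use them even
         * when walk_addr() never runs. */
        reset_rsvds_bits_mask(vcpu);    /* hypothetical shared helper */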

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 55fd4c5..9370ff0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -261,6 +261,8 @@  struct kvm_mmu {
  union kvm_mmu_page_role base_role;
 
  u64 *pae_root;
+ u64 rsvd_bits_mask[4];
+ u64 large_page_rsvd_mask;
 };
 
 struct kvm_vcpu_arch {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 31ba3cb..7f55c4a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -127,6 +127,7 @@  module_param(oos_shadow, bool, 0644);
 #define PFERR_PRESENT_MASK (1U << 0)
 #define PFERR_WRITE_MASK (1U << 1)
 #define PFERR_USER_MASK (1U << 2)
+#define PFERR_RSVD_MASK (1U << 3)
 #define PFERR_FETCH_MASK (1U << 4)
 
 #define PT_DIRECTORY_LEVEL 2
@@ -179,6 +180,13 @@  static u64 __read_mostly shadow_user_mask;
 static u64 __read_mostly shadow_accessed_mask;
 static u64 __read_mostly shadow_dirty_mask;
 static u64 __read_mostly shadow_mt_mask;
+extern struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(
+ struct kvm_vcpu *vcpu, u32 function, u32 index);
+
+static inline u64 rsvd_bits(int s, int e)
+{
+ return ((1ULL << (e - s + 1)) - 1) << s;
+}
 
 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
 {
@@ -251,6 +259,18 @@  static int is_rmap_pte(u64 pte)
  return is_shadow_present_pte(pte);
 }
 
+static int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+ u32 function=0x80000008;
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, function, 0);
+ if (best) {
+  return best->eax & 0xff;
+ }
+ return 40;
+}
+
 static pfn_t spte_to_pfn(u64 pte)
 {
  return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -2156,6 +2176,17 @@  static void paging_free(struct kvm_vcpu *vcpu)
  nonpaging_free(vcpu);
 }
 
+static int is_rsvd_bits_set(struct kvm_vcpu *vcpu, unsigned long pte, int level)
+{
+ if (level == PT_DIRECTORY_LEVEL && (pte & PT_PAGE_SIZE_MASK)) {
+  /* large page */
+  return (pte & vcpu->arch.mmu.large_page_rsvd_mask) != 0;
+ }
+ else
+  /* 4K page */
+  return (pte & vcpu->arch.mmu.rsvd_bits_mask[level-1]) != 0;
+}
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE
@@ -2184,6 +2215,18 @@  static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 
 static int paging64_init_context(struct kvm_vcpu *vcpu)
 {
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+ int maxphyaddr = cpuid_maxphyaddr(vcpu);
+
+ context->rsvd_bits_mask[3] =
+  rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+ context->rsvd_bits_mask[2] =
+  rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+ context->rsvd_bits_mask[1] =
+  rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+ context->rsvd_bits_mask[0] = rsvd_bits(maxphyaddr, 51);
+ context->large_page_rsvd_mask =  /* 2MB PDE */
+  rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 20);
  return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
 }
 
@@ -2191,6 +2234,15 @@  static int paging32_init_context(struct kvm_vcpu *vcpu)
 {
  struct kvm_mmu *context = &vcpu->arch.mmu;
 
+ /* no rsvd bits for 2 level 4K page table entries */
+ context->rsvd_bits_mask[0] = 0;
+ context->rsvd_bits_mask[1] = 0;
+ if (is_cpuid_PSE36())
+  /* 36bits PSE 4MB page */
+  context->large_page_rsvd_mask = rsvd_bits(17, 21);
+ else
+  /* 32 bits PSE 4MB page */
+  context->large_page_rsvd_mask = rsvd_bits(13, 21);
  context->new_cr3 = paging_new_cr3;
  context->page_fault = paging32_page_fault;
  context->gva_to_gpa = paging32_gva_to_gpa;
@@ -2206,6 +2258,18 @@  static int paging32_init_context(struct kvm_vcpu *vcpu)
 
 static int paging32E_init_context(struct kvm_vcpu *vcpu)
 {
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+ int maxphyaddr = cpuid_maxphyaddr(vcpu);
+
+ /* 3 levels */
+ context->rsvd_bits_mask[2] = rsvd_bits(maxphyaddr, 63) |
+  rsvd_bits(7, 8) | rsvd_bits(1,2); /* PDPTE */
+ context->rsvd_bits_mask[1] = rsvd_bits(maxphyaddr, 63); /* PDE */
+ context->rsvd_bits_mask[0] =   /* PTE */
+  rsvd_bits(maxphyaddr, 63) | rsvd_bits(7, 8) | rsvd_bits(1, 2);
+ context->large_page_rsvd_mask =   /* 2M page */
+   rsvd_bits(maxphyaddr, 63) | rsvd_bits(13, 20);
+
  return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
 }
 
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 7314c09..844efe9 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -123,6 +123,7 @@  static int FNAME(walk_addr)(struct guest_walker *walker,
  gfn_t table_gfn;
  unsigned index, pt_access, pte_access;
  gpa_t pte_gpa;
+ int rsvd_fault;
 
  pgprintk("%s: addr %lx\n", __func__, addr);
 walk:
@@ -153,10 +154,13 @@  walk:
     walker->level - 1, table_gfn);
 
   kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
+  rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
 
   if (!is_present_pte(pte))
    goto not_present;
 
+  if (rsvd_fault)
+   goto access_error;
   if (write_fault && !is_writeble_pte(pte))
    if (user_fault || is_write_protection(vcpu))
     goto access_error;
@@ -233,6 +237,8 @@  err:
   walker->error_code |= PFERR_USER_MASK;
  if (fetch_fault)
   walker->error_code |= PFERR_FETCH_MASK;
+ if (rsvd_fault)
+  walker->error_code |= PFERR_RSVD_MASK;
  return 0;
 }