| Message ID | 9832F13BD22FB94A829F798DA4A8280501A2106E6A@pdsmsx503.ccr.corp.intel.com (mailing list archive) |
|---|---|
| State | New, archived |
Dong, Eddie wrote:
> Current KVM doesn't check reserved bits of guest page table entries,
> while it may use reserved bits to bypass guest #PF in VMX.  This patch
> adds this check, leaving the shadow pte unconstructed if the guest sets
> RSVD=1.
>
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -261,6 +261,8 @@ struct kvm_mmu {
>  	union kvm_mmu_page_role base_role;
>
>  	u64 *pae_root;
> +	u64 rsvd_bits_mask[4];
> +	u64 large_page_rsvd_mask;
>  };

Make large_page_rsvd_mask an array too, in preparation for 1GB pages?
Perhaps u64 rsvd_bits_mask[2][4]: first index is bit 7 of the pte,
second index is the level.  Makes for faster run time too.

>  #define PT_DIRECTORY_LEVEL 2
> @@ -179,6 +180,13 @@ static u64 __read_mostly shadow_user_mask;
>  static u64 __read_mostly shadow_accessed_mask;
>  static u64 __read_mostly shadow_dirty_mask;
>  static u64 __read_mostly shadow_mt_mask;
> +extern struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(
> +	struct kvm_vcpu *vcpu, u32 function, u32 index);

This needs to be in a header file, so we don't get random breakage when
the signature changes.

> +static int is_rsvd_bits_set(struct kvm_vcpu *vcpu, unsigned long pte, int level)

u64 pte... (and bool for the return type)

s/pte/gpte/ to make it clear.

> @@ -2184,6 +2215,18 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
>
>  static int paging64_init_context(struct kvm_vcpu *vcpu)
>  {
> +	struct kvm_mmu *context = &vcpu->arch.mmu;
> +	int maxphyaddr = cpuid_maxphyaddr(vcpu);
> +
> +	context->rsvd_bits_mask[3] =
> +		rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
> +	context->rsvd_bits_mask[2] =
> +		rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
> +	context->rsvd_bits_mask[1] =
> +		rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
> +	context->rsvd_bits_mask[0] = rsvd_bits(maxphyaddr, 51);
> +	context->large_page_rsvd_mask =	/* 2MB PDE */
> +		rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 20);
>  	return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
>  }

Isn't bit 63 reserved if NX is disabled?

> @@ -2206,6 +2258,18 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
>
>  static int paging32E_init_context(struct kvm_vcpu *vcpu)
>  {
> +	struct kvm_mmu *context = &vcpu->arch.mmu;
> +	int maxphyaddr = cpuid_maxphyaddr(vcpu);
> +
> +	/* 3 levels */
> +	context->rsvd_bits_mask[2] = rsvd_bits(maxphyaddr, 63) |
> +		rsvd_bits(7, 8) | rsvd_bits(1, 2);	/* PDPTE */

Will never be used; PDPTEs are loaded by set_cr3(), not walk_addr().

> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index 7314c09..844efe9 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -123,6 +123,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
>  	gfn_t table_gfn;
>  	unsigned index, pt_access, pte_access;
>  	gpa_t pte_gpa;
> +	int rsvd_fault;
>
>  	pgprintk("%s: addr %lx\n", __func__, addr);
> walk:
> @@ -153,10 +154,13 @@ walk:
>  			 walker->level - 1, table_gfn);
>
>  		kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
> +		rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);

Does a not-present pte set PFERR_RSVD?

>  		if (!is_present_pte(pte))
>  			goto not_present;
>
> +		if (rsvd_fault)
> +			goto access_error;
>  		if (write_fault && !is_writeble_pte(pte))
>  			if (user_fault || is_write_protection(vcpu))
>  				goto access_error;
> @@ -233,6 +237,8 @@ err:
>  		walker->error_code |= PFERR_USER_MASK;
>  	if (fetch_fault)
>  		walker->error_code |= PFERR_FETCH_MASK;
> +	if (rsvd_fault)
> +		walker->error_code |= PFERR_RSVD_MASK;
>  	return 0;
>  }
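A minimal sketch of the two-dimensional layout suggested above; the struct and helper names are illustrative only (not code from the patch), and kernel types (u64, bool) are assumed:

#include <linux/types.h>	/* u64, bool */

/*
 * Sketch only: one possible shape for the rsvd_bits_mask[2][4]
 * suggestion.  First index is bit 7 (PS) of the guest pte, second
 * index is the walk level, so 4K and large-page lookups share one
 * table and one load.
 */
struct mmu_rsvd_masks {
	u64 rsvd_bits_mask[2][4];
};

static inline bool gpte_rsvd_bits_set(const struct mmu_rsvd_masks *m,
				      u64 gpte, int level)
{
	int ps = (gpte >> 7) & 1;	/* the PT_PAGE_SIZE bit */

	return (gpte & m->rsvd_bits_mask[ps][level - 1]) != 0;
}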
>> +	context->rsvd_bits_mask[0] = rsvd_bits(maxphyaddr, 51);
>> +	context->large_page_rsvd_mask =	/* 2MB PDE */
>> +		rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 20);
>>  	return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
>>  }
>
> Isn't bit 63 reserved if NX is disabled?

Sure.

>> @@ -2206,6 +2258,18 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
>>
>>  static int paging32E_init_context(struct kvm_vcpu *vcpu)
>>  {
>> +	struct kvm_mmu *context = &vcpu->arch.mmu;
>> +	int maxphyaddr = cpuid_maxphyaddr(vcpu);
>> +
>> +	/* 3 levels */
>> +	context->rsvd_bits_mask[2] = rsvd_bits(maxphyaddr, 63) |
>> +		rsvd_bits(7, 8) | rsvd_bits(1, 2);	/* PDPTE */
>
> Will never be used; PDPTEs are loaded by set_cr3(), not walk_addr().

I see. Then how about replacing the CR3_PAE_RESERVED_BITS check at cr3
load with rsvd_bits_mask[2]?  The current code seems to lack sufficient
reserved bits checks there too.

>> @@ -153,10 +154,13 @@ walk:
>>  			 walker->level - 1, table_gfn);
>>
>>  		kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
>> +		rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
>
> Does a not-present pte set PFERR_RSVD?

Yes, though most commercial OSes don't use it.  I plan to post a
follow-up patch to fix the potential RSVD fault error code mismatch
when bypass_guest_pf=1.
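For what it's worth, a rough sketch of how the cr3-load PDPTE check could reuse that mask; the helper name and surrounding plumbing are assumptions, not the actual kvm_set_cr3()/load_pdptrs() code:

/*
 * Sketch only: reject a PAE cr3 load if any present PDPTE has bits set
 * in the PDPTE reserved-bit mask (rsvd_bits_mask[2] from the patch),
 * instead of comparing against a hard-coded CR3_PAE_RESERVED_BITS.
 * Helper name is hypothetical.
 */
static int pae_pdptrs_rsvd_bits_set(struct kvm_vcpu *vcpu,
				    const u64 pdpte[4])
{
	int i;

	for (i = 0; i < 4; i++) {
		if (!(pdpte[i] & 1))	/* not present: nothing to check */
			continue;
		if (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[2])
			return 1;	/* caller would fail the cr3 load */
	}
	return 0;
}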
Dong, Eddie wrote:
>> Will never be used; PDPTEs are loaded by set_cr3(), not walk_addr().
>
> I see. Then how about replacing the CR3_PAE_RESERVED_BITS check at cr3
> load with rsvd_bits_mask[2]?  The current code seems to lack sufficient
> reserved bits checks there too.

Need to make sure rsvd_bits_mask[] is maintained on ept and npt, then.
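A minimal sketch of what that could mean on the tdp (ept/npt) side; the helper name is hypothetical, and the mask value simply mirrors the PAE PDPTE mask from the patch:

/*
 * Sketch only: if the cr3-load check starts reading
 * vcpu->arch.mmu.rsvd_bits_mask[2], the tdp context setup would have to
 * fill in the guest-format mask as well, otherwise the check compares
 * against zero and never fires.
 */
static void tdp_init_guest_rsvd_masks(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *context = &vcpu->arch.mmu;
	int maxphyaddr = cpuid_maxphyaddr(vcpu);

	/* PAE PDPTE: bits above maxphyaddr plus bits 8:7 and 2:1 reserved */
	context->rsvd_bits_mask[2] = rsvd_bits(maxphyaddr, 63) |
		rsvd_bits(7, 8) | rsvd_bits(1, 2);
}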
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 55fd4c5..9370ff0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -261,6 +261,8 @@ struct kvm_mmu {
 	union kvm_mmu_page_role base_role;
 
 	u64 *pae_root;
+	u64 rsvd_bits_mask[4];
+	u64 large_page_rsvd_mask;
 };
 
 struct kvm_vcpu_arch {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 31ba3cb..7f55c4a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -127,6 +127,7 @@ module_param(oos_shadow, bool, 0644);
 #define PFERR_PRESENT_MASK (1U << 0)
 #define PFERR_WRITE_MASK (1U << 1)
 #define PFERR_USER_MASK (1U << 2)
+#define PFERR_RSVD_MASK (1U << 3)
 #define PFERR_FETCH_MASK (1U << 4)
 
 #define PT_DIRECTORY_LEVEL 2
@@ -179,6 +180,13 @@ static u64 __read_mostly shadow_user_mask;
 static u64 __read_mostly shadow_accessed_mask;
 static u64 __read_mostly shadow_dirty_mask;
 static u64 __read_mostly shadow_mt_mask;
+extern struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(
+	struct kvm_vcpu *vcpu, u32 function, u32 index);
+
+static inline u64 rsvd_bits(int s, int e)
+{
+	return ((1ULL << (e - s + 1)) - 1) << s;
+}
 
 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
 {
@@ -251,6 +259,18 @@ static int is_rmap_pte(u64 pte)
 	return is_shadow_present_pte(pte);
 }
 
+static int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+	u32 function = 0x80000008;
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, function, 0);
+	if (best) {
+		return best->eax & 0xff;
+	}
+	return 40;
+}
+
 static pfn_t spte_to_pfn(u64 pte)
 {
 	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -2156,6 +2176,17 @@ static void paging_free(struct kvm_vcpu *vcpu)
 	nonpaging_free(vcpu);
 }
 
+static int is_rsvd_bits_set(struct kvm_vcpu *vcpu, unsigned long pte, int level)
+{
+	if (level == PT_DIRECTORY_LEVEL && (pte & PT_PAGE_SIZE_MASK)) {
+		/* large page */
+		return (pte & vcpu->arch.mmu.large_page_rsvd_mask) != 0;
+	}
+	else
+		/* 4K page */
+		return (pte & vcpu->arch.mmu.rsvd_bits_mask[level-1]) != 0;
+}
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE
@@ -2184,6 +2215,18 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 
 static int paging64_init_context(struct kvm_vcpu *vcpu)
 {
+	struct kvm_mmu *context = &vcpu->arch.mmu;
+	int maxphyaddr = cpuid_maxphyaddr(vcpu);
+
+	context->rsvd_bits_mask[3] =
+		rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+	context->rsvd_bits_mask[2] =
+		rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+	context->rsvd_bits_mask[1] =
+		rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+	context->rsvd_bits_mask[0] = rsvd_bits(maxphyaddr, 51);
+	context->large_page_rsvd_mask =	/* 2MB PDE */
+		rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 20);
 	return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
 }
 
@@ -2191,6 +2234,15 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *context = &vcpu->arch.mmu;
 
+	/* no rsvd bits for 2 level 4K page table entries */
+	context->rsvd_bits_mask[0] = 0;
+	context->rsvd_bits_mask[1] = 0;
+	if (is_cpuid_PSE36())
+		/* 36bits PSE 4MB page */
+		context->large_page_rsvd_mask = rsvd_bits(17, 21);
+	else
+		/* 32 bits PSE 4MB page */
+		context->large_page_rsvd_mask = rsvd_bits(13, 21);
 	context->new_cr3 = paging_new_cr3;
 	context->page_fault = paging32_page_fault;
 	context->gva_to_gpa = paging32_gva_to_gpa;
@@ -2206,6 +2258,18 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
 
 static int paging32E_init_context(struct kvm_vcpu *vcpu)
 {
+	struct kvm_mmu *context = &vcpu->arch.mmu;
+	int maxphyaddr = cpuid_maxphyaddr(vcpu);
+
+	/* 3 levels */
+	context->rsvd_bits_mask[2] = rsvd_bits(maxphyaddr, 63) |
+		rsvd_bits(7, 8) | rsvd_bits(1, 2);	/* PDPTE */
+	context->rsvd_bits_mask[1] = rsvd_bits(maxphyaddr, 63);	/* PDE */
+	context->rsvd_bits_mask[0] =	/* PTE */
+		rsvd_bits(maxphyaddr, 63) | rsvd_bits(7, 8) | rsvd_bits(1, 2);
+	context->large_page_rsvd_mask =	/* 2M page */
+		rsvd_bits(maxphyaddr, 63) | rsvd_bits(13, 20);
+
 	return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
 }
 
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 7314c09..844efe9 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -123,6 +123,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
 	gfn_t table_gfn;
 	unsigned index, pt_access, pte_access;
 	gpa_t pte_gpa;
+	int rsvd_fault;
 
 	pgprintk("%s: addr %lx\n", __func__, addr);
 walk:
@@ -153,10 +154,13 @@ walk:
 			 walker->level - 1, table_gfn);
 
 		kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
+		rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
 
 		if (!is_present_pte(pte))
 			goto not_present;
 
+		if (rsvd_fault)
+			goto access_error;
 		if (write_fault && !is_writeble_pte(pte))
 			if (user_fault || is_write_protection(vcpu))
 				goto access_error;
@@ -233,6 +237,8 @@ err:
 		walker->error_code |= PFERR_USER_MASK;
 	if (fetch_fault)
 		walker->error_code |= PFERR_FETCH_MASK;
+	if (rsvd_fault)
+		walker->error_code |= PFERR_RSVD_MASK;
 	return 0;
 }
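As a quick standalone sanity check of the rsvd_bits() helper above (the MAXPHYADDR value is only an example, not taken from the patch):

#include <stdio.h>
#include <stdint.h>

/* same formula as the patch's rsvd_bits(), usable outside the kernel */
static inline uint64_t rsvd_bits(int s, int e)
{
	return ((1ULL << (e - s + 1)) - 1) << s;
}

int main(void)
{
	int maxphyaddr = 36;	/* e.g. CPUID 0x80000008 reporting 36 bits */

	/* 64-bit 4K PTE: bits 51:36 reserved -> 0x000ffff000000000 */
	printf("pte mask: %#018llx\n",
	       (unsigned long long)rsvd_bits(maxphyaddr, 51));

	/* 2MB PDE additionally reserves bits 20:13 (0x1fe000) */
	printf("2MB mask: %#018llx\n",
	       (unsigned long long)(rsvd_bits(maxphyaddr, 51) |
				    rsvd_bits(13, 20)));
	return 0;
}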