Message ID | 1368939152-11406-3-git-send-email-jun.nakajima@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 05/19/2013 12:52 PM, Jun Nakajima wrote: > From: Nadav Har'El <nyh@il.ibm.com> > > This is the first patch in a series which adds nested EPT support to KVM's > nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use > EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest > to set its own cr3 and take its own page faults without either of L0 or L1 > getting involved. This often significanlty improves L2's performance over the > previous two alternatives (shadow page tables over EPT, and shadow page > tables over shadow page tables). > > This patch adds EPT support to paging_tmpl.h. > > paging_tmpl.h contains the code for reading and writing page tables. The code > for 32-bit and 64-bit tables is very similar, but not identical, so > paging_tmpl.h is #include'd twice in mmu.c, once with PTTTYPE=32 and once > with PTTYPE=64, and this generates the two sets of similar functions. > > There are subtle but important differences between the format of EPT tables > and that of ordinary x86 64-bit page tables, so for nested EPT we need a > third set of functions to read the guest EPT table and to write the shadow > EPT table. > > So this patch adds third PTTYPE, PTTYPE_EPT, which creates functions (prefixed > with "EPT") which correctly read and write EPT tables. 
> > Signed-off-by: Nadav Har'El <nyh@il.ibm.com> > Signed-off-by: Jun Nakajima <jun.nakajima@intel.com> > Signed-off-by: Xinhao Xu <xinhao.xu@intel.com> > --- > arch/x86/kvm/mmu.c | 5 +++++ > arch/x86/kvm/paging_tmpl.h | 43 +++++++++++++++++++++++++++++++++++++++++-- > 2 files changed, 46 insertions(+), 2 deletions(-) > > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c > index 117233f..6c1670f 100644 > --- a/arch/x86/kvm/mmu.c > +++ b/arch/x86/kvm/mmu.c > @@ -3397,6 +3397,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp > return mmu->last_pte_bitmap & (1 << index); > } > > +#define PTTYPE_EPT 18 /* arbitrary */ > +#define PTTYPE PTTYPE_EPT > +#include "paging_tmpl.h" > +#undef PTTYPE > + > #define PTTYPE 64 > #include "paging_tmpl.h" > #undef PTTYPE > diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h > index df34d4a..4c45654 100644 > --- a/arch/x86/kvm/paging_tmpl.h > +++ b/arch/x86/kvm/paging_tmpl.h > @@ -50,6 +50,22 @@ > #define PT_LEVEL_BITS PT32_LEVEL_BITS > #define PT_MAX_FULL_LEVELS 2 > #define CMPXCHG cmpxchg > +#elif PTTYPE == PTTYPE_EPT > + #define pt_element_t u64 > + #define guest_walker guest_walkerEPT > + #define FNAME(name) EPT_##name > + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK > + #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) > + #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) > + #define PT_INDEX(addr, level) PT64_INDEX(addr, level) > + #define PT_LEVEL_BITS PT64_LEVEL_BITS > + #ifdef CONFIG_X86_64 > + #define PT_MAX_FULL_LEVELS 4 > + #define CMPXCHG cmpxchg > + #else > + #define CMPXCHG cmpxchg64 CMPXHG is only used in FNAME(cmpxchg_gpte), but you commented it later. Do we really need it? > + #define PT_MAX_FULL_LEVELS 2 And the SDM says: "It uses a page-walk length of 4, meaning that at most 4 EPT paging-structure entriesare accessed to translate a guest-physical address.", Is My SDM obsolete? Which kind of process supports page-walk length = 2? 
It seems your patch is not able to handle the case that the guest uses walk-lenght = 2 which is running on the host with walk-lenght = 4. (plrease refer to how to handle sp->role.quadrant in FNAME(get_level1_sp_gpa) in the current code.) > + #endif > #else > #error Invalid PTTYPE value > #endif > @@ -80,6 +96,10 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) > return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; > } > > +#if PTTYPE != PTTYPE_EPT > +/* > + * Comment out this for EPT because update_accessed_dirty_bits() is not used. > + */ > static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, > pt_element_t __user *ptep_user, unsigned index, > pt_element_t orig_pte, pt_element_t new_pte) > @@ -102,6 +122,7 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, > > return (ret != orig_pte); > } > +#endif > > static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, > struct kvm_mmu_page *sp, u64 *spte, > @@ -126,13 +147,21 @@ no_present: > static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) > { > unsigned access; > - > +#if PTTYPE == PTTYPE_EPT > + access = (gpte & (VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK | > + VMX_EPT_EXECUTABLE_MASK)); It seems wrong. The ACC_XXX definition: #define ACC_EXEC_MASK 1 #define ACC_WRITE_MASK PT_WRITABLE_MASK #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) The bits are different with the bits used in EPT page table, for example, your code always see that the execution is not allowed. > +#else > access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; > access &= ~(gpte >> PT64_NX_SHIFT); > +#endif > > return access; > } > > +#if PTTYPE != PTTYPE_EPT > +/* > + * EPT A/D bit support is not implemented. 
> + */ > static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, > struct kvm_mmu *mmu, > struct guest_walker *walker, > @@ -169,6 +198,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, > } > return 0; > } > +#endif > > /* > * Fetch a guest pte for a guest virtual address > @@ -177,7 +207,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, > struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, > gva_t addr, u32 access) > { > - int ret; > pt_element_t pte; > pt_element_t __user *uninitialized_var(ptep_user); > gfn_t table_gfn; > @@ -192,7 +221,9 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, > gfn_t gfn; > > trace_kvm_mmu_pagetable_walk(addr, access); > +#if PTTYPE != PTTYPE_EPT > retry_walk: > +#endif > walker->level = mmu->root_level; > pte = mmu->get_cr3(vcpu); > > @@ -277,6 +308,7 @@ retry_walk: > > walker->gfn = real_gpa >> PAGE_SHIFT; > > +#if PTTYPE != PTTYPE_EPT > if (!write_fault) > protect_clean_gpte(&pte_access, pte); > else > @@ -287,12 +319,15 @@ retry_walk: > accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT); > > if (unlikely(!accessed_dirty)) { > + int ret; > + > ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); > if (unlikely(ret < 0)) > goto error; > else if (ret) > goto retry_walk; > } > +#endif There are lots of code in paging_tmpl.h depends on PT_ACCESSED_MASK/PT_DIRTY_MASK. I do not see other parts are adjusted in your patch. How about redefine PT_ACCESSED_MASK / PT_DIRTY_MASK, something like: #if PTTYPE == 32 PT_ACCESS = PT_ACCESSED_MASK; ...... #elif PTTYPE == 64 PT_ACCESS = PT_ACCESSED_MASK; ...... #elif PTTYPE == PTTYPE_EPT PT_ACCESS = 0 #else ....... I guess the compiler can drop the unnecessary branch when PT_ACCESS == 0. 
Also, it can help us to remove the untidy "#if PTTYPE != PTTYPE_EPT" > > walker->pt_access = pt_access; > walker->pte_access = pte_access; > @@ -323,6 +358,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker, > access); > } > > +#if PTTYPE != PTTYPE_EPT > static int FNAME(walk_addr_nested)(struct guest_walker *walker, > struct kvm_vcpu *vcpu, gva_t addr, > u32 access) > @@ -330,6 +366,7 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker, > return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, > addr, access); > } > +#endif > > static bool > FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, > @@ -754,6 +791,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, > return gpa; > } > > +#if PTTYPE != PTTYPE_EPT > static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, > u32 access, > struct x86_exception *exception) > @@ -772,6 +810,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, > > return gpa; > } > +#endif Strange! Why does nested EPT not need these functions? How to emulate the instruction faulted on L2? -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 05/21/2013 03:52 PM, Xiao Guangrong wrote: > On 05/19/2013 12:52 PM, Jun Nakajima wrote: >> From: Nadav Har'El <nyh@il.ibm.com> >> >> This is the first patch in a series which adds nested EPT support to KVM's >> nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use >> EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest >> to set its own cr3 and take its own page faults without either of L0 or L1 >> getting involved. This often significanlty improves L2's performance over the >> previous two alternatives (shadow page tables over EPT, and shadow page >> tables over shadow page tables). >> >> This patch adds EPT support to paging_tmpl.h. >> >> paging_tmpl.h contains the code for reading and writing page tables. The code >> for 32-bit and 64-bit tables is very similar, but not identical, so >> paging_tmpl.h is #include'd twice in mmu.c, once with PTTTYPE=32 and once >> with PTTYPE=64, and this generates the two sets of similar functions. >> >> There are subtle but important differences between the format of EPT tables >> and that of ordinary x86 64-bit page tables, so for nested EPT we need a >> third set of functions to read the guest EPT table and to write the shadow >> EPT table. >> >> So this patch adds third PTTYPE, PTTYPE_EPT, which creates functions (prefixed >> with "EPT") which correctly read and write EPT tables. 
>> >> Signed-off-by: Nadav Har'El <nyh@il.ibm.com> >> Signed-off-by: Jun Nakajima <jun.nakajima@intel.com> >> Signed-off-by: Xinhao Xu <xinhao.xu@intel.com> >> --- >> arch/x86/kvm/mmu.c | 5 +++++ >> arch/x86/kvm/paging_tmpl.h | 43 +++++++++++++++++++++++++++++++++++++++++-- >> 2 files changed, 46 insertions(+), 2 deletions(-) >> >> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c >> index 117233f..6c1670f 100644 >> --- a/arch/x86/kvm/mmu.c >> +++ b/arch/x86/kvm/mmu.c >> @@ -3397,6 +3397,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp >> return mmu->last_pte_bitmap & (1 << index); >> } >> >> +#define PTTYPE_EPT 18 /* arbitrary */ >> +#define PTTYPE PTTYPE_EPT >> +#include "paging_tmpl.h" >> +#undef PTTYPE >> + >> #define PTTYPE 64 >> #include "paging_tmpl.h" >> #undef PTTYPE >> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h >> index df34d4a..4c45654 100644 >> --- a/arch/x86/kvm/paging_tmpl.h >> +++ b/arch/x86/kvm/paging_tmpl.h >> @@ -50,6 +50,22 @@ >> #define PT_LEVEL_BITS PT32_LEVEL_BITS >> #define PT_MAX_FULL_LEVELS 2 >> #define CMPXCHG cmpxchg >> +#elif PTTYPE == PTTYPE_EPT >> + #define pt_element_t u64 >> + #define guest_walker guest_walkerEPT >> + #define FNAME(name) EPT_##name >> + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK >> + #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) >> + #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) >> + #define PT_INDEX(addr, level) PT64_INDEX(addr, level) >> + #define PT_LEVEL_BITS PT64_LEVEL_BITS >> + #ifdef CONFIG_X86_64 >> + #define PT_MAX_FULL_LEVELS 4 >> + #define CMPXCHG cmpxchg >> + #else >> + #define CMPXCHG cmpxchg64 > > CMPXHG is only used in FNAME(cmpxchg_gpte), but you commented it later. > Do we really need it? > >> + #define PT_MAX_FULL_LEVELS 2 > > And the SDM says: > > "It uses a page-walk length of 4, meaning that at most 4 EPT paging-structure > entriesare accessed to translate a guest-physical address.", Is My SDM obsolete? 
> Which kind of process supports page-walk length = 2? > > It seems your patch is not able to handle the case that the guest uses walk-lenght = 2 > which is running on the host with walk-lenght = 4. > (plrease refer to how to handle sp->role.quadrant in FNAME(get_level1_sp_gpa) in > the current code.) > >> + #endif >> #else >> #error Invalid PTTYPE value >> #endif >> @@ -80,6 +96,10 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) >> return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; >> } >> >> +#if PTTYPE != PTTYPE_EPT >> +/* >> + * Comment out this for EPT because update_accessed_dirty_bits() is not used. >> + */ >> static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, >> pt_element_t __user *ptep_user, unsigned index, >> pt_element_t orig_pte, pt_element_t new_pte) >> @@ -102,6 +122,7 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, >> >> return (ret != orig_pte); >> } >> +#endif >> >> static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, >> struct kvm_mmu_page *sp, u64 *spte, >> @@ -126,13 +147,21 @@ no_present: >> static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) >> { >> unsigned access; >> - >> +#if PTTYPE == PTTYPE_EPT >> + access = (gpte & (VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK | >> + VMX_EPT_EXECUTABLE_MASK)); > > It seems wrong. The ACC_XXX definition: > > #define ACC_EXEC_MASK 1 > #define ACC_WRITE_MASK PT_WRITABLE_MASK > #define ACC_USER_MASK PT_USER_MASK > #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) > > The bits are different with the bits used in EPT page table, for example, > your code always see that the execution is not allowed. > >> +#else >> access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; >> access &= ~(gpte >> PT64_NX_SHIFT); >> +#endif >> >> return access; >> } >> >> +#if PTTYPE != PTTYPE_EPT >> +/* >> + * EPT A/D bit support is not implemented. 
>> + */ >> static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, >> struct kvm_mmu *mmu, >> struct guest_walker *walker, >> @@ -169,6 +198,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, >> } >> return 0; >> } >> +#endif >> >> /* >> * Fetch a guest pte for a guest virtual address >> @@ -177,7 +207,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, >> struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, >> gva_t addr, u32 access) >> { >> - int ret; >> pt_element_t pte; >> pt_element_t __user *uninitialized_var(ptep_user); >> gfn_t table_gfn; >> @@ -192,7 +221,9 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, >> gfn_t gfn; >> >> trace_kvm_mmu_pagetable_walk(addr, access); >> +#if PTTYPE != PTTYPE_EPT >> retry_walk: >> +#endif >> walker->level = mmu->root_level; >> pte = mmu->get_cr3(vcpu); >> >> @@ -277,6 +308,7 @@ retry_walk: >> >> walker->gfn = real_gpa >> PAGE_SHIFT; >> >> +#if PTTYPE != PTTYPE_EPT >> if (!write_fault) >> protect_clean_gpte(&pte_access, pte); >> else >> @@ -287,12 +319,15 @@ retry_walk: >> accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT); >> >> if (unlikely(!accessed_dirty)) { >> + int ret; >> + >> ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); >> if (unlikely(ret < 0)) >> goto error; >> else if (ret) >> goto retry_walk; >> } >> +#endif > > There are lots of code in paging_tmpl.h depends on PT_ACCESSED_MASK/PT_DIRTY_MASK. > I do not see other parts are adjusted in your patch. > > How about redefine PT_ACCESSED_MASK / PT_DIRTY_MASK, something like: > > #if PTTYPE == 32 > PT_ACCESS = PT_ACCESSED_MASK; > ...... > #elif PTTYPE == 64 > PT_ACCESS = PT_ACCESSED_MASK; > ...... > #elif PTTYPE == PTTYPE_EPT > PT_ACCESS = 0 > #else > ....... > > I guess the compiler can drop the unnecessary branch when PT_ACCESS == 0. 
> Also, it can help use to remove the untidy "#if PTTYPE != PTTYPE_EPT" > >> >> walker->pt_access = pt_access; >> walker->pte_access = pte_access; >> @@ -323,6 +358,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker, >> access); >> } >> >> +#if PTTYPE != PTTYPE_EPT >> static int FNAME(walk_addr_nested)(struct guest_walker *walker, >> struct kvm_vcpu *vcpu, gva_t addr, >> u32 access) >> @@ -330,6 +366,7 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker, >> return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, >> addr, access); >> } >> +#endif >> >> static bool >> FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, >> @@ -754,6 +791,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, >> return gpa; >> } >> >> +#if PTTYPE != PTTYPE_EPT >> static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, >> u32 access, >> struct x86_exception *exception) >> @@ -772,6 +810,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, >> >> return gpa; >> } >> +#endif > > Strange! > > Why does nested ept not need these functions? How to emulate the instruction faulted on L2? Sorry, i misunderstood it. Have found the reason out. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, May 21, 2013 at 04:30:13PM +0800, Xiao Guangrong wrote: > >> @@ -772,6 +810,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, > >> > >> return gpa; > >> } > >> +#endif > > > > Strange! > > > > Why does nested ept not need these functions? How to emulate the instruction faulted on L2? > > Sorry, i misunderstood it. Have found the reason out. > You can write it down here for future reviewers :) -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 05/21/2013 05:01 PM, Gleb Natapov wrote: > On Tue, May 21, 2013 at 04:30:13PM +0800, Xiao Guangrong wrote: >>>> @@ -772,6 +810,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, >>>> >>>> return gpa; >>>> } >>>> +#endif >>> >>> Strange! >>> >>> Why does nested ept not need these functions? How to emulate the instruction faulted on L2? >> >> Sorry, i misunderstood it. Have found the reason out. >> > You can write it down here for future reviewers :) Okay. The functions used to translate L2's gva to L1's gpa are paging32_gva_to_gpa_nested and paging64_gva_to_gpa_nested which are created by PTTYPE == 32 and PTTYPE == 64. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, May 21, 2013 at 4:05 AM, Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com> wrote: > On 05/21/2013 05:01 PM, Gleb Natapov wrote: >> On Tue, May 21, 2013 at 04:30:13PM +0800, Xiao Guangrong wrote: >>>>> @@ -772,6 +810,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, >>>>> >>>>> return gpa; >>>>> } >>>>> +#endif >>>> >>>> Strange! >>>> >>>> Why does nested ept not need these functions? How to emulate the instruction faulted on L2? >>> >>> Sorry, i misunderstood it. Have found the reason out. >>> >> You can write it down here for future reviewers :) > > Okay. > > The functions used to translate L2's gva to L1's gpa are paging32_gva_to_gpa_nested > and paging64_gva_to_gpa_nested which are created by PTTYPE == 32 and PTTYPE == 64. > > Back to your comments on PT_MAX_FULL_LEVELS: > + #ifdef CONFIG_X86_64 > + #define PT_MAX_FULL_LEVELS 4 > + #define CMPXCHG cmpxchg > + #else > + #define CMPXCHG cmpxchg64 > + #define PT_MAX_FULL_LEVELS 2 I don't think we need to support nEPT on 32-bit hosts. So, I plan to remove such code. What do you think? -- Jun Intel Open Source Technology Center -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 05/22/2013 06:26 AM, Nakajima, Jun wrote: > On Tue, May 21, 2013 at 4:05 AM, Xiao Guangrong > <xiaoguangrong@linux.vnet.ibm.com> wrote: >> On 05/21/2013 05:01 PM, Gleb Natapov wrote: >>> On Tue, May 21, 2013 at 04:30:13PM +0800, Xiao Guangrong wrote: >>>>>> @@ -772,6 +810,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, >>>>>> >>>>>> return gpa; >>>>>> } >>>>>> +#endif >>>>> >>>>> Strange! >>>>> >>>>> Why does nested ept not need these functions? How to emulate the instruction faulted on L2? >>>> >>>> Sorry, i misunderstood it. Have found the reason out. >>>> >>> You can write it down here for future reviewers :) >> >> Okay. >> >> The functions used to translate L2's gva to L1's gpa are paging32_gva_to_gpa_nested >> and paging64_gva_to_gpa_nested which are created by PTTYPE == 32 and PTTYPE == 64. >> >> > > Back to your comments on PT_MAX_FULL_LEVELS: >> + #ifdef CONFIG_X86_64 >> + #define PT_MAX_FULL_LEVELS 4 >> + #define CMPXCHG cmpxchg >> + #else >> + #define CMPXCHG cmpxchg64 >> + #define PT_MAX_FULL_LEVELS 2 > I don't think we need to support nEPT on 32-bit hosts. So, I plan to > remove such code. What do you think? Good to me. :) -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, May 21, 2013 at 03:26:18PM -0700, Nakajima, Jun wrote: > On Tue, May 21, 2013 at 4:05 AM, Xiao Guangrong > <xiaoguangrong@linux.vnet.ibm.com> wrote: > > On 05/21/2013 05:01 PM, Gleb Natapov wrote: > >> On Tue, May 21, 2013 at 04:30:13PM +0800, Xiao Guangrong wrote: > >>>>> @@ -772,6 +810,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, > >>>>> > >>>>> return gpa; > >>>>> } > >>>>> +#endif > >>>> > >>>> Strange! > >>>> > >>>> Why does nested ept not need these functions? How to emulate the instruction faulted on L2? > >>> > >>> Sorry, i misunderstood it. Have found the reason out. > >>> > >> You can write it down here for future reviewers :) > > > > Okay. > > > > The functions used to translate L2's gva to L1's gpa are paging32_gva_to_gpa_nested > > and paging64_gva_to_gpa_nested which are created by PTTYPE == 32 and PTTYPE == 64. > > > > > > Back to your comments on PT_MAX_FULL_LEVELS: > > + #ifdef CONFIG_X86_64 > > + #define PT_MAX_FULL_LEVELS 4 > > + #define CMPXCHG cmpxchg > > + #else > > + #define CMPXCHG cmpxchg64 > > + #define PT_MAX_FULL_LEVELS 2 > I don't think we need to support nEPT on 32-bit hosts. So, I plan to > remove such code. What do you think? > Why shouldn't we support nEPT on 32-bit hosts? -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, May 21, 2013 at 03:52:12PM +0800, Xiao Guangrong wrote: > On 05/19/2013 12:52 PM, Jun Nakajima wrote: > > From: Nadav Har'El <nyh@il.ibm.com> > > > > This is the first patch in a series which adds nested EPT support to KVM's > > nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use > > EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest > > to set its own cr3 and take its own page faults without either of L0 or L1 > > getting involved. This often significanlty improves L2's performance over the > > previous two alternatives (shadow page tables over EPT, and shadow page > > tables over shadow page tables). > > > > This patch adds EPT support to paging_tmpl.h. > > > > paging_tmpl.h contains the code for reading and writing page tables. The code > > for 32-bit and 64-bit tables is very similar, but not identical, so > > paging_tmpl.h is #include'd twice in mmu.c, once with PTTTYPE=32 and once > > with PTTYPE=64, and this generates the two sets of similar functions. > > > > There are subtle but important differences between the format of EPT tables > > and that of ordinary x86 64-bit page tables, so for nested EPT we need a > > third set of functions to read the guest EPT table and to write the shadow > > EPT table. > > > > So this patch adds third PTTYPE, PTTYPE_EPT, which creates functions (prefixed > > with "EPT") which correctly read and write EPT tables. 
> > > > Signed-off-by: Nadav Har'El <nyh@il.ibm.com> > > Signed-off-by: Jun Nakajima <jun.nakajima@intel.com> > > Signed-off-by: Xinhao Xu <xinhao.xu@intel.com> > > --- > > arch/x86/kvm/mmu.c | 5 +++++ > > arch/x86/kvm/paging_tmpl.h | 43 +++++++++++++++++++++++++++++++++++++++++-- > > 2 files changed, 46 insertions(+), 2 deletions(-) > > > > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c > > index 117233f..6c1670f 100644 > > --- a/arch/x86/kvm/mmu.c > > +++ b/arch/x86/kvm/mmu.c > > @@ -3397,6 +3397,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp > > return mmu->last_pte_bitmap & (1 << index); > > } > > > > +#define PTTYPE_EPT 18 /* arbitrary */ > > +#define PTTYPE PTTYPE_EPT > > +#include "paging_tmpl.h" > > +#undef PTTYPE > > + > > #define PTTYPE 64 > > #include "paging_tmpl.h" > > #undef PTTYPE > > diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h > > index df34d4a..4c45654 100644 > > --- a/arch/x86/kvm/paging_tmpl.h > > +++ b/arch/x86/kvm/paging_tmpl.h > > @@ -50,6 +50,22 @@ > > #define PT_LEVEL_BITS PT32_LEVEL_BITS > > #define PT_MAX_FULL_LEVELS 2 > > #define CMPXCHG cmpxchg > > +#elif PTTYPE == PTTYPE_EPT > > + #define pt_element_t u64 > > + #define guest_walker guest_walkerEPT > > + #define FNAME(name) EPT_##name > > + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK > > + #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) > > + #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) > > + #define PT_INDEX(addr, level) PT64_INDEX(addr, level) > > + #define PT_LEVEL_BITS PT64_LEVEL_BITS > > + #ifdef CONFIG_X86_64 > > + #define PT_MAX_FULL_LEVELS 4 > > + #define CMPXCHG cmpxchg > > + #else > > + #define CMPXCHG cmpxchg64 > > CMPXHG is only used in FNAME(cmpxchg_gpte), but you commented it later. > Do we really need it? 
> > > + #define PT_MAX_FULL_LEVELS 2 > > And the SDM says: > > "It uses a page-walk length of 4, meaning that at most 4 EPT paging-structure > entriesare accessed to translate a guest-physical address.", Is My SDM obsolete? > Which kind of process supports page-walk length = 2? > > It seems your patch is not able to handle the case that the guest uses walk-lenght = 2 > which is running on the host with walk-lenght = 4. > (plrease refer to how to handle sp->role.quadrant in FNAME(get_level1_sp_gpa) in > the current code.) > But since EPT always has 4 levels on all existing cpus it is not an issue and the only case that we should worry about is guest walk-lenght == host walk-lenght == 4, or have I misunderstood what you mean here? -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/11/2013 07:32 PM, Gleb Natapov wrote: > On Tue, May 21, 2013 at 03:52:12PM +0800, Xiao Guangrong wrote: >> On 05/19/2013 12:52 PM, Jun Nakajima wrote: >>> From: Nadav Har'El <nyh@il.ibm.com> >>> >>> This is the first patch in a series which adds nested EPT support to KVM's >>> nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use >>> EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest >>> to set its own cr3 and take its own page faults without either of L0 or L1 >>> getting involved. This often significanlty improves L2's performance over the >>> previous two alternatives (shadow page tables over EPT, and shadow page >>> tables over shadow page tables). >>> >>> This patch adds EPT support to paging_tmpl.h. >>> >>> paging_tmpl.h contains the code for reading and writing page tables. The code >>> for 32-bit and 64-bit tables is very similar, but not identical, so >>> paging_tmpl.h is #include'd twice in mmu.c, once with PTTTYPE=32 and once >>> with PTTYPE=64, and this generates the two sets of similar functions. >>> >>> There are subtle but important differences between the format of EPT tables >>> and that of ordinary x86 64-bit page tables, so for nested EPT we need a >>> third set of functions to read the guest EPT table and to write the shadow >>> EPT table. >>> >>> So this patch adds third PTTYPE, PTTYPE_EPT, which creates functions (prefixed >>> with "EPT") which correctly read and write EPT tables. 
>>> >>> Signed-off-by: Nadav Har'El <nyh@il.ibm.com> >>> Signed-off-by: Jun Nakajima <jun.nakajima@intel.com> >>> Signed-off-by: Xinhao Xu <xinhao.xu@intel.com> >>> --- >>> arch/x86/kvm/mmu.c | 5 +++++ >>> arch/x86/kvm/paging_tmpl.h | 43 +++++++++++++++++++++++++++++++++++++++++-- >>> 2 files changed, 46 insertions(+), 2 deletions(-) >>> >>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c >>> index 117233f..6c1670f 100644 >>> --- a/arch/x86/kvm/mmu.c >>> +++ b/arch/x86/kvm/mmu.c >>> @@ -3397,6 +3397,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp >>> return mmu->last_pte_bitmap & (1 << index); >>> } >>> >>> +#define PTTYPE_EPT 18 /* arbitrary */ >>> +#define PTTYPE PTTYPE_EPT >>> +#include "paging_tmpl.h" >>> +#undef PTTYPE >>> + >>> #define PTTYPE 64 >>> #include "paging_tmpl.h" >>> #undef PTTYPE >>> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h >>> index df34d4a..4c45654 100644 >>> --- a/arch/x86/kvm/paging_tmpl.h >>> +++ b/arch/x86/kvm/paging_tmpl.h >>> @@ -50,6 +50,22 @@ >>> #define PT_LEVEL_BITS PT32_LEVEL_BITS >>> #define PT_MAX_FULL_LEVELS 2 >>> #define CMPXCHG cmpxchg >>> +#elif PTTYPE == PTTYPE_EPT >>> + #define pt_element_t u64 >>> + #define guest_walker guest_walkerEPT >>> + #define FNAME(name) EPT_##name >>> + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK >>> + #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) >>> + #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) >>> + #define PT_INDEX(addr, level) PT64_INDEX(addr, level) >>> + #define PT_LEVEL_BITS PT64_LEVEL_BITS >>> + #ifdef CONFIG_X86_64 >>> + #define PT_MAX_FULL_LEVELS 4 >>> + #define CMPXCHG cmpxchg >>> + #else >>> + #define CMPXCHG cmpxchg64 >> >> CMPXHG is only used in FNAME(cmpxchg_gpte), but you commented it later. >> Do we really need it? 
>> >>> + #define PT_MAX_FULL_LEVELS 2 >> >> And the SDM says: >> >> "It uses a page-walk length of 4, meaning that at most 4 EPT paging-structure >> entries are accessed to translate a guest-physical address.", Is My SDM obsolete? >> Which kind of process supports page-walk length = 2? >> >> It seems your patch is not able to handle the case that the guest uses walk-length = 2 >> which is running on the host with walk-length = 4. >> (please refer to how to handle sp->role.quadrant in FNAME(get_level1_sp_gpa) in >> the current code.) >> > But since EPT always has 4 levels on all existing cpus it is not an issue and the only case > that we should worry about is guest walk-length == host walk-length == 4, or have I Yes. I totally agree with you, but... > misunderstood what you mean here? What confused me is that this patch defines "#define PT_MAX_FULL_LEVELS 2", so I asked the question: "Which kind of process supports page-walk length = 2". Sorry, there is a typo in my original comments. "process" should be "processor" or "CPU". -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Jun 17, 2013 at 08:11:03PM +0800, Xiao Guangrong wrote: > On 06/11/2013 07:32 PM, Gleb Natapov wrote: > > On Tue, May 21, 2013 at 03:52:12PM +0800, Xiao Guangrong wrote: > >> On 05/19/2013 12:52 PM, Jun Nakajima wrote: > >>> From: Nadav Har'El <nyh@il.ibm.com> > >>> > >>> This is the first patch in a series which adds nested EPT support to KVM's > >>> nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use > >>> EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest > >>> to set its own cr3 and take its own page faults without either of L0 or L1 > >>> getting involved. This often significanlty improves L2's performance over the > >>> previous two alternatives (shadow page tables over EPT, and shadow page > >>> tables over shadow page tables). > >>> > >>> This patch adds EPT support to paging_tmpl.h. > >>> > >>> paging_tmpl.h contains the code for reading and writing page tables. The code > >>> for 32-bit and 64-bit tables is very similar, but not identical, so > >>> paging_tmpl.h is #include'd twice in mmu.c, once with PTTTYPE=32 and once > >>> with PTTYPE=64, and this generates the two sets of similar functions. > >>> > >>> There are subtle but important differences between the format of EPT tables > >>> and that of ordinary x86 64-bit page tables, so for nested EPT we need a > >>> third set of functions to read the guest EPT table and to write the shadow > >>> EPT table. > >>> > >>> So this patch adds third PTTYPE, PTTYPE_EPT, which creates functions (prefixed > >>> with "EPT") which correctly read and write EPT tables. 
> >>> > >>> Signed-off-by: Nadav Har'El <nyh@il.ibm.com> > >>> Signed-off-by: Jun Nakajima <jun.nakajima@intel.com> > >>> Signed-off-by: Xinhao Xu <xinhao.xu@intel.com> > >>> --- > >>> arch/x86/kvm/mmu.c | 5 +++++ > >>> arch/x86/kvm/paging_tmpl.h | 43 +++++++++++++++++++++++++++++++++++++++++-- > >>> 2 files changed, 46 insertions(+), 2 deletions(-) > >>> > >>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c > >>> index 117233f..6c1670f 100644 > >>> --- a/arch/x86/kvm/mmu.c > >>> +++ b/arch/x86/kvm/mmu.c > >>> @@ -3397,6 +3397,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp > >>> return mmu->last_pte_bitmap & (1 << index); > >>> } > >>> > >>> +#define PTTYPE_EPT 18 /* arbitrary */ > >>> +#define PTTYPE PTTYPE_EPT > >>> +#include "paging_tmpl.h" > >>> +#undef PTTYPE > >>> + > >>> #define PTTYPE 64 > >>> #include "paging_tmpl.h" > >>> #undef PTTYPE > >>> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h > >>> index df34d4a..4c45654 100644 > >>> --- a/arch/x86/kvm/paging_tmpl.h > >>> +++ b/arch/x86/kvm/paging_tmpl.h > >>> @@ -50,6 +50,22 @@ > >>> #define PT_LEVEL_BITS PT32_LEVEL_BITS > >>> #define PT_MAX_FULL_LEVELS 2 > >>> #define CMPXCHG cmpxchg > >>> +#elif PTTYPE == PTTYPE_EPT > >>> + #define pt_element_t u64 > >>> + #define guest_walker guest_walkerEPT > >>> + #define FNAME(name) EPT_##name > >>> + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK > >>> + #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) > >>> + #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) > >>> + #define PT_INDEX(addr, level) PT64_INDEX(addr, level) > >>> + #define PT_LEVEL_BITS PT64_LEVEL_BITS > >>> + #ifdef CONFIG_X86_64 > >>> + #define PT_MAX_FULL_LEVELS 4 > >>> + #define CMPXCHG cmpxchg > >>> + #else > >>> + #define CMPXCHG cmpxchg64 > >> > >> CMPXHG is only used in FNAME(cmpxchg_gpte), but you commented it later. > >> Do we really need it? 
> >> > >>> + #define PT_MAX_FULL_LEVELS 2 > >> > >> And the SDM says: > >> > >> "It uses a page-walk length of 4, meaning that at most 4 EPT paging-structure > >> entriesare accessed to translate a guest-physical address.", Is My SDM obsolete? > >> Which kind of process supports page-walk length = 2? > >> > >> It seems your patch is not able to handle the case that the guest uses walk-lenght = 2 > >> which is running on the host with walk-lenght = 4. > >> (plrease refer to how to handle sp->role.quadrant in FNAME(get_level1_sp_gpa) in > >> the current code.) > >> > > But since EPT always has 4 levels on all existing cpus it is not an issue and the only case > > that we should worry about is guest walk-lenght == host walk-lenght == 4, or have I > > Yes. I totally agree with you, but... > > > misunderstood what you mean here? > > What confused me is that this patch defines "#define PT_MAX_FULL_LEVELS 2", so i asked the > question: "Which kind of process supports page-walk length = 2". > Sorry, there is a typo in my origin comments. "process" should be "processor" or "CPU". > That is how I understood it, but then the discussion moved to dropping of nEPT support on 32-bit host. What's the connection? Even on 32bit host the walk is 4 levels. Doesn't shadow page code support 4 levels on 32bit host? -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 06/18/2013 06:57 PM, Gleb Natapov wrote: > On Mon, Jun 17, 2013 at 08:11:03PM +0800, Xiao Guangrong wrote: >> On 06/11/2013 07:32 PM, Gleb Natapov wrote: >>> On Tue, May 21, 2013 at 03:52:12PM +0800, Xiao Guangrong wrote: >>>> On 05/19/2013 12:52 PM, Jun Nakajima wrote: >>>>> From: Nadav Har'El <nyh@il.ibm.com> >>>>> >>>>> This is the first patch in a series which adds nested EPT support to KVM's >>>>> nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use >>>>> EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest >>>>> to set its own cr3 and take its own page faults without either of L0 or L1 >>>>> getting involved. This often significanlty improves L2's performance over the >>>>> previous two alternatives (shadow page tables over EPT, and shadow page >>>>> tables over shadow page tables). >>>>> >>>>> This patch adds EPT support to paging_tmpl.h. >>>>> >>>>> paging_tmpl.h contains the code for reading and writing page tables. The code >>>>> for 32-bit and 64-bit tables is very similar, but not identical, so >>>>> paging_tmpl.h is #include'd twice in mmu.c, once with PTTTYPE=32 and once >>>>> with PTTYPE=64, and this generates the two sets of similar functions. >>>>> >>>>> There are subtle but important differences between the format of EPT tables >>>>> and that of ordinary x86 64-bit page tables, so for nested EPT we need a >>>>> third set of functions to read the guest EPT table and to write the shadow >>>>> EPT table. >>>>> >>>>> So this patch adds third PTTYPE, PTTYPE_EPT, which creates functions (prefixed >>>>> with "EPT") which correctly read and write EPT tables. 
>>>>> >>>>> Signed-off-by: Nadav Har'El <nyh@il.ibm.com> >>>>> Signed-off-by: Jun Nakajima <jun.nakajima@intel.com> >>>>> Signed-off-by: Xinhao Xu <xinhao.xu@intel.com> >>>>> --- >>>>> arch/x86/kvm/mmu.c | 5 +++++ >>>>> arch/x86/kvm/paging_tmpl.h | 43 +++++++++++++++++++++++++++++++++++++++++-- >>>>> 2 files changed, 46 insertions(+), 2 deletions(-) >>>>> >>>>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c >>>>> index 117233f..6c1670f 100644 >>>>> --- a/arch/x86/kvm/mmu.c >>>>> +++ b/arch/x86/kvm/mmu.c >>>>> @@ -3397,6 +3397,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp >>>>> return mmu->last_pte_bitmap & (1 << index); >>>>> } >>>>> >>>>> +#define PTTYPE_EPT 18 /* arbitrary */ >>>>> +#define PTTYPE PTTYPE_EPT >>>>> +#include "paging_tmpl.h" >>>>> +#undef PTTYPE >>>>> + >>>>> #define PTTYPE 64 >>>>> #include "paging_tmpl.h" >>>>> #undef PTTYPE >>>>> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h >>>>> index df34d4a..4c45654 100644 >>>>> --- a/arch/x86/kvm/paging_tmpl.h >>>>> +++ b/arch/x86/kvm/paging_tmpl.h >>>>> @@ -50,6 +50,22 @@ >>>>> #define PT_LEVEL_BITS PT32_LEVEL_BITS >>>>> #define PT_MAX_FULL_LEVELS 2 >>>>> #define CMPXCHG cmpxchg >>>>> +#elif PTTYPE == PTTYPE_EPT >>>>> + #define pt_element_t u64 >>>>> + #define guest_walker guest_walkerEPT >>>>> + #define FNAME(name) EPT_##name >>>>> + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK >>>>> + #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) >>>>> + #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) >>>>> + #define PT_INDEX(addr, level) PT64_INDEX(addr, level) >>>>> + #define PT_LEVEL_BITS PT64_LEVEL_BITS >>>>> + #ifdef CONFIG_X86_64 >>>>> + #define PT_MAX_FULL_LEVELS 4 >>>>> + #define CMPXCHG cmpxchg >>>>> + #else >>>>> + #define CMPXCHG cmpxchg64 >>>> >>>> CMPXHG is only used in FNAME(cmpxchg_gpte), but you commented it later. >>>> Do we really need it? 
>>>> >>>>> + #define PT_MAX_FULL_LEVELS 2 >>>> >>>> And the SDM says: >>>> >>>> "It uses a page-walk length of 4, meaning that at most 4 EPT paging-structure >>>> entriesare accessed to translate a guest-physical address.", Is My SDM obsolete? >>>> Which kind of process supports page-walk length = 2? >>>> >>>> It seems your patch is not able to handle the case that the guest uses walk-lenght = 2 >>>> which is running on the host with walk-lenght = 4. >>>> (plrease refer to how to handle sp->role.quadrant in FNAME(get_level1_sp_gpa) in >>>> the current code.) >>>> >>> But since EPT always has 4 levels on all existing cpus it is not an issue and the only case >>> that we should worry about is guest walk-lenght == host walk-lenght == 4, or have I >> >> Yes. I totally agree with you, but... >> >>> misunderstood what you mean here? >> >> What confused me is that this patch defines "#define PT_MAX_FULL_LEVELS 2", so i asked the >> question: "Which kind of process supports page-walk length = 2". >> Sorry, there is a typo in my origin comments. "process" should be "processor" or "CPU". >> > That is how I understood it, but then the discussion moved to dropping > of nEPT support on 32-bit host. What's the connection? Even on 32bit If EPT supports "walk-level = 2" on 32bit host (maybe it is not true), i thought dropping 32bit support to reduce the complex is worthwhile, otherwise: a) we need to handle different page size between L0 and L2 and b) we need to carefully review the code due to lacking PDPT supporting on nept on L2. I remember that the origin version of NEPT did not support PAE-32bit L2 guest. I have found it out: http://comments.gmane.org/gmane.comp.emulators.kvm.devel/95395 It seems no changes in this version, I have no idea how it was fixed in this version. > host the walk is 4 levels. Doesn't shadow page code support 4 levels on > 32bit host? Yes, it does. 4 levels is fine on 32bit host. 
If EPT only supports 4 levels on both 32bit and 64bit hosts, there is no big difference to support nept on 32bit L2 and 64bit L2. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jun 18, 2013 at 08:51:25PM +0800, Xiao Guangrong wrote: > On 06/18/2013 06:57 PM, Gleb Natapov wrote: > > On Mon, Jun 17, 2013 at 08:11:03PM +0800, Xiao Guangrong wrote: > >> On 06/11/2013 07:32 PM, Gleb Natapov wrote: > >>> On Tue, May 21, 2013 at 03:52:12PM +0800, Xiao Guangrong wrote: > >>>> On 05/19/2013 12:52 PM, Jun Nakajima wrote: > >>>>> From: Nadav Har'El <nyh@il.ibm.com> > >>>>> > >>>>> This is the first patch in a series which adds nested EPT support to KVM's > >>>>> nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use > >>>>> EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest > >>>>> to set its own cr3 and take its own page faults without either of L0 or L1 > >>>>> getting involved. This often significanlty improves L2's performance over the > >>>>> previous two alternatives (shadow page tables over EPT, and shadow page > >>>>> tables over shadow page tables). > >>>>> > >>>>> This patch adds EPT support to paging_tmpl.h. > >>>>> > >>>>> paging_tmpl.h contains the code for reading and writing page tables. The code > >>>>> for 32-bit and 64-bit tables is very similar, but not identical, so > >>>>> paging_tmpl.h is #include'd twice in mmu.c, once with PTTTYPE=32 and once > >>>>> with PTTYPE=64, and this generates the two sets of similar functions. > >>>>> > >>>>> There are subtle but important differences between the format of EPT tables > >>>>> and that of ordinary x86 64-bit page tables, so for nested EPT we need a > >>>>> third set of functions to read the guest EPT table and to write the shadow > >>>>> EPT table. > >>>>> > >>>>> So this patch adds third PTTYPE, PTTYPE_EPT, which creates functions (prefixed > >>>>> with "EPT") which correctly read and write EPT tables. 
> >>>>> > >>>>> Signed-off-by: Nadav Har'El <nyh@il.ibm.com> > >>>>> Signed-off-by: Jun Nakajima <jun.nakajima@intel.com> > >>>>> Signed-off-by: Xinhao Xu <xinhao.xu@intel.com> > >>>>> --- > >>>>> arch/x86/kvm/mmu.c | 5 +++++ > >>>>> arch/x86/kvm/paging_tmpl.h | 43 +++++++++++++++++++++++++++++++++++++++++-- > >>>>> 2 files changed, 46 insertions(+), 2 deletions(-) > >>>>> > >>>>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c > >>>>> index 117233f..6c1670f 100644 > >>>>> --- a/arch/x86/kvm/mmu.c > >>>>> +++ b/arch/x86/kvm/mmu.c > >>>>> @@ -3397,6 +3397,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp > >>>>> return mmu->last_pte_bitmap & (1 << index); > >>>>> } > >>>>> > >>>>> +#define PTTYPE_EPT 18 /* arbitrary */ > >>>>> +#define PTTYPE PTTYPE_EPT > >>>>> +#include "paging_tmpl.h" > >>>>> +#undef PTTYPE > >>>>> + > >>>>> #define PTTYPE 64 > >>>>> #include "paging_tmpl.h" > >>>>> #undef PTTYPE > >>>>> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h > >>>>> index df34d4a..4c45654 100644 > >>>>> --- a/arch/x86/kvm/paging_tmpl.h > >>>>> +++ b/arch/x86/kvm/paging_tmpl.h > >>>>> @@ -50,6 +50,22 @@ > >>>>> #define PT_LEVEL_BITS PT32_LEVEL_BITS > >>>>> #define PT_MAX_FULL_LEVELS 2 > >>>>> #define CMPXCHG cmpxchg > >>>>> +#elif PTTYPE == PTTYPE_EPT > >>>>> + #define pt_element_t u64 > >>>>> + #define guest_walker guest_walkerEPT > >>>>> + #define FNAME(name) EPT_##name > >>>>> + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK > >>>>> + #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) > >>>>> + #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) > >>>>> + #define PT_INDEX(addr, level) PT64_INDEX(addr, level) > >>>>> + #define PT_LEVEL_BITS PT64_LEVEL_BITS > >>>>> + #ifdef CONFIG_X86_64 > >>>>> + #define PT_MAX_FULL_LEVELS 4 > >>>>> + #define CMPXCHG cmpxchg > >>>>> + #else > >>>>> + #define CMPXCHG cmpxchg64 > >>>> > >>>> CMPXHG is only used in FNAME(cmpxchg_gpte), but you commented it later. 
> >>>> Do we really need it? > >>>> > >>>>> + #define PT_MAX_FULL_LEVELS 2 > >>>> > >>>> And the SDM says: > >>>> > >>>> "It uses a page-walk length of 4, meaning that at most 4 EPT paging-structure > >>>> entriesare accessed to translate a guest-physical address.", Is My SDM obsolete? > >>>> Which kind of process supports page-walk length = 2? > >>>> > >>>> It seems your patch is not able to handle the case that the guest uses walk-lenght = 2 > >>>> which is running on the host with walk-lenght = 4. > >>>> (plrease refer to how to handle sp->role.quadrant in FNAME(get_level1_sp_gpa) in > >>>> the current code.) > >>>> > >>> But since EPT always has 4 levels on all existing cpus it is not an issue and the only case > >>> that we should worry about is guest walk-lenght == host walk-lenght == 4, or have I > >> > >> Yes. I totally agree with you, but... > >> > >>> misunderstood what you mean here? > >> > >> What confused me is that this patch defines "#define PT_MAX_FULL_LEVELS 2", so i asked the > >> question: "Which kind of process supports page-walk length = 2". > >> Sorry, there is a typo in my origin comments. "process" should be "processor" or "CPU". > >> > > That is how I understood it, but then the discussion moved to dropping > > of nEPT support on 32-bit host. What's the connection? Even on 32bit > > If EPT supports "walk-level = 2" on 32bit host (maybe it is not true), i thought dropping > 32bit support to reduce the complex is worthwhile, otherwise: > a) we need to handle different page size between L0 and L2 and > b) we need to carefully review the code due to lacking PDPT supporting on nept on L2. > > I remember that the origin version of NEPT did not support PAE-32bit L2 guest. I have > found it out: > http://comments.gmane.org/gmane.comp.emulators.kvm.devel/95395 > > It seems no changes in this version, I have no idea how it was fixed in this version. > I think that patches that reloads nested PDPT pointers did that. > > host the walk is 4 levels. 
Doesn't shadow page code support 4 levels on > > 32bit host? > > Yes, it does. 4 levels is fine on 32bit host. > > If EPT only supports 4 levels on both 32bit and 64bit hosts, there is no big difference > to support nept on 32bit L2 and 64bit L2. > OK, that is my understating too. Thanks for confirmation. -- Gleb. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 117233f..6c1670f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3397,6 +3397,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp return mmu->last_pte_bitmap & (1 << index); } +#define PTTYPE_EPT 18 /* arbitrary */ +#define PTTYPE PTTYPE_EPT +#include "paging_tmpl.h" +#undef PTTYPE + #define PTTYPE 64 #include "paging_tmpl.h" #undef PTTYPE diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index df34d4a..4c45654 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -50,6 +50,22 @@ #define PT_LEVEL_BITS PT32_LEVEL_BITS #define PT_MAX_FULL_LEVELS 2 #define CMPXCHG cmpxchg +#elif PTTYPE == PTTYPE_EPT + #define pt_element_t u64 + #define guest_walker guest_walkerEPT + #define FNAME(name) EPT_##name + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK + #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) + #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) + #define PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define PT_LEVEL_BITS PT64_LEVEL_BITS + #ifdef CONFIG_X86_64 + #define PT_MAX_FULL_LEVELS 4 + #define CMPXCHG cmpxchg + #else + #define CMPXCHG cmpxchg64 + #define PT_MAX_FULL_LEVELS 2 + #endif #else #error Invalid PTTYPE value #endif @@ -80,6 +96,10 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; } +#if PTTYPE != PTTYPE_EPT +/* + * Comment out this for EPT because update_accessed_dirty_bits() is not used. 
+ */ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pt_element_t __user *ptep_user, unsigned index, pt_element_t orig_pte, pt_element_t new_pte) @@ -102,6 +122,7 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return (ret != orig_pte); } +#endif static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, @@ -126,13 +147,21 @@ no_present: static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) { unsigned access; - +#if PTTYPE == PTTYPE_EPT + access = (gpte & (VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK | + VMX_EPT_EXECUTABLE_MASK)); +#else access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; access &= ~(gpte >> PT64_NX_SHIFT); +#endif return access; } +#if PTTYPE != PTTYPE_EPT +/* + * EPT A/D bit support is not implemented. + */ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, struct guest_walker *walker, @@ -169,6 +198,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, } return 0; } +#endif /* * Fetch a guest pte for a guest virtual address @@ -177,7 +207,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gva_t addr, u32 access) { - int ret; pt_element_t pte; pt_element_t __user *uninitialized_var(ptep_user); gfn_t table_gfn; @@ -192,7 +221,9 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, gfn_t gfn; trace_kvm_mmu_pagetable_walk(addr, access); +#if PTTYPE != PTTYPE_EPT retry_walk: +#endif walker->level = mmu->root_level; pte = mmu->get_cr3(vcpu); @@ -277,6 +308,7 @@ retry_walk: walker->gfn = real_gpa >> PAGE_SHIFT; +#if PTTYPE != PTTYPE_EPT if (!write_fault) protect_clean_gpte(&pte_access, pte); else @@ -287,12 +319,15 @@ retry_walk: accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT); if (unlikely(!accessed_dirty)) { + int ret; + ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, 
walker, write_fault); if (unlikely(ret < 0)) goto error; else if (ret) goto retry_walk; } +#endif walker->pt_access = pt_access; walker->pte_access = pte_access; @@ -323,6 +358,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker, access); } +#if PTTYPE != PTTYPE_EPT static int FNAME(walk_addr_nested)(struct guest_walker *walker, struct kvm_vcpu *vcpu, gva_t addr, u32 access) @@ -330,6 +366,7 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker, return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, addr, access); } +#endif static bool FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, @@ -754,6 +791,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, return gpa; } +#if PTTYPE != PTTYPE_EPT static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, struct x86_exception *exception) @@ -772,6 +810,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, return gpa; } +#endif /* * Using the cached information from sp->gfns is safe because: