
[v5,07/14] nEPT: Add EPT tables support to paging_tmpl.h

Message ID 1375282131-9713-8-git-send-email-gleb@redhat.com (mailing list archive)
State New, archived

Commit Message

Gleb Natapov July 31, 2013, 2:48 p.m. UTC
From: Nadav Har'El <nyh@il.ibm.com>

This is the first patch in a series which adds nested EPT support to KVM's
nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use
EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
to set its own cr3 and take its own page faults without either of L0 or L1
getting involved. This often significantly improves L2's performance over the
previous two alternatives (shadow page tables over EPT, and shadow page
tables over shadow page tables).

This patch adds EPT support to paging_tmpl.h.

paging_tmpl.h contains the code for reading and writing page tables. The code
for 32-bit and 64-bit tables is very similar, but not identical, so
paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once
with PTTYPE=64, and this generates the two sets of similar functions.

There are subtle but important differences between the format of EPT tables
and that of ordinary x86 64-bit page tables, so for nested EPT we need a
third set of functions to read the guest EPT table and to write the shadow
EPT table.

So this patch adds a third PTTYPE, PTTYPE_EPT, which creates functions (prefixed
with "EPT") which correctly read and write EPT tables.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
Signed-off-by: Xinhao Xu <xinhao.xu@intel.com>
Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/mmu.c         |    5 +++++
 arch/x86/kvm/paging_tmpl.h |   37 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 41 insertions(+), 1 deletion(-)
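
The multiple-inclusion trick described above can be modelled outside the kernel with the following stand-alone sketch. It is not code from this patch -- the real template header is arch/x86/kvm/paging_tmpl.h, and its FNAME() prefixes are spelled slightly differently -- but it shows how one template body, included repeatedly with a different PTTYPE, stamps out a separate family of functions, here a per-format is_present() check. File and function names are illustrative only.

/* paging_demo.c -- a self-contained model of the paging_tmpl.h trick.
 * Build with: cc -o paging_demo paging_demo.c
 */
#ifndef TEMPLATE_PASS
#include <stdio.h>
#include <stdint.h>

#define PTTYPE_EPT 18 /* arbitrary, as in the patch */

/* First pass: generate the EPT flavour of the template body below. */
#define TEMPLATE_PASS
#define PTTYPE PTTYPE_EPT
#include "paging_demo.c"
#undef PTTYPE

/* Second pass: generate the ordinary 64-bit flavour. */
#define PTTYPE 64
#include "paging_demo.c"
#undef PTTYPE

int main(void)
{
	/* EPT treats a pte as present if any of R/W/X is set;
	 * x86-64 paging keys off the Present bit alone. */
	printf("ept: %d, paging64: %d\n",
	       ept_is_present(0x4), paging64_is_present(0x1));
	return 0;
}

#else /* TEMPLATE_PASS: this part plays the role of paging_tmpl.h */

#if PTTYPE == PTTYPE_EPT
	#define FNAME(name) ept_##name
#elif PTTYPE == 64
	#define FNAME(name) paging64_##name
#else
	#error Invalid PTTYPE value
#endif

static int FNAME(is_present)(uint64_t pte)
{
#if PTTYPE == PTTYPE_EPT
	return (pte & 7) != 0;	/* any of the EPT R/W/X bits */
#else
	return (pte & 1) != 0;	/* the x86 Present bit */
#endif
}

#undef FNAME
#endif

The same mechanism is what lets the patch add an ept_ family of walkers simply by including paging_tmpl.h a third time with PTTYPE set to PTTYPE_EPT.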

Comments

Xiao Guangrong Aug. 1, 2013, 7 a.m. UTC | #1
On 07/31/2013 10:48 PM, Gleb Natapov wrote:
> From: Nadav Har'El <nyh@il.ibm.com>
> 
> This is the first patch in a series which adds nested EPT support to KVM's
> nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use
> EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
> to set its own cr3 and take its own page faults without either of L0 or L1
> getting involved. This often significanlty improves L2's performance over the
> previous two alternatives (shadow page tables over EPT, and shadow page
> tables over shadow page tables).
> 
> This patch adds EPT support to paging_tmpl.h.
> 
> paging_tmpl.h contains the code for reading and writing page tables. The code
> for 32-bit and 64-bit tables is very similar, but not identical, so
> paging_tmpl.h is #include'd twice in mmu.c, once with PTTTYPE=32 and once
> with PTTYPE=64, and this generates the two sets of similar functions.
> 
> There are subtle but important differences between the format of EPT tables
> and that of ordinary x86 64-bit page tables, so for nested EPT we need a
> third set of functions to read the guest EPT table and to write the shadow
> EPT table.
> 
> So this patch adds third PTTYPE, PTTYPE_EPT, which creates functions (prefixed
> with "EPT") which correctly read and write EPT tables.
> 
> Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
> Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
> Signed-off-by: Xinhao Xu <xinhao.xu@intel.com>
> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
> Signed-off-by: Gleb Natapov <gleb@redhat.com>
> ---
>  arch/x86/kvm/mmu.c         |    5 +++++
>  arch/x86/kvm/paging_tmpl.h |   37 ++++++++++++++++++++++++++++++++++++-
>  2 files changed, 41 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 4c4274d..b5273c3 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3494,6 +3494,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp
>  	return mmu->last_pte_bitmap & (1 << index);
>  }
> 
> +#define PTTYPE_EPT 18 /* arbitrary */
> +#define PTTYPE PTTYPE_EPT
> +#include "paging_tmpl.h"
> +#undef PTTYPE
> +
>  #define PTTYPE 64
>  #include "paging_tmpl.h"
>  #undef PTTYPE
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index 2c2f635..762c904 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -23,6 +23,13 @@
>   * so the code in this file is compiled twice, once per pte size.
>   */
> 
> +/*
> + * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro
> + * uses for EPT without A/D paging type.
> + */
> +extern u64 __pure __using_nonexistent_pte_bit(void)
> +	       __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT");
> +
>  #if PTTYPE == 64
>  	#define pt_element_t u64
>  	#define guest_walker guest_walker64
> @@ -58,6 +65,21 @@
>  	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
>  	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
>  	#define CMPXCHG cmpxchg
> +#elif PTTYPE == PTTYPE_EPT
> +	#define pt_element_t u64
> +	#define guest_walker guest_walkerEPT
> +	#define FNAME(name) ept_##name
> +	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
> +	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
> +	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
> +	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
> +	#define PT_LEVEL_BITS PT64_LEVEL_BITS
> +	#define PT_GUEST_ACCESSED_MASK 0
> +	#define PT_GUEST_DIRTY_MASK 0
> +	#define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit()
> +	#define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit()
> +	#define CMPXCHG cmpxchg64
> +	#define PT_MAX_FULL_LEVELS 4
>  #else
>  	#error Invalid PTTYPE value
>  #endif
> @@ -115,7 +137,11 @@ static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
> 
>  static inline int FNAME(is_present_gpte)(unsigned long pte)
>  {
> +#if PTTYPE != PTTYPE_EPT
>  	return is_present_gpte(pte);
> +#else
> +	return pte & 7;
> +#endif
>  }
> 
>  static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
> @@ -165,9 +191,14 @@ no_present:
>  static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
>  {
>  	unsigned access;
> -
> +#if PTTYPE == PTTYPE_EPT
> +	access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
> +		((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
> +		ACC_USER_MASK;
> +#else
>  	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
>  	access &= ~(gpte >> PT64_NX_SHIFT);
> +#endif

Hmm, why not use shadow_x_mask, shadow_user_mask instead? PT_WRITABLE_MASK
is also suitable for EPT; I guess we can remove the "#if/#else/#endif" after
that.

> 
>  	return access;
>  }
> @@ -369,6 +400,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
>  					access);
>  }
> 
> +#if PTTYPE != PTTYPE_EPT
>  static int FNAME(walk_addr_nested)(struct guest_walker *walker,
>  				   struct kvm_vcpu *vcpu, gva_t addr,
>  				   u32 access)
> @@ -376,6 +408,7 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
>  	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
>  					addr, access);
>  }
> +#endif
> 
>  static bool
>  FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> @@ -803,6 +836,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
>  	return gpa;
>  }
> 
> +#if PTTYPE != PTTYPE_EPT
>  static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
>  				      u32 access,
>  				      struct x86_exception *exception)
> @@ -821,6 +855,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
> 
>  	return gpa;
>  }
> +#endif

Can we move the "walk" together, then one "#if" can be dropped?

Gleb Natapov Aug. 1, 2013, 7:10 a.m. UTC | #2
On Thu, Aug 01, 2013 at 03:00:12PM +0800, Xiao Guangrong wrote:
> On 07/31/2013 10:48 PM, Gleb Natapov wrote:
> > From: Nadav Har'El <nyh@il.ibm.com>
> > 
> > This is the first patch in a series which adds nested EPT support to KVM's
> > nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use
> > EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
> > to set its own cr3 and take its own page faults without either of L0 or L1
> > getting involved. This often significanlty improves L2's performance over the
> > previous two alternatives (shadow page tables over EPT, and shadow page
> > tables over shadow page tables).
> > 
> > This patch adds EPT support to paging_tmpl.h.
> > 
> > paging_tmpl.h contains the code for reading and writing page tables. The code
> > for 32-bit and 64-bit tables is very similar, but not identical, so
> > paging_tmpl.h is #include'd twice in mmu.c, once with PTTTYPE=32 and once
> > with PTTYPE=64, and this generates the two sets of similar functions.
> > 
> > There are subtle but important differences between the format of EPT tables
> > and that of ordinary x86 64-bit page tables, so for nested EPT we need a
> > third set of functions to read the guest EPT table and to write the shadow
> > EPT table.
> > 
> > So this patch adds third PTTYPE, PTTYPE_EPT, which creates functions (prefixed
> > with "EPT") which correctly read and write EPT tables.
> > 
> > Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
> > Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
> > Signed-off-by: Xinhao Xu <xinhao.xu@intel.com>
> > Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
> > Signed-off-by: Gleb Natapov <gleb@redhat.com>
> > ---
> >  arch/x86/kvm/mmu.c         |    5 +++++
> >  arch/x86/kvm/paging_tmpl.h |   37 ++++++++++++++++++++++++++++++++++++-
> >  2 files changed, 41 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > index 4c4274d..b5273c3 100644
> > --- a/arch/x86/kvm/mmu.c
> > +++ b/arch/x86/kvm/mmu.c
> > @@ -3494,6 +3494,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp
> >  	return mmu->last_pte_bitmap & (1 << index);
> >  }
> > 
> > +#define PTTYPE_EPT 18 /* arbitrary */
> > +#define PTTYPE PTTYPE_EPT
> > +#include "paging_tmpl.h"
> > +#undef PTTYPE
> > +
> >  #define PTTYPE 64
> >  #include "paging_tmpl.h"
> >  #undef PTTYPE
> > diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> > index 2c2f635..762c904 100644
> > --- a/arch/x86/kvm/paging_tmpl.h
> > +++ b/arch/x86/kvm/paging_tmpl.h
> > @@ -23,6 +23,13 @@
> >   * so the code in this file is compiled twice, once per pte size.
> >   */
> > 
> > +/*
> > + * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro
> > + * uses for EPT without A/D paging type.
> > + */
> > +extern u64 __pure __using_nonexistent_pte_bit(void)
> > +	       __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT");
> > +
> >  #if PTTYPE == 64
> >  	#define pt_element_t u64
> >  	#define guest_walker guest_walker64
> > @@ -58,6 +65,21 @@
> >  	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
> >  	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
> >  	#define CMPXCHG cmpxchg
> > +#elif PTTYPE == PTTYPE_EPT
> > +	#define pt_element_t u64
> > +	#define guest_walker guest_walkerEPT
> > +	#define FNAME(name) ept_##name
> > +	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
> > +	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
> > +	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
> > +	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
> > +	#define PT_LEVEL_BITS PT64_LEVEL_BITS
> > +	#define PT_GUEST_ACCESSED_MASK 0
> > +	#define PT_GUEST_DIRTY_MASK 0
> > +	#define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit()
> > +	#define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit()
> > +	#define CMPXCHG cmpxchg64
> > +	#define PT_MAX_FULL_LEVELS 4
> >  #else
> >  	#error Invalid PTTYPE value
> >  #endif
> > @@ -115,7 +137,11 @@ static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
> > 
> >  static inline int FNAME(is_present_gpte)(unsigned long pte)
> >  {
> > +#if PTTYPE != PTTYPE_EPT
> >  	return is_present_gpte(pte);
> > +#else
> > +	return pte & 7;
> > +#endif
> >  }
> > 
> >  static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
> > @@ -165,9 +191,14 @@ no_present:
> >  static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
> >  {
> >  	unsigned access;
> > -
> > +#if PTTYPE == PTTYPE_EPT
> > +	access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
> > +		((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
> > +		ACC_USER_MASK;
> > +#else
> >  	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
> >  	access &= ~(gpte >> PT64_NX_SHIFT);
> > +#endif
> 
> Hmm, why not use shadow_x_mask, shadow_user_mask instead? PT_WRITABLE_MASK
> is also suitable for ept, i guess we can remove the "#if/#else/#endif" after
> that.
> 
shadow_x_mask and shadow_user_mask do not depend on the guest paging mode,
so they cannot be used here. Since we have to use ifdefs anyway, relying on
VMX_EPT_WRITABLE_MASK == PT_WRITABLE_MASK is not necessary. It makes the code
easier to read.

> > 
> >  	return access;
> >  }
> > @@ -369,6 +400,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
> >  					access);
> >  }
> > 
> > +#if PTTYPE != PTTYPE_EPT
> >  static int FNAME(walk_addr_nested)(struct guest_walker *walker,
> >  				   struct kvm_vcpu *vcpu, gva_t addr,
> >  				   u32 access)
> > @@ -376,6 +408,7 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
> >  	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
> >  					addr, access);
> >  }
> > +#endif
> > 
> >  static bool
> >  FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> > @@ -803,6 +836,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
> >  	return gpa;
> >  }
> > 
> > +#if PTTYPE != PTTYPE_EPT
> >  static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
> >  				      u32 access,
> >  				      struct x86_exception *exception)
> > @@ -821,6 +855,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
> > 
> >  	return gpa;
> >  }
> > +#endif
> 
> Can we move the "walk" together, then one "#if" can be dropped?
Currently walk_addr_nested() is near walk_addr() and gva_to_gpa_nested()
is near gva_to_gpa(). I like this grouping.

--
			Gleb.
Xiao Guangrong Aug. 1, 2013, 7:18 a.m. UTC | #3
On 08/01/2013 03:10 PM, Gleb Natapov wrote:
> On Thu, Aug 01, 2013 at 03:00:12PM +0800, Xiao Guangrong wrote:
>> Hmm, why not use shadow_x_mask, shadow_user_mask instead? PT_WRITABLE_MASK
>> is also suitable for ept, i guess we can remove the "#if/#else/#endif" after
>> that.
>>
> shadow_x_mask and shadow_user_mask do not depend on guest paging mode,
> so cannot be used here. Since we have to use ifdefs anyway relying on
> VMX_EPT_WRITABLE_MASK == PT_WRITABLE_MASK is not necessary. Makes code
> easier to read.

Oh, yes, you are right.

Reviewed-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>


Xiao Guangrong Aug. 1, 2013, 7:31 a.m. UTC | #4
On 08/01/2013 03:18 PM, Xiao Guangrong wrote:
 +#endif
>>>
>>> Hmm, why not use shadow_x_mask, shadow_user_mask instead? PT_WRITABLE_MASK
>>> is also suitable for ept, i guess we can remove the "#if/#else/#endif" after
>>> that.
>>>
>> shadow_x_mask and shadow_user_mask do not depend on guest paging mode,
>> so cannot be used here. Since we have to use ifdefs anyway relying on
>> VMX_EPT_WRITABLE_MASK == PT_WRITABLE_MASK is not necessary. Makes code
>> easier to read.
> 
> Oh, yes, you are right.
> 
> Reviewed-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>

BTW, I notice the code in mmu.c uses PT64_NX_MASK to check the permissions,
i.e.:

static bool need_remote_flush(u64 old, u64 new)
{
	if (!is_shadow_present_pte(old))
		return false;
	if (!is_shadow_present_pte(new))
		return true;
	if ((old ^ new) & PT64_BASE_ADDR_MASK)
		return true;
	old ^= PT64_NX_MASK;
	new ^= PT64_NX_MASK;
	return (old & ~new & PT64_PERM_MASK) != 0;
}

It checks the shadow page table, and this mask is the wrong one for a nested EPT spte.

Gleb Natapov Aug. 1, 2013, 7:42 a.m. UTC | #5
On Thu, Aug 01, 2013 at 03:31:01PM +0800, Xiao Guangrong wrote:
> On 08/01/2013 03:18 PM, Xiao Guangrong wrote:
>  +#endif
> >>>
> >>> Hmm, why not use shadow_x_mask, shadow_user_mask instead? PT_WRITABLE_MASK
> >>> is also suitable for ept, i guess we can remove the "#if/#else/#endif" after
> >>> that.
> >>>
> >> shadow_x_mask and shadow_user_mask do not depend on guest paging mode,
> >> so cannot be used here. Since we have to use ifdefs anyway relying on
> >> VMX_EPT_WRITABLE_MASK == PT_WRITABLE_MASK is not necessary. Makes code
> >> easier to read.
> > 
> > Oh, yes, you are right.
> > 
> > Reviewed-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> 
> BTW, i notice the code in mmu.c uses PT64_NX_MASK to check the permission,
> i.e:
> 
> static bool need_remote_flush(u64 old, u64 new)
> {
> 	if (!is_shadow_present_pte(old))
> 		return false;
> 	if (!is_shadow_present_pte(new))
> 		return true;
> 	if ((old ^ new) & PT64_BASE_ADDR_MASK)
> 		return true;
> 	old ^= PT64_NX_MASK;
> 	new ^= PT64_NX_MASK;
> 	return (old & ~new & PT64_PERM_MASK) != 0;
> }
> 
> It checks shadow page table and the mask is wrong one nest ept spte.
So shadow_x_mask needs to be used here, correct?

--
			Gleb.
Xiao Guangrong Aug. 1, 2013, 7:51 a.m. UTC | #6
On 08/01/2013 03:42 PM, Gleb Natapov wrote:
> On Thu, Aug 01, 2013 at 03:31:01PM +0800, Xiao Guangrong wrote:
>> On 08/01/2013 03:18 PM, Xiao Guangrong wrote:
>>  +#endif
>>>>>
>>>>> Hmm, why not use shadow_x_mask, shadow_user_mask instead? PT_WRITABLE_MASK
>>>>> is also suitable for ept, i guess we can remove the "#if/#else/#endif" after
>>>>> that.
>>>>>
>>>> shadow_x_mask and shadow_user_mask do not depend on guest paging mode,
>>>> so cannot be used here. Since we have to use ifdefs anyway relying on
>>>> VMX_EPT_WRITABLE_MASK == PT_WRITABLE_MASK is not necessary. Makes code
>>>> easier to read.
>>>
>>> Oh, yes, you are right.
>>>
>>> Reviewed-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
>>
>> BTW, i notice the code in mmu.c uses PT64_NX_MASK to check the permission,
>> i.e:
>>
>> static bool need_remote_flush(u64 old, u64 new)
>> {
>> 	if (!is_shadow_present_pte(old))
>> 		return false;
>> 	if (!is_shadow_present_pte(new))
>> 		return true;
>> 	if ((old ^ new) & PT64_BASE_ADDR_MASK)
>> 		return true;
>> 	old ^= PT64_NX_MASK;
>> 	new ^= PT64_NX_MASK;
>> 	return (old & ~new & PT64_PERM_MASK) != 0;
>> }
>>
>> It checks shadow page table and the mask is wrong one nest ept spte.
> So shadow_x_mask need to be used here, correct?

Yes. The code checks the shadow page table, which does not depend on guest mode. :)


Gleb Natapov Aug. 1, 2013, 7:56 a.m. UTC | #7
On Thu, Aug 01, 2013 at 03:51:35PM +0800, Xiao Guangrong wrote:
> On 08/01/2013 03:42 PM, Gleb Natapov wrote:
> > On Thu, Aug 01, 2013 at 03:31:01PM +0800, Xiao Guangrong wrote:
> >> On 08/01/2013 03:18 PM, Xiao Guangrong wrote:
> >>  +#endif
> >>>>>
> >>>>> Hmm, why not use shadow_x_mask, shadow_user_mask instead? PT_WRITABLE_MASK
> >>>>> is also suitable for ept, i guess we can remove the "#if/#else/#endif" after
> >>>>> that.
> >>>>>
> >>>> shadow_x_mask and shadow_user_mask do not depend on guest paging mode,
> >>>> so cannot be used here. Since we have to use ifdefs anyway relying on
> >>>> VMX_EPT_WRITABLE_MASK == PT_WRITABLE_MASK is not necessary. Makes code
> >>>> easier to read.
> >>>
> >>> Oh, yes, you are right.
> >>>
> >>> Reviewed-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> >>
> >> BTW, i notice the code in mmu.c uses PT64_NX_MASK to check the permission,
> >> i.e:
> >>
> >> static bool need_remote_flush(u64 old, u64 new)
> >> {
> >> 	if (!is_shadow_present_pte(old))
> >> 		return false;
> >> 	if (!is_shadow_present_pte(new))
> >> 		return true;
> >> 	if ((old ^ new) & PT64_BASE_ADDR_MASK)
> >> 		return true;
> >> 	old ^= PT64_NX_MASK;
> >> 	new ^= PT64_NX_MASK;
> >> 	return (old & ~new & PT64_PERM_MASK) != 0;
> >> }
> >>
> >> It checks shadow page table and the mask is wrong one nest ept spte.
> > So shadow_x_mask need to be used here, correct?
> 
> Yes. The code checks shadow page table which does not depend on guest mode. :)
> 
Yes, good catch. It would have been hard to find when hit in practice.
Will add a separate patch to fix that after you review the rest of the
series :)


--
			Gleb.
Paolo Bonzini Aug. 1, 2013, 11:05 a.m. UTC | #8
On Aug 01 2013, Xiao Guangrong wrote:
> On 08/01/2013 03:42 PM, Gleb Natapov wrote:
> > On Thu, Aug 01, 2013 at 03:31:01PM +0800, Xiao Guangrong wrote:
> >> On 08/01/2013 03:18 PM, Xiao Guangrong wrote:
> >>  +#endif
> >>>>>
> >>>>> Hmm, why not use shadow_x_mask, shadow_user_mask instead? PT_WRITABLE_MASK
> >>>>> is also suitable for ept, i guess we can remove the "#if/#else/#endif" after
> >>>>> that.
> >>>>>
> >>>> shadow_x_mask and shadow_user_mask do not depend on guest paging mode,
> >>>> so cannot be used here. Since we have to use ifdefs anyway relying on
> >>>> VMX_EPT_WRITABLE_MASK == PT_WRITABLE_MASK is not necessary. Makes code
> >>>> easier to read.
> >>>
> >>> Oh, yes, you are right.
> >>>
> >>> Reviewed-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> >>
> >> BTW, i notice the code in mmu.c uses PT64_NX_MASK to check the permission,
> >> i.e:
> >>
> >> static bool need_remote_flush(u64 old, u64 new)
> >> {
> >> 	if (!is_shadow_present_pte(old))
> >> 		return false;
> >> 	if (!is_shadow_present_pte(new))
> >> 		return true;
> >> 	if ((old ^ new) & PT64_BASE_ADDR_MASK)
> >> 		return true;
> >> 	old ^= PT64_NX_MASK;
> >> 	new ^= PT64_NX_MASK;
> >> 	return (old & ~new & PT64_PERM_MASK) != 0;
> >> }
> >>
> >> It checks shadow page table and the mask is wrong one nest ept spte.
> > So shadow_x_mask need to be used here, correct?
> 
> Yes. The code checks shadow page table which does not depend on guest mode. :)

The XOR should be with shadow_nx_mask, no?  And PT64_PERM_MASK
should include both shadow_x_mask and shadow_nx_mask, I think.

Paolo
Gleb Natapov Aug. 1, 2013, 11:07 a.m. UTC | #9
On Thu, Aug 01, 2013 at 01:05:47PM +0200, Paolo Bonzini wrote:
>  On Aug 01 2013, Xiao Guangrong wrote:
> > On 08/01/2013 03:42 PM, Gleb Natapov wrote:
> > > On Thu, Aug 01, 2013 at 03:31:01PM +0800, Xiao Guangrong wrote:
> > >> On 08/01/2013 03:18 PM, Xiao Guangrong wrote:
> > >>  +#endif
> > >>>>>
> > >>>>> Hmm, why not use shadow_x_mask, shadow_user_mask instead? PT_WRITABLE_MASK
> > >>>>> is also suitable for ept, i guess we can remove the "#if/#else/#endif" after
> > >>>>> that.
> > >>>>>
> > >>>> shadow_x_mask and shadow_user_mask do not depend on guest paging mode,
> > >>>> so cannot be used here. Since we have to use ifdefs anyway relying on
> > >>>> VMX_EPT_WRITABLE_MASK == PT_WRITABLE_MASK is not necessary. Makes code
> > >>>> easier to read.
> > >>>
> > >>> Oh, yes, you are right.
> > >>>
> > >>> Reviewed-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> > >>
> > >> BTW, i notice the code in mmu.c uses PT64_NX_MASK to check the permission,
> > >> i.e:
> > >>
> > >> static bool need_remote_flush(u64 old, u64 new)
> > >> {
> > >> 	if (!is_shadow_present_pte(old))
> > >> 		return false;
> > >> 	if (!is_shadow_present_pte(new))
> > >> 		return true;
> > >> 	if ((old ^ new) & PT64_BASE_ADDR_MASK)
> > >> 		return true;
> > >> 	old ^= PT64_NX_MASK;
> > >> 	new ^= PT64_NX_MASK;
> > >> 	return (old & ~new & PT64_PERM_MASK) != 0;
> > >> }
> > >>
> > >> It checks shadow page table and the mask is wrong one nest ept spte.
> > > So shadow_x_mask need to be used here, correct?
> > 
> > Yes. The code checks shadow page table which does not depend on guest mode. :)
> 
> The XOR should be with shadow_nx_mask, no?  And PT64_PERM_MASK
> should include both shadow_x_mask and shadow_nx_mask, I think.
> 
Yes :) That's what I did eventually.

--
			Gleb.
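
For illustration, here is a minimal sketch of what need_remote_flush() could look like with the change discussed above -- XORing with shadow_nx_mask and folding shadow_x_mask into the compared permission bits. This is only a sketch of the idea, not the patch that was eventually applied; it ORs shadow_x_mask in at the point of use rather than redefining PT64_PERM_MASK.

/*
 * Sketch only: shadow_nx_mask is PT64_NX_MASK when legacy shadow paging
 * is used and 0 when EPT is used; shadow_x_mask is non-zero only for EPT,
 * where execute permission is granted positively rather than via an NX bit.
 */
static bool need_remote_flush(u64 old, u64 new)
{
	if (!is_shadow_present_pte(old))
		return false;
	if (!is_shadow_present_pte(new))
		return true;
	if ((old ^ new) & PT64_BASE_ADDR_MASK)
		return true;
	/* Flip the NX bit so it reads as a positive execute permission below. */
	old ^= shadow_nx_mask;
	new ^= shadow_nx_mask;
	/* Include the positive EPT execute bit in the permissions compared. */
	return (old & ~new & (PT64_PERM_MASK | shadow_x_mask)) != 0;
}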

Patch

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4c4274d..b5273c3 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3494,6 +3494,11 @@  static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp
 	return mmu->last_pte_bitmap & (1 << index);
 }
 
+#define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE PTTYPE_EPT
+#include "paging_tmpl.h"
+#undef PTTYPE
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 2c2f635..762c904 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -23,6 +23,13 @@ 
  * so the code in this file is compiled twice, once per pte size.
  */
 
+/*
+ * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro
+ * uses for EPT without A/D paging type.
+ */
+extern u64 __pure __using_nonexistent_pte_bit(void)
+	       __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT");
+
 #if PTTYPE == 64
 	#define pt_element_t u64
 	#define guest_walker guest_walker64
@@ -58,6 +65,21 @@ 
 	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
 	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
 	#define CMPXCHG cmpxchg
+#elif PTTYPE == PTTYPE_EPT
+	#define pt_element_t u64
+	#define guest_walker guest_walkerEPT
+	#define FNAME(name) ept_##name
+	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+	#define PT_LEVEL_BITS PT64_LEVEL_BITS
+	#define PT_GUEST_ACCESSED_MASK 0
+	#define PT_GUEST_DIRTY_MASK 0
+	#define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit()
+	#define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit()
+	#define CMPXCHG cmpxchg64
+	#define PT_MAX_FULL_LEVELS 4
 #else
 	#error Invalid PTTYPE value
 #endif
@@ -115,7 +137,11 @@  static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
 
 static inline int FNAME(is_present_gpte)(unsigned long pte)
 {
+#if PTTYPE != PTTYPE_EPT
 	return is_present_gpte(pte);
+#else
+	return pte & 7;
+#endif
 }
 
 static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
@@ -165,9 +191,14 @@  no_present:
 static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
 {
 	unsigned access;
-
+#if PTTYPE == PTTYPE_EPT
+	access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
+		((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
+		ACC_USER_MASK;
+#else
 	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
 	access &= ~(gpte >> PT64_NX_SHIFT);
+#endif
 
 	return access;
 }
@@ -369,6 +400,7 @@  static int FNAME(walk_addr)(struct guest_walker *walker,
 					access);
 }
 
+#if PTTYPE != PTTYPE_EPT
 static int FNAME(walk_addr_nested)(struct guest_walker *walker,
 				   struct kvm_vcpu *vcpu, gva_t addr,
 				   u32 access)
@@ -376,6 +408,7 @@  static int FNAME(walk_addr_nested)(struct guest_walker *walker,
 	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
 					addr, access);
 }
+#endif
 
 static bool
 FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -803,6 +836,7 @@  static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
 	return gpa;
 }
 
+#if PTTYPE != PTTYPE_EPT
 static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
 				      u32 access,
 				      struct x86_exception *exception)
@@ -821,6 +855,7 @@  static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
 
 	return gpa;
 }
+#endif
 
 /*
  * Using the cached information from sp->gfns is safe because: