[v4,06/13] nEPT: Add EPT tables support to paging_tmpl.h

Message ID 1374750001-28527-7-git-send-email-gleb@redhat.com (mailing list archive)
State New, archived

Commit Message

Gleb Natapov July 25, 2013, 10:59 a.m. UTC
From: Nadav Har'El <nyh@il.ibm.com>

This is the first patch in a series which adds nested EPT support to KVM's
nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use
EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
to set its own cr3 and take its own page faults without either of L0 or L1
getting involved. This often significantly improves L2's performance over the
previous two alternatives (shadow page tables over EPT, and shadow page
tables over shadow page tables).

This patch adds EPT support to paging_tmpl.h.

paging_tmpl.h contains the code for reading and writing page tables. The code
for 32-bit and 64-bit tables is very similar, but not identical, so
paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once
with PTTYPE=64, and this generates the two sets of similar functions.

There are subtle but important differences between the format of EPT tables
and that of ordinary x86 64-bit page tables, so for nested EPT we need a
third set of functions to read the guest EPT table and to write the shadow
EPT table.

So this patch adds a third PTTYPE, PTTYPE_EPT, which creates functions (prefixed
with "EPT") which correctly read and write EPT tables.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
Signed-off-by: Xinhao Xu <xinhao.xu@intel.com>
Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/mmu.c         |    5 +++++
 arch/x86/kvm/paging_tmpl.h |   43 +++++++++++++++++++++++++++++++++++++++----
 2 files changed, 44 insertions(+), 4 deletions(-)

Comments

Paolo Bonzini July 29, 2013, 9:48 a.m. UTC | #1
On 25/07/2013 12:59, Gleb Natapov wrote:
> From: Nadav Har'El <nyh@il.ibm.com>
> 
> This is the first patch in a series which adds nested EPT support to KVM's
> nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use
> EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
> to set its own cr3 and take its own page faults without either of L0 or L1
> getting involved. This often significantly improves L2's performance over the
> previous two alternatives (shadow page tables over EPT, and shadow page
> tables over shadow page tables).
> 
> This patch adds EPT support to paging_tmpl.h.
> 
> paging_tmpl.h contains the code for reading and writing page tables. The code
> for 32-bit and 64-bit tables is very similar, but not identical, so
> paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once
> with PTTYPE=64, and this generates the two sets of similar functions.
> 
> There are subtle but important differences between the format of EPT tables
> and that of ordinary x86 64-bit page tables, so for nested EPT we need a
> third set of functions to read the guest EPT table and to write the shadow
> EPT table.
> 
> So this patch adds a third PTTYPE, PTTYPE_EPT, which creates functions (prefixed
> with "EPT") which correctly read and write EPT tables.
> 
> Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
> Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
> Signed-off-by: Xinhao Xu <xinhao.xu@intel.com>
> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
> Signed-off-by: Gleb Natapov <gleb@redhat.com>
> ---
>  arch/x86/kvm/mmu.c         |    5 +++++
>  arch/x86/kvm/paging_tmpl.h |   43 +++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 44 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 4c4274d..b5273c3 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3494,6 +3494,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp
>  	return mmu->last_pte_bitmap & (1 << index);
>  }
>  
> +#define PTTYPE_EPT 18 /* arbitrary */
> +#define PTTYPE PTTYPE_EPT
> +#include "paging_tmpl.h"
> +#undef PTTYPE
> +
>  #define PTTYPE 64
>  #include "paging_tmpl.h"
>  #undef PTTYPE
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index 7581395..e38b3c0 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -58,6 +58,21 @@
>  	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
>  	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
>  	#define CMPXCHG cmpxchg
> +#elif PTTYPE == PTTYPE_EPT
> +	#define pt_element_t u64
> +	#define guest_walker guest_walkerEPT
> +	#define FNAME(name) ept_##name
> +	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
> +	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
> +	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
> +	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
> +	#define PT_LEVEL_BITS PT64_LEVEL_BITS
> +	#define PT_GUEST_ACCESSED_MASK 0
> +	#define PT_GUEST_DIRTY_MASK 0
> +	#define PT_GUEST_DIRTY_SHIFT 0
> +	#define PT_GUEST_ACCESSED_SHIFT 0
> +	#define CMPXCHG cmpxchg64
> +	#define PT_MAX_FULL_LEVELS 4
>  #else
>  	#error Invalid PTTYPE value
>  #endif
> @@ -90,6 +105,10 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
>  
>  static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
>  {
> +#if PT_GUEST_DIRTY_MASK == 0
> +	/* dirty bit is not supported, so no need to track it */
> +	return;
> +#else
>  	unsigned mask;
>  
>  	BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
> @@ -99,6 +118,7 @@ static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
>  	mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
>  		PT_WRITABLE_MASK;
>  	*access &= mask;
> +#endif

Please put this #if/#else/#endif in the previous patch.  (See also
below on leaving out protect_clean_gpte altogether).

You probably should also have a

	BUILD_BUG_ON(PT_GUEST_DIRTY_SHIFT < PT_WRITABLE_SHIFT);

in the #else branch.
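
Concretely, something like this, sketched against the hunk quoted above:

	#else
		unsigned mask;

		/* the fold below shifts right by
		   PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT, which is
		   undefined if the difference ever becomes negative */
		BUILD_BUG_ON(PT_GUEST_DIRTY_SHIFT < PT_WRITABLE_SHIFT);
		...
	#endif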

>  }
>  
>  static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
> @@ -111,7 +131,11 @@ static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
>  
>  static inline int FNAME(is_present_gpte)(unsigned long pte)
>  {
> +#if PTTYPE != PTTYPE_EPT
>  	return is_present_gpte(pte);
> +#else
> +	return pte & 7;
> +#endif
>  }
>  
>  static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
> @@ -147,7 +171,8 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
>  	if (!FNAME(is_present_gpte)(gpte))
>  		goto no_present;
>  
> -	if (!(gpte & PT_GUEST_ACCESSED_MASK))
> +	/* if accessed bit is not supported prefetch non accessed gpte */
> +	if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK))

Same for this hunk.  Please put it in the previous patch.

>  		goto no_present;
>  
>  	return false;
> @@ -160,9 +185,14 @@ no_present:
>  static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
>  {
>  	unsigned access;
> -
> +#if PTTYPE == PTTYPE_EPT
> +	BUILD_BUG_ON(ACC_WRITE_MASK != VMX_EPT_WRITABLE_MASK);
> +	access = (gpte & VMX_EPT_WRITABLE_MASK) | ACC_USER_MASK |
> +		((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0);
> +#else
>  	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
>  	access &= ~(gpte >> PT64_NX_SHIFT);
> +#endif
>  
>  	return access;
>  }
> @@ -212,7 +242,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
>  				    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
>  				    gva_t addr, u32 access)
>  {
> -	int ret;
>  	pt_element_t pte;
>  	pt_element_t __user *uninitialized_var(ptep_user);
>  	gfn_t table_gfn;
> @@ -322,7 +351,9 @@ retry_walk:
>  		accessed_dirty &= pte >>
>  			(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
> -	if (unlikely(!accessed_dirty)) {
> +	if (PT_GUEST_DIRTY_MASK && unlikely(!accessed_dirty)) {
> +		int ret;
> +
>  		ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);

I think the whole block of code starting at

        if (!write_fault)
                protect_clean_gpte(&pte_access, pte);
        else
                /*
                 * On a write fault, fold the dirty bit into accessed_dirty by
                 * shifting it one place right.
                 */
 		accessed_dirty &= pte >>
 			(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);

should be compiled out (in the previous patch) if dirty bits are not in use.
The "then" branch does nothing in that case, and the "else" branch is dead
code that makes no sense.

Once you do this, you can add a

	BUILD_BUG_ON(PT_GUEST_DIRTY_SHIFT < PT_GUEST_ACCESSED_SHIFT);

before the shift.

Please check if, with these changes, you can avoid defining
PT_GUEST_{DIRTY,ACCESSED}_SHIFT altogether in the EPT case.
This is safer because you are sure you left no undefined
behaviors when a bit is being folded onto another.

In principle, with these changes you could leave protect_clean_gpte in mmu.c.
I'm not sure what is the cleanest thing to do there, so I'll leave that to
your judgement.

Paolo

>  		if (unlikely(ret < 0))
>  			goto error;
> @@ -359,6 +390,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
>  					access);
>  }
>  
> +#if PTTYPE != PTTYPE_EPT
>  static int FNAME(walk_addr_nested)(struct guest_walker *walker,
>  				   struct kvm_vcpu *vcpu, gva_t addr,
>  				   u32 access)
> @@ -366,6 +398,7 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
>  	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
>  					addr, access);
>  }
> +#endif
>  
>  static bool
>  FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> @@ -793,6 +826,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
>  	return gpa;
>  }
>  
> +#if PTTYPE != PTTYPE_EPT
>  static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
>  				      u32 access,
>  				      struct x86_exception *exception)
> @@ -811,6 +845,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
>  
>  	return gpa;
>  }
> +#endif
>  
>  /*
>   * Using the cached information from sp->gfns is safe because:
> 

Gleb Natapov July 29, 2013, 11:33 a.m. UTC | #2
On Mon, Jul 29, 2013 at 11:48:06AM +0200, Paolo Bonzini wrote:
> On 25/07/2013 12:59, Gleb Natapov wrote:
> > [...]
> > 
> > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > index 4c4274d..b5273c3 100644
> > --- a/arch/x86/kvm/mmu.c
> > +++ b/arch/x86/kvm/mmu.c
> > @@ -3494,6 +3494,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp
> >  	return mmu->last_pte_bitmap & (1 << index);
> >  }
> >  
> > +#define PTTYPE_EPT 18 /* arbitrary */
> > +#define PTTYPE PTTYPE_EPT
> > +#include "paging_tmpl.h"
> > +#undef PTTYPE
> > +
> >  #define PTTYPE 64
> >  #include "paging_tmpl.h"
> >  #undef PTTYPE
> > diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> > index 7581395..e38b3c0 100644
> > --- a/arch/x86/kvm/paging_tmpl.h
> > +++ b/arch/x86/kvm/paging_tmpl.h
> > @@ -58,6 +58,21 @@
> >  	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
> >  	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
> >  	#define CMPXCHG cmpxchg
> > +#elif PTTYPE == PTTYPE_EPT
> > +	#define pt_element_t u64
> > +	#define guest_walker guest_walkerEPT
> > +	#define FNAME(name) ept_##name
> > +	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
> > +	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
> > +	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
> > +	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
> > +	#define PT_LEVEL_BITS PT64_LEVEL_BITS
> > +	#define PT_GUEST_ACCESSED_MASK 0
> > +	#define PT_GUEST_DIRTY_MASK 0
> > +	#define PT_GUEST_DIRTY_SHIFT 0
> > +	#define PT_GUEST_ACCESSED_SHIFT 0
> > +	#define CMPXCHG cmpxchg64
> > +	#define PT_MAX_FULL_LEVELS 4
> >  #else
> >  	#error Invalid PTTYPE value
> >  #endif
> > @@ -90,6 +105,10 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
> >  
> >  static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
> >  {
> > +#if PT_GUEST_DIRTY_MASK == 0
> > +	/* dirty bit is not supported, so no need to track it */
> > +	return;
> > +#else
> >  	unsigned mask;
> >  
> >  	BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
> > @@ -99,6 +118,7 @@ static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
> >  	mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
> >  		PT_WRITABLE_MASK;
> >  	*access &= mask;
> > +#endif
> 
> Please put this #if/#else/#endif in the previous patch.  (See also
> below on leaving out protect_clean_gpte altogether).
> 
Why? This change does not make much sense before EPT is introduced. The
previous patch is just a rename that should be easily verifiable by any
reviewer to be a NOP.

> You probably should also have a
> 
> 	BUILD_BUG_ON(PT_GUEST_DIRTY_SHIFT < PT_WRITABLE_SHIFT);
> 
> in the #else branch.
> 
> >  }
> >  
> >  static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
> > @@ -111,7 +131,11 @@ static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
> >  
> >  static inline int FNAME(is_present_gpte)(unsigned long pte)
> >  {
> > +#if PTTYPE != PTTYPE_EPT
> >  	return is_present_gpte(pte);
> > +#else
> > +	return pte & 7;
> > +#endif
> >  }
> >  
> >  static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
> > @@ -147,7 +171,8 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
> >  	if (!FNAME(is_present_gpte)(gpte))
> >  		goto no_present;
> >  
> > -	if (!(gpte & PT_GUEST_ACCESSED_MASK))
> > +	/* if accessed bit is not supported prefetch non accessed gpte */
> > +	if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK))
> 
> Same for this hunk.  Please put it in the previous patch.
> 
> >  		goto no_present;
> >  
> >  	return false;
> > @@ -160,9 +185,14 @@ no_present:
> >  static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
> >  {
> >  	unsigned access;
> > -
> > +#if PTTYPE == PTTYPE_EPT
> > +	BUILD_BUG_ON(ACC_WRITE_MASK != VMX_EPT_WRITABLE_MASK);
> > +	access = (gpte & VMX_EPT_WRITABLE_MASK) | ACC_USER_MASK |
> > +		((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0);
> > +#else
> >  	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
> >  	access &= ~(gpte >> PT64_NX_SHIFT);
> > +#endif
> >  
> >  	return access;
> >  }
> > @@ -212,7 +242,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
> >  				    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
> >  				    gva_t addr, u32 access)
> >  {
> > -	int ret;
> >  	pt_element_t pte;
> >  	pt_element_t __user *uninitialized_var(ptep_user);
> >  	gfn_t table_gfn;
> > @@ -322,7 +351,9 @@ retry_walk:
> >  		accessed_dirty &= pte >>
> >  			(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
> > -	if (unlikely(!accessed_dirty)) {
> > +	if (PT_GUEST_DIRTY_MASK && unlikely(!accessed_dirty)) {
> > +		int ret;
> > +
> >  		ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
> 
> I think the whole block of code starting at
> 
>         if (!write_fault)
>                 protect_clean_gpte(&pte_access, pte);
>         else
>                 /*
>                  * On a write fault, fold the dirty bit into accessed_dirty by
>                  * shifting it one place right.
>                  */
>  		accessed_dirty &= pte >>
>  			(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
> 
> should be compiled out (in the previous patch) if dirty bits are not in use.
> The "then" branch does nothing in that case, and the "else" branch is dead
> code that makes no sense.
> 
I disagree; the ifdef was there and it was ugly. protect_clean_gpte
and update_accessed_dirty_bits had to be ifdefed too. The compiler should be
smart enough to get rid of all of this code when PT_GUEST_DIRTY_MASK is 0.
Doing it like that was Xiao's idea and it looks much nicer.
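
(The pattern relied on here, sketched from the hunk above:
PT_GUEST_DIRTY_MASK is a compile-time constant, so for EPT the
condition is constant-false and the optimizer discards the whole
block, no #ifdef needed.)

	if (PT_GUEST_DIRTY_MASK && unlikely(!accessed_dirty)) {
		int ret;

		/* never emitted when PT_GUEST_DIRTY_MASK == 0 */
		ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker,
							write_fault);
		...
	}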

> Once you do this, you can add a
> 
> 	BUILD_BUG_ON(PT_GUEST_DIRTY_SHIFT < PT_GUEST_ACCESSED_SHIFT);
> 
> before the shift.
> 
> Please check if, with these changes, you can avoid defining
> PT_GUEST_{DIRTY,ACCESSED}_SHIFT altogether in the EPT case.
> This is safer because you are sure you left no undefined
> behaviors when a bit is being folded onto another.
You basically ask me to go back to the patch as it was before I
addressed Xiao's comment and add some more ifdefs, because previously not
all places where A/D bits were used were protected by it. IMO this would
be a step backward, especially as the method in this patch series is a
preparation for A/D support for EPT. When those bits are supported with
EPT they are different than in regular page tables.

> 
> In principle, with these changes you could leave protect_clean_gpte in mmu.c.
Only if I ifdef all other uses of it in the file.

> I'm not sure what is the cleanest thing to do there, so I'll leave that to
> your judgement.
> 
> Paolo
> 
> > [...]

--
			Gleb.
Paolo Bonzini July 29, 2013, 11:55 a.m. UTC | #3
On 29/07/2013 13:33, Gleb Natapov wrote:
> On Mon, Jul 29, 2013 at 11:48:06AM +0200, Paolo Bonzini wrote:
>> On 25/07/2013 12:59, Gleb Natapov wrote:
>>> [...]
>>>
>>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>>> index 4c4274d..b5273c3 100644
>>> --- a/arch/x86/kvm/mmu.c
>>> +++ b/arch/x86/kvm/mmu.c
>>> @@ -3494,6 +3494,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp
>>>  	return mmu->last_pte_bitmap & (1 << index);
>>>  }
>>>  
>>> +#define PTTYPE_EPT 18 /* arbitrary */
>>> +#define PTTYPE PTTYPE_EPT
>>> +#include "paging_tmpl.h"
>>> +#undef PTTYPE
>>> +
>>>  #define PTTYPE 64
>>>  #include "paging_tmpl.h"
>>>  #undef PTTYPE
>>> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
>>> index 7581395..e38b3c0 100644
>>> --- a/arch/x86/kvm/paging_tmpl.h
>>> +++ b/arch/x86/kvm/paging_tmpl.h
>>> @@ -58,6 +58,21 @@
>>>  	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
>>>  	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
>>>  	#define CMPXCHG cmpxchg
>>> +#elif PTTYPE == PTTYPE_EPT
>>> +	#define pt_element_t u64
>>> +	#define guest_walker guest_walkerEPT
>>> +	#define FNAME(name) ept_##name
>>> +	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
>>> +	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
>>> +	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
>>> +	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
>>> +	#define PT_LEVEL_BITS PT64_LEVEL_BITS
>>> +	#define PT_GUEST_ACCESSED_MASK 0
>>> +	#define PT_GUEST_DIRTY_MASK 0
>>> +	#define PT_GUEST_DIRTY_SHIFT 0
>>> +	#define PT_GUEST_ACCESSED_SHIFT 0
>>> +	#define CMPXCHG cmpxchg64
>>> +	#define PT_MAX_FULL_LEVELS 4
>>>  #else
>>>  	#error Invalid PTTYPE value
>>>  #endif
>>> @@ -90,6 +105,10 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
>>>  
>>>  static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
>>>  {
>>> +#if PT_GUEST_DIRTY_MASK == 0
>>> +	/* dirty bit is not supported, so no need to track it */
>>> +	return;
>>> +#else
>>>  	unsigned mask;
>>>  
>>>  	BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
>>> @@ -99,6 +118,7 @@ static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
>>>  	mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
>>>  		PT_WRITABLE_MASK;
>>>  	*access &= mask;
>>> +#endif
>>
>> Please put this #if/#else/#endif in the previous patch.  (See also
>> below on leaving out protect_clean_gpte altogether).
>>
> Why? This change does not make much sense before EPT is introduced. The
> previous patch is just a rename that should be easily verifiable by any
> reviewer to be a NOP.

My initial impression of this patch was "everything's ready after the
previous patch, you just have to set the mask to 0".  Which is not quite
true.  Maybe you need three patches instead of two.

>>         if (!write_fault)
>>                 protect_clean_gpte(&pte_access, pte);
>>         else
>>                 /*
>>                  * On a write fault, fold the dirty bit into accessed_dirty by
>>                  * shifting it one place right.
>>                  */
>>  		accessed_dirty &= pte >>
>>  			(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
>>
>> should be compiled out (in the previous patch) if dirty bits are not in use.
>> The "then" branch does nothing in that case, and the "else" branch is dead
>> code that makes no sense.
>
> I disagree; the ifdef was there and it was ugly. protect_clean_gpte
> and update_accessed_dirty_bits had to be ifdefed too.

protect_clean_gpte is still #ifdef'ed, so we're talking of a net gain of
2 lines.

Something like this:

+       /* if dirty bit is not supported, no need to track it */
+#if PT_GUEST_DIRTY_MASK == 0
        if (!write_fault)
                 protect_clean_gpte(&pte_access, pte);
...
        if (unlikely(!accessed_dirty)) {
...
        }
+#endif

doesn't look bad at all.  With the old check on EPT it looked ugly, but
with the new check on PT_GUEST_DIRTY_MASK it is quite natural.  Also
because you have anyway a reference to PT_GUEST_DIRTY_MASK in the "if".
 If I see

        if (!write_fault)
                protect_clean_gpte(&pte_access, pte);
        else
                /*
                 * On a write fault, fold the dirty bit into
		 * accessed_dirty by
                 * shifting it one place right.
                 */
                accessed_dirty &=
			pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);

        if (PT_GUEST_DIRTY_MASK != 0 && unlikely(!accessed_dirty)) {

the obvious reaction is "what, is there a case where I'm using
accessed_dirty if PT_GUEST_DIRTY_MASK == 0?"  Of course it makes sense
to leave the otherwise dead assignments to accessed_dirty in the loop
(the compiler removes them anyway); but here the assignment is too close
to the user to not raise such questions.

Perhaps it's obvious to you because you're more familiar with this code.

> The compiler should be
> smart enough to get rid of all of this code when PT_GUEST_DIRTY_MASK is 0.
> Doing it like that was Xiao's idea and it looks much nicer.
> 
>> Once you do this, you can add a
>>
>> 	BUILD_BUG_ON(PT_GUEST_DIRTY_SHIFT < PT_GUEST_ACCESSED_SHIFT);
>>
>> before the shift.
>>
>> Please check if, with these changes, you can avoid defining
>> PT_GUEST_{DIRTY,ACCESSED}_SHIFT altogether in the EPT case.
>> This is safer because you are sure you left no undefined
>> behaviors when a bit is being folded onto another.
> You basically ask me to go back to the patch as it was before I
> addressed Xiao's comment and add some more ifdefs, because previously not
> all places where A/D bits were used were protected by it. IMO this would
> be a step backward, especially as the method in this patch series is a
> preparation for A/D support for EPT. When those bits are supported with
> EPT they are different than in regular page tables.

Yes, and if they put the dirty bit below the accessed bit (rather
than above), you will silently get undefined behavior for a shift by a
negative value.  Adding BUILD_BUG_ONs for this, and ensuring
PT_GUEST_{DIRTY,ACCESSED}_SHIFT are never used in the EPT case, is
important in my opinion.
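
(A sketch of such a guard, placed right before the fold:)

	/* a shift by a negative amount is undefined behavior, so make
	   sure the dirty bit can never move below the accessed bit */
	BUILD_BUG_ON(PT_GUEST_DIRTY_SHIFT < PT_GUEST_ACCESSED_SHIFT);
	accessed_dirty &= pte >>
		(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);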

I very much like what you did in the previous patch, allowing
customization of the masks instead of using EPT ifdefs all over the
place.  On the other hand, once you did that the benefits of Xiao's
proposed change pretty much go away.

>> In principle, with these changes you could leave protect_clean_gpte in mmu.c.
> 
> Only if I ifdef all other uses of it in the file.

Yeah, scrap that.

Paolo

Gleb Natapov July 29, 2013, 12:24 p.m. UTC | #4
On Mon, Jul 29, 2013 at 01:55:46PM +0200, Paolo Bonzini wrote:
> On 29/07/2013 13:33, Gleb Natapov wrote:
> > On Mon, Jul 29, 2013 at 11:48:06AM +0200, Paolo Bonzini wrote:
> >> On 25/07/2013 12:59, Gleb Natapov wrote:
> >>> [...]
> >>>
> >>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> >>> index 4c4274d..b5273c3 100644
> >>> --- a/arch/x86/kvm/mmu.c
> >>> +++ b/arch/x86/kvm/mmu.c
> >>> @@ -3494,6 +3494,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp
> >>>  	return mmu->last_pte_bitmap & (1 << index);
> >>>  }
> >>>  
> >>> +#define PTTYPE_EPT 18 /* arbitrary */
> >>> +#define PTTYPE PTTYPE_EPT
> >>> +#include "paging_tmpl.h"
> >>> +#undef PTTYPE
> >>> +
> >>>  #define PTTYPE 64
> >>>  #include "paging_tmpl.h"
> >>>  #undef PTTYPE
> >>> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> >>> index 7581395..e38b3c0 100644
> >>> --- a/arch/x86/kvm/paging_tmpl.h
> >>> +++ b/arch/x86/kvm/paging_tmpl.h
> >>> @@ -58,6 +58,21 @@
> >>>  	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
> >>>  	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
> >>>  	#define CMPXCHG cmpxchg
> >>> +#elif PTTYPE == PTTYPE_EPT
> >>> +	#define pt_element_t u64
> >>> +	#define guest_walker guest_walkerEPT
> >>> +	#define FNAME(name) ept_##name
> >>> +	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
> >>> +	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
> >>> +	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
> >>> +	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
> >>> +	#define PT_LEVEL_BITS PT64_LEVEL_BITS
> >>> +	#define PT_GUEST_ACCESSED_MASK 0
> >>> +	#define PT_GUEST_DIRTY_MASK 0
> >>> +	#define PT_GUEST_DIRTY_SHIFT 0
> >>> +	#define PT_GUEST_ACCESSED_SHIFT 0
> >>> +	#define CMPXCHG cmpxchg64
> >>> +	#define PT_MAX_FULL_LEVELS 4
> >>>  #else
> >>>  	#error Invalid PTTYPE value
> >>>  #endif
> >>> @@ -90,6 +105,10 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
> >>>  
> >>>  static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
> >>>  {
> >>> +#if PT_GUEST_DIRTY_MASK == 0
> >>> +	/* dirty bit is not supported, so no need to track it */
> >>> +	return;
> >>> +#else
> >>>  	unsigned mask;
> >>>  
> >>>  	BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
> >>> @@ -99,6 +118,7 @@ static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
> >>>  	mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
> >>>  		PT_WRITABLE_MASK;
> >>>  	*access &= mask;
> >>> +#endif
> >>
> >> Please put this #if/#else/#endif in the previous patch.  (See also
> >> below on leaving out protect_clean_gpte altogether).
> >>
> > Why? This change does not make much sense before EPT is introduced. The
> > previous patch is just a rename that should be easily verifiable by any
> > reviewer to be a NOP.
> 
> My initial impression of this patch was "everything's ready after the
> previous patch, you just have to set the mask to 0".  Which is not quite
> true.  Maybe you need three patches instead of two.
> 
Or change the commit message for patch 5 to make it more clear that it is a
preparation patch?

> >>         if (!write_fault)
> >>                 protect_clean_gpte(&pte_access, pte);
> >>         else
> >>                 /*
> >>                  * On a write fault, fold the dirty bit into accessed_dirty by
> >>                  * shifting it one place right.
> >>                  */
> >>  		accessed_dirty &= pte >>
> >>  			(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
> >>
> >> should be compiled out (in the previous patch) if dirty bits are not in use.
> >> The "then" branch does nothing in that case, and the "else" branch is dead
> >> code that makes no sense.
> >
> > I disagree; the ifdef was there and it was ugly. protect_clean_gpte
> > and update_accessed_dirty_bits had to be ifdefed too.
> 
> protect_clean_gpte is still #ifdef'ed, so we're talking of a net gain of
> 2 lines.
Please, no need to count lines :) protect_clean_gpte() _has_ an ifdef; it
is not ifdefed.

> 
> Something like this:
> 
> +       /* if dirty bit is not supported, no need to track it */
> +#if PT_GUEST_DIRTY_MASK == 0
>         if (!write_fault)
>                  protect_clean_gpte(&pte_access, pte);
> ...
>         if (unlikely(!accessed_dirty)) {
> ...
>         }
> +#endif
> 
I will have to do the same for update_accessed_dirty_bits(). The problem
with ifdefs is that they spread around.

> doesn't look bad at all.  With the old check on EPT it looked ugly, but
> with the new check on PT_GUEST_DIRTY_MASK it is quite natural.  Also
> because you have anyway a reference to PT_GUEST_DIRTY_MASK in the "if".
>  If I see
> 
>         if (!write_fault)
>                 protect_clean_gpte(&pte_access, pte);
>         else
>                 /*
>                  * On a write fault, fold the dirty bit into
> 		 * accessed_dirty by
>                  * shifting it one place right.
>                  */
>                 accessed_dirty &=
> 			pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);
> 
>         if (PT_GUEST_DIRTY_MASK != 0 && unlikely(!accessed_dirty)) {
> 
> the obvious reaction is "what, is there a case where I'm using
> accessed_dirty if PT_GUEST_DIRTY_MASK == 0?"  Of course it makes sense
In this case accessed_dirty has the correct value of 0 :) The if() below just
tells you that since A/D is not supported there is nothing to be done
about the zero value of accessed_dirty, but the value itself is correct!

> to leave the otherwise dead assignments to accessed_dirty in the loop
> (the compiler removes them anyway); but here the assignment is too close
> to the user to not raise such questions.
> 
I would think that since the assignment is close to the loop it is
_easier_ to see that it is dead, not harder.
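
(The pattern being discussed, sketched from walk_addr_generic:
accessed_dirty starts out as the mask of the bits to track and is
ANDed with every gpte during the walk, so with EPT's zero mask it
simply stays 0 and the folds become no-ops.)

	accessed_dirty = PT_GUEST_ACCESSED_MASK;	/* 0 for EPT */
	do {
		...
		accessed_dirty &= pte;	/* keep only the tracked bits */
	} while (!is_last_gpte(mmu, walker->level, pte));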

> Perhaps it's obvious to you because you're more familiar with this code.
> 
> > The compiler should be
> > smart enough to get rid of all of this code when PT_GUEST_DIRTY_MASK is 0.
> > Doing it like that was Xiao's idea and it looks much nicer.
> > 
> >> Once you do this, you can add a
> >>
> >> 	BUILD_BUG_ON(PT_GUEST_DIRTY_SHIFT < PT_GUEST_ACCESSED_SHIFT);
> >>
> >> before the shift.
> >>
> >> Please check if, with these changes, you can avoid defining
> >> PT_GUEST_{DIRTY,ACCESSED}_SHIFT altogether in the EPT case.
> >> This is safer because you are sure you left no undefined
> >> behaviors when a bit is being folded onto another.
> > You basically ask me to go back to the patch as it was before I
> > addressed Xiao's comment and add some more ifdefs, because previously not
> > all places where A/D bits were used were protected by it. IMO this would
> > be a step backward, especially as the method in this patch series is a
> > preparation for A/D support for EPT. When those bits are supported with
> > EPT they are different than in regular page tables.
> 
> Yes, and if they will put the dirty bit below the accessed bit (rather
> than above), you will silently get undefined behavior for a shift by
> negative value.  Adding BUILD_BUG_ONs for this, and ensuring
> PT_GUEST_{DIRTY,ACCESSED}_SHIFT are never used in the EPT case, is
> important in my opinion.
> 
The format is defined. Dirty bit is above Accessed.

> I very much like what you did in the previous patch, allowing
> customization of the masks instead of using EPT ifdefs all over the
> place.  On the other hand, once you did that the benefits of Xiao's
> proposed change pretty much go away.
> 
> >> In principle, with these changes you could leave protect_clean_gpte in mmu.c.
> > 
> > Only if I ifdef all other uses of it in the file.
> 
> Yeah, scrap that.
> 
> Paolo

--
			Gleb.
Paolo Bonzini July 29, 2013, 1:19 p.m. UTC | #5
On 29/07/2013 14:24, Gleb Natapov wrote:
>> My initial impression of this patch was "everything's ready after the
>> previous patch, you just have to set the mask to 0".  Which is not quite
>> true.  Maybe you need three patches instead of two.
>>
> Or change the commit message for patch 5 to make it more clear that it is a
> preparation patch?

Or both.  Just give it a try.

>>
>> Something like this:
>>
>> +       /* if dirty bit is not supported, no need to track it */
>> +#if PT_GUEST_DIRTY_MASK == 0
>>         if (!write_fault)
>>                  protect_clean_gpte(&pte_access, pte);
>> ...
>>         if (unlikely(!accessed_dirty)) {
>> ...
>>         }
>> +#endif
>>
> I will have to do the same for update_accessed_dirty_bits(). The problem
> with ifdefs is that they spread around.

Putting update_accessed_dirty_bits() with "#ifdef do we have
accessed_dirty_bits at all" sounds just fine.

But if you do not like #ifdefs you can use __maybe_unused and the
compiler will elide it.
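
(A sketch of that alternative; the attribute just silences the
defined-but-unused warning, and an unreferenced static function is
dropped by the optimizer anyway. Signature taken from the call site
in the patch:)

	static int __maybe_unused
	FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
					  struct kvm_mmu *mmu,
					  struct guest_walker *walker,
					  int write_fault)
	{
		...
	}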

>> doesn't look bad at all.  With the old check on EPT it looked ugly, but
>> with the new check on PT_GUEST_DIRTY_MASK it is quite natural.  Also
>> because you have anyway a reference to PT_GUEST_DIRTY_MASK in the "if".
>>  If I see
>>
>>         if (!write_fault)
>>                 protect_clean_gpte(&pte_access, pte);
>>         else
>>                 /*
>>                  * On a write fault, fold the dirty bit into
>> 		 * accessed_dirty by
>>                  * shifting it one place right.
>>                  */
>>                 accessed_dirty &=
>> 			pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);
>>
>>         if (PT_GUEST_DIRTY_MASK != 0 && unlikely(!accessed_dirty)) {
>>
>> the obvious reaction is "what, is there a case where I'm using
>> accessed_dirty if PT_GUEST_DIRTY_MASK == 0?"  Of course it makes sense
> In this case accessed_dirty has the correct value of 0 :) The if() below just
> tells you that since A/D is not supported there is nothing to be done
> about the zero value of accessed_dirty, but the value itself is correct!

It is correct because accessed_dirty is initialized to 0.  But the "&"
with a bit taken out of thin air (bit 0 of the PTE)?  That's just
disgusting. :)

Paolo
Gleb Natapov July 29, 2013, 1:27 p.m. UTC | #6
On Mon, Jul 29, 2013 at 03:19:01PM +0200, Paolo Bonzini wrote:
> On 29/07/2013 14:24, Gleb Natapov wrote:
> >> My initial impression of this patch was "everything's ready after the
> >> previous patch, you just have to set the mask to 0".  Which is not quite
> >> true.  Maybe you need three patches instead of two.
> >>
> > Or change the commit message for patch 5 to make it more clear that it is a
> > preparation patch?
> 
> Or both.  Just give it a try.
> 
It is not hard to imagine without trying :) Will do.

> >>
> >> Something like this:
> >>
> >> +       /* if dirty bit is not supported, no need to track it */
> >> +#if PT_GUEST_DIRTY_MASK == 0
> >>         if (!write_fault)
> >>                  protect_clean_gpte(&pte_access, pte);
> >> ...
> >>         if (unlikely(!accessed_dirty)) {
> >> ...
> >>         }
> >> +#endif
> >>
> > I will have to do the same for update_accessed_dirty_bits(). The problem
> > with ifdefs is that they spread around.
> 
> Putting update_accessed_dirty_bits() with "#ifdef do we have
> accessed_dirty_bits at all" sounds just fine.
> 
> But if you do not like #ifdefs you can use __maybe_unused and the
> compiler will elide it.
> 
Though this is unnecessary. That's the point. If an #ifdef is unavoidable I
have no problem using it even though I dislike it, but in this case it
is just unnecessary.

> >> doesn't look bad at all.  With the old check on EPT it looked ugly, but
> >> with the new check on PT_GUEST_DIRTY_MASK it is quite natural.  Also
> >> because you have anyway a reference to PT_GUEST_DIRTY_MASK in the "if".
> >>  If I see
> >>
> >>         if (!write_fault)
> >>                 protect_clean_gpte(&pte_access, pte);
> >>         else
> >>                 /*
> >>                  * On a write fault, fold the dirty bit into
> >> 		 * accessed_dirty by
> >>                  * shifting it one place right.
> >>                  */
> >>                 accessed_dirty &=
> >> 			pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);
> >>
> >>         if (PT_GUEST_DIRTY_MASK != 0 && unlikely(!accessed_dirty)) {
> >>
> >> the obvious reaction is "what, is there a case where I'm using
> >> accessed_dirty if PT_GUEST_DIRTY_MASK == 0?"  Of course it makes sense
> > In this case accessed_dirty has the correct value of 0 :) The if() below just
> > tells you that since A/D is not supported there is nothing to be done
> > about the zero value of accessed_dirty, but the value itself is correct!
> 
> It is correct because accessed_dirty is initialized to 0.  But the "&"
> with a bit taken out of thin air (bit 0 of the PTE)?  That's just
> disgusting. :)
> 
Sorry to disgust you, but the code relies on this "&" trick with or
without the patch. It clears all unrelated bits from pte this way. No
new disgusting tricks are added by the patch.

--
			Gleb.
Paolo Bonzini July 29, 2013, 2:15 p.m. UTC | #7
On 29/07/2013 15:27, Gleb Natapov wrote:
>>>> doesn't look bad at all.  With the old check on EPT it looked ugly, but
>>>> with the new check on PT_GUEST_DIRTY_MASK it is quite natural.  Also
>>>> because you have anyway a reference to PT_GUEST_DIRTY_MASK in the "if".
>>>>  If I see
>>>>
>>>>         if (!write_fault)
>>>>                 protect_clean_gpte(&pte_access, pte);
>>>>         else
>>>>                 /*
>>>>                  * On a write fault, fold the dirty bit into
>>>> 		 * accessed_dirty by
>>>>                  * shifting it one place right.
>>>>                  */
>>>>                 accessed_dirty &=
>>>> 			pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);
>>>>
>>>>         if (PT_GUEST_DIRTY_MASK != 0 && unlikely(!accessed_dirty)) {
>>>>
>>>> the obvious reaction is "what, is there a case where I'm using
>>>> accessed_dirty if PT_GUEST_DIRTY_MASK == 0?"  Of course it makes sense
>>> In this case accessed_dirty has the correct value of 0 :) The if() below just
>>> tells you that since A/D is not supported there is nothing to be done
>>> about the zero value of accessed_dirty, but the value itself is correct!
>>
>> It is correct because accessed_dirty is initialized to 0.  But the "&"
>> with a bit taken out of thin air (bit 0 of the PTE)?  That's just
>> disgusting. :)
>>
> Sorry to disgust you, but the code relies on this "&" trick with or
> without the patch. It clears all unrelated bits from pte this way. No
> new disgusting tricks are added by the patch.

Oh the code is not disgusting at all!  It is very nice to follow.

The new disgusting ;) trick is that here in the EPT case you're
effectively doing

	accessed_dirty &= pte;

where bit 0 is the "R" bit (iirc) and has absolutely nothing to do with
dirty or accessed.
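
(For reference: the low three bits of an EPT PTE are Read, Write and
eXecute, where an x86 PTE has Present, Writable and User. An EPT entry
is present iff any of R/W/X is set, which is what the patch's
"pte & 7" test encodes; spelled out with the mask names from vmx.h,
assuming VMX_EPT_READABLE_MASK is defined there alongside the two
masks the patch already uses:)

	/* equivalent to the patch's "pte & 7" */
	return pte & (VMX_EPT_READABLE_MASK |	/* bit 0 */
		      VMX_EPT_WRITABLE_MASK |	/* bit 1 */
		      VMX_EPT_EXECUTABLE_MASK);	/* bit 2 */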

Paolo
Gleb Natapov July 29, 2013, 4:14 p.m. UTC | #8
On Mon, Jul 29, 2013 at 04:15:16PM +0200, Paolo Bonzini wrote:
> On 29/07/2013 15:27, Gleb Natapov wrote:
> >>>> doesn't look bad at all.  With the old check on EPT it looked ugly, but
> >>>> with the new check on PT_GUEST_DIRTY_MASK it is quite natural.  Also
> >>>> because you have anyway a reference to PT_GUEST_DIRTY_MASK in the "if".
> >>>>  If I see
> >>>>
> >>>>         if (!write_fault)
> >>>>                 protect_clean_gpte(&pte_access, pte);
> >>>>         else
> >>>>                 /*
> >>>>                  * On a write fault, fold the dirty bit into
> >>>> 		 * accessed_dirty by
> >>>>                  * shifting it one place right.
> >>>>                  */
> >>>>                 accessed_dirty &=
> >>>> 			pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);
> >>>>
> >>>>         if (PT_GUEST_DIRTY_MASK != 0 && unlikely(!accessed_dirty)) {
> >>>>
> >>>> the obvious reaction is "what, is there a case where I'm using
> >>>> accessed_dirty if PT_GUEST_DIRTY_MASK == 0?"  Of course it makes sense
> >>> In this case accessed_dirty has the correct value of 0 :) The if() below just
> >>> tells you that since A/D is not supported there is nothing to be done
> >>> about the zero value of accessed_dirty, but the value itself is correct!
> >>
> >> It is correct because accessed_dirty is initialized to 0.  But the "&"
> >> with a bit taken out of thin air (bit 0 of the PTE)?  That's just
> >> disgusting. :)
> >>
> > Sorry to disgust you, but the code relies on this "&" trick with or
> > without the patch. It clears all unrelated bits from pte this way. No
> > new disgusting tricks are added by the patch.
> 
> Oh the code is not disgusting at all!  It is very nice to follow.
> 
> The new disgusting ;) trick is that here in the EPT case you're
> effectively doing
> 
> 	accessed_dirty &= pte;
> 
> where bit 0 is the "R" bit (iirc) and has absolutely nothing to do with
> dirty or accessed.
> 
What does bit 0 have to do with anything? Non-EPT code after the shift
also has random bits, and EPT has bits at random places (R at the P
place, U at the R place); the trick is that accessed_dirty masks out
the bits we are not interested in and captures only those we want to
follow (accessed in the regular case, none in the EPT case). This is
exactly what the original code is doing, so they are either both
disgusting or both very nice to follow :)

--
			Gleb.
Paolo Bonzini July 29, 2013, 4:28 p.m. UTC | #9
On 29/07/2013 18:14, Gleb Natapov wrote:
>>>>>> > >>>>                 accessed_dirty &=
>>>>>> > >>>> 			pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);
>>>>>> > >>>>
>>>>>> > >>>>         if (PT_GUEST_DIRTY_MASK != 0 && unlikely(!accessed_dirty)) {
>>>>>> > >>>>
>>>>>> > >>>> the obvious reaction is "what, is there a case where I'm using
>>>>>> > >>>> accessed_dirty if PT_GUEST_DIRTY_MASK == 0?"  Of course it makes sense
>>>>> > >>> In this case accessed_dirty has the correct value of 0 :) The if() below just
>>>>> > >>> tells you that since A/D is not supported there is nothing to be done
>>>>> > >>> about the zero value of accessed_dirty, but the value itself is correct!
>>>> > >>
>>>> > >> It is correct because accessed_dirty is initialized to 0.  But the "&"
>>>> > >> with a bit taken out of thin air (bit 0 of the PTE)?  That's just
>>>> > >> disgusting. :)
>>>> > >>
>>> > > Sorry to disgust you, but the code relies on this "&" trick with or
>>> > > without the patch. It clears all unrelated bits from pte this way. No
>>> > > new disgusting tricks are added by the patch.
>> > 
>> > Oh the code is not disgusting at all!  It is very nice to follow.
>> > 
>> > The new disgusting ;) trick is that here in the EPT case you're
>> > effectively doing
>> > 
>> > 	accessed_dirty &= pte;
>> > 
>> > where bit 0 is the "R" bit (iirc) and has absolutely nothing to do with
>> > dirty or accessed.
>
> What does bit 0 have to do with anything? Non-EPT code after the shift
> also has random bits, and EPT has bits at random places (R at the P
> place, U at the R place); the trick is that accessed_dirty masks out
> the bits we are not interested in and captures only those we want to
> follow (accessed in the regular case, none in the EPT case). This is
> exactly what the original code is doing, so they are either both
> disgusting or both very nice to follow :)

The comment is clear: "fold the dirty bit into accessed_dirty by
shifting it one place right".  In the EPT case the comment makes no
sense and it is not obvious that you rely on accessed_dirty=0 even
before that line.

That's why I'd rather have that code out of the PT_GUEST_DIRTY_MASK==0 case.

Paolo
Gleb Natapov July 29, 2013, 4:43 p.m. UTC | #10
On Mon, Jul 29, 2013 at 06:28:37PM +0200, Paolo Bonzini wrote:
> On 29/07/2013 18:14, Gleb Natapov wrote:
> >>>>>> > >>>>                 accessed_dirty &=
> >>>>>> > >>>> 			pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);
> >>>>>> > >>>>
> >>>>>> > >>>>         if (PT_GUEST_DIRTY_MASK != 0 && unlikely(!accessed_dirty)) {
> >>>>>> > >>>>
> >>>>>> > >>>> the obvious reaction is "what, is there a case where I'm using
> >>>>>> > >>>> accessed_dirty if PT_GUEST_DIRTY_MASK == 0?"  Of course it makes sense
> >>>>> > >>> In this case accessed_dirty has the correct value of 0 :) The if() below just
> >>>>> > >>> tells you that since A/D is not supported there is nothing to be done
> >>>>> > >>> about the zero value of accessed_dirty, but the value itself is correct!
> >>>> > >>
> >>>> > >> It is correct because accessed_dirty is initialized to 0.  But the "&"
> >>>> > >> with a bit taken out of thin air (bit 0 of the PTE)?  That's just
> >>>> > >> disgusting. :)
> >>>> > >>
> >>> > > Sorry to disgust you, but the code relies on this "&" trick with or
> >>> > > without the patch. It clears all unrelated bits from pte this way. No
> >>> > > new disgusting tricks are added by the patch.
> >> > 
> >> > Oh the code is not disgusting at all!  It is very nice to follow.
> >> > 
> >> > The new disgusting ;) trick is that here in the EPT case you're
> >> > effectively doing
> >> > 
> >> > 	accessed_dirty &= pte;
> >> > 
> >> > where bit 0 is the "R" bit (iirc) and has absolutely nothing to do with
> >> > dirty or accessed.
> >
> > What does bit 0 have to do with anything? Non-EPT code after the shift
> > also has random bits, and EPT has bits at random places (R at the P
> > place, U at the R place); the trick is that accessed_dirty masks out
> > the bits we are not interested in and captures only those we want to
> > follow (accessed in the regular case, none in the EPT case). This is
> > exactly what the original code is doing, so they are either both
> > disgusting or both very nice to follow :)
> 
> The comment is clear: "fold the dirty bit into accessed_dirty by
> shifting it one place right".  In the EPT case the comment makes no
> sense and it is not obvious that you rely on accessed_dirty=0 even
> before that line.
It is not obvious that the code relies on accessed_dirty being initialized
to the bits the code wants to track at the start of the function. It
wasn't for me. With if() it would have been much clearer, but the
current way is faster. 

> 
> That's why I'd rather have that code out of the PT_GUEST_DIRTY_MASK==0 case.
> 
What problem does the current code have that you are trying to fix? What
_technical_ justification do you provide? There is no point in adding
ifdefs where they are clearly not needed, just because.

--
			Gleb.
Paolo Bonzini July 29, 2013, 5:06 p.m. UTC | #11
On 29/07/2013 18:43, Gleb Natapov wrote:
> On Mon, Jul 29, 2013 at 06:28:37PM +0200, Paolo Bonzini wrote:
>> On 29/07/2013 18:14, Gleb Natapov wrote:
>>>>>>>>>>>>>                 accessed_dirty &=
>>>>>>>>>>>>> 			pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);
>>>>>>>>>>>>>
>>>>>>>>>>>>>         if (PT_GUEST_DIRTY_MASK != 0 && unlikely(!accessed_dirty)) {
>>>>>>>>>>>>>
>>>>>>>>>>>>> the obvious reaction is "what, is there a case where I'm using
>>>>>>>>>>>>> accessed_dirty if PT_GUEST_DIRTY_MASK == 0?"  Of course it makes sense
>>>>>>>>>>> In this case accessed_dirty has the correct value of 0 :) The if() below just
>>>>>>>>>>> tells you that since A/D is not supported there is nothing to be done
>>>>>>>>>>> about the zero value of accessed_dirty, but the value itself is correct!
>>>>>>>>>
>>>>>>>>> It is correct because accessed_dirty is initialized to 0.  But the "&"
>>>>>>>>> with a bit taken out of thin air (bit 0 of the PTE)?  That's just
>>>>>>>>> disgusting. :)
>>>>>>>>>
>>>>>>> Sorry to disgust you, but the code relies on this "&" trick with or
>>>>>>> without the patch. It clears all unrelated bits from pte this way. No
>>>>>>> new disgusting tricks are added by the patch.
>>>>>
>>>>> Oh the code is not disgusting at all!  It is very nice to follow.
>>>>>
>>>>> The new disgusting ;) trick is that here in the EPT case you're
>>>>> effectively doing
>>>>>
>>>>> 	accessed_dirty &= pte;
>>>>>
>>>>> where bit 0 is the "R" bit (iirc) and has absolutely nothing to do with
>>>>> dirty or accessed.
>>>
>>> What does bit 0 have to do with anything? Non-EPT code after the shift
>>> also has random bits, and EPT has bits at random places (R at the P
>>> place, U at the R place); the trick is that accessed_dirty masks out
>>> the bits we are not interested in and captures only those we want to
>>> follow (accessed in the regular case, none in the EPT case). This is
>>> exactly what the original code is doing, so they are either both
>>> disgusting or both very nice to follow :)
>>
>> The comment is clear: "fold the dirty bit into accessed_dirty by
>> shifting it one place right".  In the EPT case the comment makes no
>> sense and it is not obvious that you rely on accessed_dirty=0 even
>> before that line.
> It is not obvious that the code relies on accessed_dirty being initialized
> to the bits the code wants to track at the start of the function. It
> wasn't for me. With if() it would have been much clearer, but the
> current way is faster. 

Sure it is not obvious.  But relying on the mask being zero is a whole
different level of non-obviousness.

>> That's why I'd rather have that code out of the PT_GUEST_DIRTY_MASK==0 case.
>>
> What problem does the current code have that you are trying to fix? What
> _technical_ justification do you provide?

Of course there is no technical justification.  Did I ever say otherwise?

> There is no point in adding ifdefs where they
> are clearly not needed, just because.

If you loathe ifdefs so much, you can of course wrap the whole code
we're talking about in an if().  That takes an extra level of
indentation, though.

Paolo
Gleb Natapov July 29, 2013, 5:11 p.m. UTC | #12
On Mon, Jul 29, 2013 at 07:06:21PM +0200, Paolo Bonzini wrote:
> On 29/07/2013 18:43, Gleb Natapov wrote:
> > On Mon, Jul 29, 2013 at 06:28:37PM +0200, Paolo Bonzini wrote:
> >> On 29/07/2013 18:14, Gleb Natapov wrote:
> >>>>>>>>>>>>>                 accessed_dirty &=
> >>>>>>>>>>>>> 			pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);
> >>>>>>>>>>>>>
> >>>>>>>>>>>>>         if (PT_GUEST_DIRTY_MASK != 0 && unlikely(!accessed_dirty)) {
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> the obvious reaction is "what, is there a case where I'm using
> >>>>>>>>>>>>> accessed_dirty if PT_GUEST_DIRTY_MASK == 0?"  Of course it makes sense
> >>>>>>>>>>> In this case accessed_dirty has the correct value of 0 :) The if() below just
> >>>>>>>>>>> tells you that since A/D is not supported there is nothing to be done
> >>>>>>>>>>> about the zero value of accessed_dirty, but the value itself is correct!
> >>>>>>>>>
> >>>>>>>>> It is correct because accessed_dirty is initialized to 0.  But the "&"
> >>>>>>>>> with a bit taken out of thin air (bit 0 of the PTE)?  That's just
> >>>>>>>>> disgusting. :)
> >>>>>>>>>
> >>>>>>> Sorry to disgust you, but the code relies on this "&" trick with or
> >>>>>>> without the patch. It clears all unrelated bits from pte this way. No
> >>>>>>> new disgusting tricks are added by the patch.
> >>>>>
> >>>>> Oh the code is not disgusting at all!  It is very nice to follow.
> >>>>>
> >>>>> The new disgusting ;) trick is that here in the EPT case you're
> >>>>> effectively doing
> >>>>>
> >>>>> 	accessed_dirty &= pte;
> >>>>>
> >>>>> where bit 0 is the "R" bit (iirc) and has absolutely nothing to do with
> >>>>> dirty or accessed.
> >>>
> >>> What does bit 0 have to do with anything? Non-EPT code also has random
> >>> bits after the shift, and EPT has bits at different places (R at the P
> >>> place, U at the R place); the trick is that accessed_dirty masks out the
> >>> bits we are not interested in and captures only those we want to follow
> >>> (accessed in the regular case, none in the EPT case). This is exactly
> >>> what the original code is doing, so they are either both disgusting or
> >>> both very nice to follow :)
> >>
> >> The comment is clear: "fold the dirty bit into accessed_dirty by
> >> shifting it one place right".  In the EPT case the comment makes no
> >> sense and it is not obvious that you rely on accessed_dirty=0 even
> >> before that line.
> > It is not obvious that the code relies on accessed_dirty being initialized
> > to the bits the code wants to track at the start of the function. It
> > wasn't for me. With if() it would have been much clearer, but the
> > current way is faster. 
> 
> Sure it is not obvious.  But relying on the mask being zero is a whole
> different level of non-obviousness.
> 
I disagree.

> >> That's why I'd rather have that code out of the PT_GUEST_DIRTY_MASK==0 case.
> >>
> > What problem does the current code have that you are trying to fix? What
> > _technical_ justification do you provide?
> 
> Of course there is no technical justification.  Did I ever say otherwise?
> 
I just want to be absolutely sure that we are bikeshedding here.

> > There is no point in adding ifdefs where they
> > are clearly not needed, just because.
> 
> If you loathe ifdefs so much, you can of course wrap the whole code
> we're talking about in an if().  That takes an extra level of
> indentation, though.
> 
And the point of doing it is...?

--
			Gleb.
Paolo Bonzini July 30, 2013, 10:03 a.m. UTC | #13
On 25/07/2013 12:59, Gleb Natapov wrote:
> From: Nadav Har'El <nyh@il.ibm.com>
> 
> This is the first patch in a series which adds nested EPT support to KVM's
> nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use
> EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
> to set its own cr3 and take its own page faults without either of L0 or L1
> getting involved. This often significantly improves L2's performance over the
> previous two alternatives (shadow page tables over EPT, and shadow page
> tables over shadow page tables).
> 
> This patch adds EPT support to paging_tmpl.h.
> 
> paging_tmpl.h contains the code for reading and writing page tables. The code
> for 32-bit and 64-bit tables is very similar, but not identical, so
> paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once
> with PTTYPE=64, and this generates the two sets of similar functions.
> 
> There are subtle but important differences between the format of EPT tables
> and that of ordinary x86 64-bit page tables, so for nested EPT we need a
> third set of functions to read the guest EPT table and to write the shadow
> EPT table.
> 
> So this patch adds a third PTTYPE, PTTYPE_EPT, which creates functions (prefixed
> with "EPT") which correctly read and write EPT tables.
> 
> Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
> Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
> Signed-off-by: Xinhao Xu <xinhao.xu@intel.com>
> Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
> Signed-off-by: Gleb Natapov <gleb@redhat.com>
> ---
>  arch/x86/kvm/mmu.c         |    5 +++++
>  arch/x86/kvm/paging_tmpl.h |   43 +++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 44 insertions(+), 4 deletions(-)

Ok, let's rewind and retry.

> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 4c4274d..b5273c3 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3494,6 +3494,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp
>  	return mmu->last_pte_bitmap & (1 << index);
>  }
>  
> +#define PTTYPE_EPT 18 /* arbitrary */
> +#define PTTYPE PTTYPE_EPT
> +#include "paging_tmpl.h"
> +#undef PTTYPE
> +
>  #define PTTYPE 64
>  #include "paging_tmpl.h"
>  #undef PTTYPE
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index 7581395..e38b3c0 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -58,6 +58,21 @@
>  	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
>  	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
>  	#define CMPXCHG cmpxchg
> +#elif PTTYPE == PTTYPE_EPT
> +	#define pt_element_t u64
> +	#define guest_walker guest_walkerEPT
> +	#define FNAME(name) ept_##name
> +	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
> +	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
> +	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
> +	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
> +	#define PT_LEVEL_BITS PT64_LEVEL_BITS
> +	#define PT_GUEST_ACCESSED_MASK 0
> +	#define PT_GUEST_DIRTY_MASK 0
> +	#define PT_GUEST_DIRTY_SHIFT 0
> +	#define PT_GUEST_ACCESSED_SHIFT 0
> +	#define CMPXCHG cmpxchg64
> +	#define PT_MAX_FULL_LEVELS 4
>  #else
>  	#error Invalid PTTYPE value
>  #endif

Please add a

BUILD_BUG_ON(!!PT_GUEST_ACCESSED_MASK != !!PT_GUEST_DIRTY_MASK);
#if PT_GUEST_ACCESSED_MASK
BUILD_BUG_ON(PT_GUEST_DIRTY_SHIFT <= PT_GUEST_ACCESSED_SHIFT);
BUILD_BUG_ON(PT_GUEST_DIRTY_SHIFT <= PT_WRITABLE_SHIFT);
BUILD_BUG_ON(PT_GUEST_DIRTY_MASK != (1 << PT_GUEST_DIRTY_SHIFT));
BUILD_BUG_ON(PT_GUEST_ACCESSED_MASK != (1 << PT_GUEST_ACCESSED_SHIFT));
#endif

here.

> @@ -90,6 +105,10 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
>  
>  static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
>  {
> +#if PT_GUEST_DIRTY_MASK == 0
> +	/* dirty bit is not supported, so no need to track it */
> +	return;
> +#else

Here you can use a regular "if" instead of the preprocessor if.  Also
please move this and all other PT_GUEST_*-related changes to a separate
patch.
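
Something along these lines, say (a sketch of the suggested shape; the
lines after the early return are the ones already in the function):

static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
{
	unsigned mask;

	/* dirty bit is not supported, so no need to track it */
	if (!PT_GUEST_DIRTY_MASK)
		return;

	BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);

	mask = (unsigned)~ACC_WRITE_MASK;
	/* Allow write access to dirty gptes */
	mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
		PT_WRITABLE_MASK;
	*access &= mask;
}

Since PT_GUEST_DIRTY_MASK is a compile-time constant, the dead branch
vanishes just as it does with the preprocessor version.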

>  	unsigned mask;
>  
>  	BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
> @@ -99,6 +118,7 @@ static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
>  	mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
>  		PT_WRITABLE_MASK;
>  	*access &= mask;
> +#endif
>  }
>  
>  static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
> @@ -111,7 +131,11 @@ static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
>  
>  static inline int FNAME(is_present_gpte)(unsigned long pte)
>  {
> +#if PTTYPE != PTTYPE_EPT
>  	return is_present_gpte(pte);
> +#else
> +	return pte & 7;
> +#endif
>  }
>  
>  static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
> @@ -147,7 +171,8 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
>  	if (!FNAME(is_present_gpte)(gpte))
>  		goto no_present;
>  
> -	if (!(gpte & PT_GUEST_ACCESSED_MASK))
> +	/* if accessed bit is not supported prefetch non accessed gpte */
> +	if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK))
>  		goto no_present;
>  
>  	return false;
> @@ -160,9 +185,14 @@ no_present:
>  static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
>  {
>  	unsigned access;
> -
> +#if PTTYPE == PTTYPE_EPT
> +	BUILD_BUG_ON(ACC_WRITE_MASK != VMX_EPT_WRITABLE_MASK);
> +	access = (gpte & VMX_EPT_WRITABLE_MASK) | ACC_USER_MASK |
> +		((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0);

You can use a ?: here even for writable.  The compiler will simplify (a
& b ? b : 0) to just "a & b" for single-bit b.
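
In other words, the EPT branch could read (a sketch; with both bits
handled by ?:, the BUILD_BUG_ON tying ACC_WRITE_MASK to
VMX_EPT_WRITABLE_MASK is no longer needed either):

	access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
		((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
		ACC_USER_MASK;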

> +#else
>  	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
>  	access &= ~(gpte >> PT64_NX_SHIFT);
> +#endif
>  
>  	return access;
>  }
> @@ -212,7 +242,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
>  				    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
>  				    gva_t addr, u32 access)
>  {
> -	int ret;

This change is not needed anymore since you removed the #ifdef.

>  	pt_element_t pte;
>  	pt_element_t __user *uninitialized_var(ptep_user);
>  	gfn_t table_gfn;
> @@ -322,7 +351,9 @@ retry_walk:
>  		accessed_dirty &= pte >>
>  			(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);

This shift is one of two things that bugged me.  I dislike including
"wrong" code just because it is dead.  Perhaps you can #define the
shifts to 8 and 9 already now, even if the masks stay 0?

Then when you implement nEPT A/D bits you only have to flip the masks
from 0 to (1 << PT_GUEST_*_SHIFT).
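
Sketched out, the EPT #elif block would keep the zero masks but pin the
shifts to the architectural positions of the EPT accessed and dirty flags
(bits 8 and 9 of an EPT entry):

	#define PT_GUEST_ACCESSED_SHIFT 8
	#define PT_GUEST_DIRTY_SHIFT 9
	#define PT_GUEST_ACCESSED_MASK 0
	#define PT_GUEST_DIRTY_MASK 0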

>  
> -	if (unlikely(!accessed_dirty)) {
> +	if (PT_GUEST_DIRTY_MASK && unlikely(!accessed_dirty)) {

If we want to drop the dead-code elimination burden on the compiler,
let's go all the way.

So, instead of this "if", please make update_accessed_dirty_bits return
0 at once if PT_GUEST_DIRTY_MASK == 0; this way you can do the same
thing for protect_clean_gpte and update_accessed_dirty_bits.
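
That is, roughly (a sketch showing only the new first lines of the
function; the rest of the body stays as it is):

static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
					     struct kvm_mmu *mmu,
					     struct guest_walker *walker,
					     int write_fault)
{
	/* no A/D bits in this mode: nothing to update, never a fault */
	if (!PT_GUEST_DIRTY_MASK)
		return 0;

The walker's test can then go back to a plain if (unlikely(!accessed_dirty)),
and the constant early return lets the compiler drop the whole path in the
EPT case.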

Paolo

> +		int ret;
> +
>  		ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
>  		if (unlikely(ret < 0))
>  			goto error;
> @@ -359,6 +390,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
>  					access);
>  }
>  
> +#if PTTYPE != PTTYPE_EPT
>  static int FNAME(walk_addr_nested)(struct guest_walker *walker,
>  				   struct kvm_vcpu *vcpu, gva_t addr,
>  				   u32 access)
> @@ -366,6 +398,7 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
>  	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
>  					addr, access);
>  }
> +#endif
>  
>  static bool
>  FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> @@ -793,6 +826,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
>  	return gpa;
>  }
>  
> +#if PTTYPE != PTTYPE_EPT
>  static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
>  				      u32 access,
>  				      struct x86_exception *exception)
> @@ -811,6 +845,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
>  
>  	return gpa;
>  }
> +#endif
>  
>  /*
>   * Using the cached information from sp->gfns is safe because:
> 

Gleb Natapov July 30, 2013, 11:56 a.m. UTC | #14
On Tue, Jul 30, 2013 at 12:03:07PM +0200, Paolo Bonzini wrote:
> On 25/07/2013 12:59, Gleb Natapov wrote:
> > From: Nadav Har'El <nyh@il.ibm.com>
> > 
> > This is the first patch in a series which adds nested EPT support to KVM's
> > nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use
> > EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
> > to set its own cr3 and take its own page faults without either of L0 or L1
> > getting involved. This often significantly improves L2's performance over the
> > previous two alternatives (shadow page tables over EPT, and shadow page
> > tables over shadow page tables).
> > 
> > This patch adds EPT support to paging_tmpl.h.
> > 
> > paging_tmpl.h contains the code for reading and writing page tables. The code
> > for 32-bit and 64-bit tables is very similar, but not identical, so
> > paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once
> > with PTTYPE=64, and this generates the two sets of similar functions.
> > 
> > There are subtle but important differences between the format of EPT tables
> > and that of ordinary x86 64-bit page tables, so for nested EPT we need a
> > third set of functions to read the guest EPT table and to write the shadow
> > EPT table.
> > 
> > So this patch adds a third PTTYPE, PTTYPE_EPT, which creates functions (prefixed
> > with "EPT") which correctly read and write EPT tables.
> > 
> > Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
> > Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
> > Signed-off-by: Xinhao Xu <xinhao.xu@intel.com>
> > Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com>
> > Signed-off-by: Gleb Natapov <gleb@redhat.com>
> > ---
> >  arch/x86/kvm/mmu.c         |    5 +++++
> >  arch/x86/kvm/paging_tmpl.h |   43 +++++++++++++++++++++++++++++++++++++++----
> >  2 files changed, 44 insertions(+), 4 deletions(-)
> 
> Ok, let's rewind and retry.
> 
> > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > index 4c4274d..b5273c3 100644
> > --- a/arch/x86/kvm/mmu.c
> > +++ b/arch/x86/kvm/mmu.c
> > @@ -3494,6 +3494,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp
> >  	return mmu->last_pte_bitmap & (1 << index);
> >  }
> >  
> > +#define PTTYPE_EPT 18 /* arbitrary */
> > +#define PTTYPE PTTYPE_EPT
> > +#include "paging_tmpl.h"
> > +#undef PTTYPE
> > +
> >  #define PTTYPE 64
> >  #include "paging_tmpl.h"
> >  #undef PTTYPE
> > diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> > index 7581395..e38b3c0 100644
> > --- a/arch/x86/kvm/paging_tmpl.h
> > +++ b/arch/x86/kvm/paging_tmpl.h
> > @@ -58,6 +58,21 @@
> >  	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
> >  	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
> >  	#define CMPXCHG cmpxchg
> > +#elif PTTYPE == PTTYPE_EPT
> > +	#define pt_element_t u64
> > +	#define guest_walker guest_walkerEPT
> > +	#define FNAME(name) ept_##name
> > +	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
> > +	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
> > +	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
> > +	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
> > +	#define PT_LEVEL_BITS PT64_LEVEL_BITS
> > +	#define PT_GUEST_ACCESSED_MASK 0
> > +	#define PT_GUEST_DIRTY_MASK 0
> > +	#define PT_GUEST_DIRTY_SHIFT 0
> > +	#define PT_GUEST_ACCESSED_SHIFT 0
> > +	#define CMPXCHG cmpxchg64
> > +	#define PT_MAX_FULL_LEVELS 4
> >  #else
> >  	#error Invalid PTTYPE value
> >  #endif
> 
> Please add a
> 
> BUILD_BUG_ON(!!PT_GUEST_ACCESSED_MASK != !!PT_GUEST_DIRTY_MASK);
> #if PT_GUEST_ACCESSED_MASK
> BUILD_BUG_ON(PT_GUEST_DIRTY_SHIFT <= PT_GUEST_ACCESSED_SHIFT);
> BUILD_BUG_ON(PT_GUEST_DIRTY_SHIFT <= PT_WRITABLE_SHIFT);
> BUILD_BUG_ON(PT_GUEST_DIRTY_MASK != (1 << PT_GUEST_DIRTY_SHIFT));
> BUILD_BUG_ON(PT_GUEST_ACCESSED_MASK != (1 << PT_GUEST_ACCESSED_SHIFT));
This will not work if I define _SHIFT to be 8/9 now. But we do not use
BUILD_BUG_ON() to check values from the same define "namespace". It is
implied that they are correct, and when they change the whole "namespace"
stays consistent. If you look at the BUILD_BUG_ON()s that we have
(and that this patch adds), they are of the form:
  PT_WRITABLE_MASK != ACC_WRITE_MASK
  ACC_WRITE_MASK != VMX_EPT_WRITABLE_MASK
  VMX_EPT_READABLE_MASK != PT_PRESENT_MASK
  VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK

i.e. they compare values from different "namespaces".

> #endif
> 
> here.
> 
> > @@ -90,6 +105,10 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
> >  
> >  static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
> >  {
> > +#if PT_GUEST_DIRTY_MASK == 0
> > +	/* dirty bit is not supported, so no need to track it */
> > +	return;
> > +#else
> 
> Here you can use a regular "if" instead of the preprocessor if.  Also
Yeah, for some reason I thought BUILD_BUG_ON() prevented me from doing so,
but now I see that it should not.

> please move this and all other PT_GUEST_*-related changes to a separate
> patch.
Did already.

> 
> >  	unsigned mask;
> >  
> >  	BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
> > @@ -99,6 +118,7 @@ static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
> >  	mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
> >  		PT_WRITABLE_MASK;
> >  	*access &= mask;
> > +#endif
> >  }
> >  
> >  static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
> > @@ -111,7 +131,11 @@ static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
> >  
> >  static inline int FNAME(is_present_gpte)(unsigned long pte)
> >  {
> > +#if PTTYPE != PTTYPE_EPT
> >  	return is_present_gpte(pte);
> > +#else
> > +	return pte & 7;
> > +#endif
> >  }
> >  
> >  static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
> > @@ -147,7 +171,8 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
> >  	if (!FNAME(is_present_gpte)(gpte))
> >  		goto no_present;
> >  
> > -	if (!(gpte & PT_GUEST_ACCESSED_MASK))
> > +	/* if accessed bit is not supported prefetch non accessed gpte */
> > +	if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK))
> >  		goto no_present;
> >  
> >  	return false;
> > @@ -160,9 +185,14 @@ no_present:
> >  static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
> >  {
> >  	unsigned access;
> > -
> > +#if PTTYPE == PTTYPE_EPT
> > +	BUILD_BUG_ON(ACC_WRITE_MASK != VMX_EPT_WRITABLE_MASK);
> > +	access = (gpte & VMX_EPT_WRITABLE_MASK) | ACC_USER_MASK |
> > +		((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0);
> 
> You can use a ?: here even for writable.  The compiler will simplify (a
> & b ? b : 0) to just "a & b" for single-bit b.
> 
Good, one less BUILD_BUG_ON().

> > +#else
> >  	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
> >  	access &= ~(gpte >> PT64_NX_SHIFT);
> > +#endif
> >  
> >  	return access;
> >  }
> > @@ -212,7 +242,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
> >  				    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
> >  				    gva_t addr, u32 access)
> >  {
> > -	int ret;
> 
> This change is not needed anymore since you removed the #ifdef.
> 
Ack.

> >  	pt_element_t pte;
> >  	pt_element_t __user *uninitialized_var(ptep_user);
> >  	gfn_t table_gfn;
> > @@ -322,7 +351,9 @@ retry_walk:
> >  		accessed_dirty &= pte >>
> >  			(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
> 
> This shift is one of two things that bugged me.  I dislike including
> "wrong" code just because it is dead.  Perhaps you can #define the
Meanwhile I changed the comment above this to be:
           /*
            * On a write fault, fold the dirty bit into accessed_dirty.
            * For modes without A/D bits support accessed_dirty will be
            * always clear.
            */

> shifts to 8 and 9 already now, even if the masks stay 0?
> 
Currently I do not see any problem with that, but we will have to be careful
that the *_SHIFT values do not leak into code where they could matter.

> Then when you implement nEPT A/D bits you only have to flip the masks
> from 0 to (1 << PT_GUEST_*_SHIFT).
> 
> >  
> > -	if (unlikely(!accessed_dirty)) {
> > +	if (PT_GUEST_DIRTY_MASK && unlikely(!accessed_dirty)) {
> 
> If we want to drop the dead-code elimination burden on the compiler,
> let's go all the way.
> 
> So, instead of this "if", please make update_accessed_dirty_bits return
> 0 at once if PT_GUEST_DIRTY_MASK == 0; this way you can do the same
> thing for protect_clean_gpte and update_accessed_dirty_bits.
> 
OK.

--
			Gleb.
Paolo Bonzini July 30, 2013, 12:13 p.m. UTC | #15
On 30/07/2013 13:56, Gleb Natapov wrote:
>>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>>> index 4c4274d..b5273c3 100644
>>> --- a/arch/x86/kvm/mmu.c
>>> +++ b/arch/x86/kvm/mmu.c
>>> @@ -3494,6 +3494,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp
>>>  	return mmu->last_pte_bitmap & (1 << index);
>>>  }
>>>  
>>> +#define PTTYPE_EPT 18 /* arbitrary */
>>> +#define PTTYPE PTTYPE_EPT
>>> +#include "paging_tmpl.h"
>>> +#undef PTTYPE
>>> +
>>>  #define PTTYPE 64
>>>  #include "paging_tmpl.h"
>>>  #undef PTTYPE
>>> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
>>> index 7581395..e38b3c0 100644
>>> --- a/arch/x86/kvm/paging_tmpl.h
>>> +++ b/arch/x86/kvm/paging_tmpl.h
>>> @@ -58,6 +58,21 @@
>>>  	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
>>>  	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
>>>  	#define CMPXCHG cmpxchg
>>> +#elif PTTYPE == PTTYPE_EPT
>>> +	#define pt_element_t u64
>>> +	#define guest_walker guest_walkerEPT
>>> +	#define FNAME(name) ept_##name
>>> +	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
>>> +	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
>>> +	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
>>> +	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
>>> +	#define PT_LEVEL_BITS PT64_LEVEL_BITS
>>> +	#define PT_GUEST_ACCESSED_MASK 0
>>> +	#define PT_GUEST_DIRTY_MASK 0
>>> +	#define PT_GUEST_DIRTY_SHIFT 0
>>> +	#define PT_GUEST_ACCESSED_SHIFT 0
>>> +	#define CMPXCHG cmpxchg64
>>> +	#define PT_MAX_FULL_LEVELS 4
>>>  #else
>>>  	#error Invalid PTTYPE value
>>>  #endif
>>
>> Please add a
>>
>> BUILD_BUG_ON(!!PT_GUEST_ACCESSED_MASK != !!PT_GUEST_DIRTY_MASK);
>> #if PT_GUEST_ACCESSED_MASK
>> BUILD_BUG_ON(PT_GUEST_DIRTY_SHIFT <= PT_GUEST_ACCESSED_SHIFT);
>> BUILD_BUG_ON(PT_GUEST_DIRTY_SHIFT <= PT_WRITABLE_SHIFT);
>> BUILD_BUG_ON(PT_GUEST_DIRTY_MASK != (1 << PT_GUEST_DIRTY_SHIFT));
>> BUILD_BUG_ON(PT_GUEST_ACCESSED_MASK != (1 << PT_GUEST_ACCESSED_SHIFT));
>
> This will not work if I define _SHIFT to be 8/9 now.

They will because I put them under an appropriate "#if". :)

OTOH if you define _SHIFT to be 8/9 you can move the #if so that it only
covers the last two checks.

> But we do not use
> BUILD_BUG_ON() to check values from the same define "namespace". It is
> implied that they are correct, and when they change the whole "namespace"
> stays consistent. If you look at the BUILD_BUG_ON()s that we have
> (and that this patch adds), they are of the form:
>   PT_WRITABLE_MASK != ACC_WRITE_MASK
>   ACC_WRITE_MASK != VMX_EPT_WRITABLE_MASK
>   VMX_EPT_READABLE_MASK != PT_PRESENT_MASK
>   VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK
> 
> i.e. they compare values from different "namespaces".

Yes, all these BUILD_BUG_ONs make sense.

But I think BUILD_BUG_ON() is more generally for consistency checks
and for enforcing invariants that the code expects.  Our invariants are:

* A/D bits are enabled or disabled in pairs

* dirty is to the left of accessed and writable

* masks should either be zero or match the corresponding shifts

The alternative to BUILD_BUG_ON would be a comment that explains the
invariants, but there's no need to use English if C can do it better! :)

>>>  	pt_element_t pte;
>>>  	pt_element_t __user *uninitialized_var(ptep_user);
>>>  	gfn_t table_gfn;
>>> @@ -322,7 +351,9 @@ retry_walk:
>>>  		accessed_dirty &= pte >>
>>>  			(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
>>
>> This shift is one of two things that bugged me.  I dislike including
>> "wrong" code just because it is dead.  Perhaps you can #define the
> Meanwhile I changed the comment above this to be:
>            /*
>             * On a write fault, fold the dirty bit into accessed_dirty.
>             * For modes without A/D bits support accessed_dirty will be
>             * always clear.
>             */

Good.

>> shifts to 8 and 9 already now, even if the masks stay 0?
>>
> Currently I do not see any problem with that, but we will have to be careful
> that the *_SHIFT values do not leak into code where they could matter.

They would leak with PT_GUEST_*_SHIFT == 0 too, I think?  (And with
worse effects, because they would use bit 0 and/or do shifts with
negative counts).

Perhaps PT_GUEST_*_SHIFT can instead be defined to a function like
__using_nonexistent_pte_bit(), so that compilation fails unless all such
references are optimized away.  See arch/x86/include/asm/cmpxchg.h for
an example:

/*
 * Non-existant functions to indicate usage errors at link time
 * (or compile-time if the compiler implements __compiletime_error().
 */
extern void __xchg_wrong_size(void)
        __compiletime_error("Bad argument size for xchg");

But using 8/9 would be fine for me.  Choose the one that you prefer.

>> Then when you implement nEPT A/D bits you only have to flip the masks
>> from 0 to (1 << PT_GUEST_*_SHIFT).
>>
>>>  
>>> -	if (unlikely(!accessed_dirty)) {
>>> +	if (PT_GUEST_DIRTY_MASK && unlikely(!accessed_dirty)) {
>>
>> If we want to drop the dead-code elimination burden on the compiler,
>> let's go all the way.
>>
>> So, instead of this "if", please make update_accessed_dirty_bits return
>> 0 at once if PT_GUEST_DIRTY_MASK == 0; this way you can do the same
>> thing for protect_clean_gpte and update_accessed_dirty_bits.
>
> OK.

Good. :)

Paolo
Gleb Natapov July 30, 2013, 2:22 p.m. UTC | #16
On Tue, Jul 30, 2013 at 02:13:38PM +0200, Paolo Bonzini wrote:
> On 30/07/2013 13:56, Gleb Natapov wrote:
> >>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> >>> index 4c4274d..b5273c3 100644
> >>> --- a/arch/x86/kvm/mmu.c
> >>> +++ b/arch/x86/kvm/mmu.c
> >>> @@ -3494,6 +3494,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp
> >>>  	return mmu->last_pte_bitmap & (1 << index);
> >>>  }
> >>>  
> >>> +#define PTTYPE_EPT 18 /* arbitrary */
> >>> +#define PTTYPE PTTYPE_EPT
> >>> +#include "paging_tmpl.h"
> >>> +#undef PTTYPE
> >>> +
> >>>  #define PTTYPE 64
> >>>  #include "paging_tmpl.h"
> >>>  #undef PTTYPE
> >>> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> >>> index 7581395..e38b3c0 100644
> >>> --- a/arch/x86/kvm/paging_tmpl.h
> >>> +++ b/arch/x86/kvm/paging_tmpl.h
> >>> @@ -58,6 +58,21 @@
> >>>  	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
> >>>  	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
> >>>  	#define CMPXCHG cmpxchg
> >>> +#elif PTTYPE == PTTYPE_EPT
> >>> +	#define pt_element_t u64
> >>> +	#define guest_walker guest_walkerEPT
> >>> +	#define FNAME(name) ept_##name
> >>> +	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
> >>> +	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
> >>> +	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
> >>> +	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
> >>> +	#define PT_LEVEL_BITS PT64_LEVEL_BITS
> >>> +	#define PT_GUEST_ACCESSED_MASK 0
> >>> +	#define PT_GUEST_DIRTY_MASK 0
> >>> +	#define PT_GUEST_DIRTY_SHIFT 0
> >>> +	#define PT_GUEST_ACCESSED_SHIFT 0
> >>> +	#define CMPXCHG cmpxchg64
> >>> +	#define PT_MAX_FULL_LEVELS 4
> >>>  #else
> >>>  	#error Invalid PTTYPE value
> >>>  #endif
> >>
> >> Please add a
> >>
> >> BUILD_BUG_ON(!!PT_GUEST_ACCESSED_MASK != !!PT_GUEST_DIRTY_MASK);
> >> #if PT_GUEST_ACCESSED_MASK
> >> BUILD_BUG_ON(PT_GUEST_DIRTY_SHIFT <= PT_GUEST_ACCESSED_SHIFT);
> >> BUILD_BUG_ON(PT_GUEST_DIRTY_SHIFT <= PT_WRITABLE_SHIFT);
> >> BUILD_BUG_ON(PT_GUEST_DIRTY_MASK != (1 << PT_GUEST_DIRTY_SHIFT));
> >> BUILD_BUG_ON(PT_GUEST_ACCESSED_MASK != (1 << PT_GUEST_ACCESSED_SHIFT));
> >
> > This will not work if I define _SHIFT to be 8/9 now.
> 
> They will because I put them under an appropriate "#if". :)
> 
True, missed that.

> OTOH if you define _SHIFT to be 8/9 you can move the #if so that it only
> covers the last two checks.
> 
> > But we do not use
> > BUILD_BUG_ON() to check values from the same define "namespace". It is
> > implied that they are correct, and when they change the whole "namespace"
> > stays consistent. If you look at the BUILD_BUG_ON()s that we have
> > (and that this patch adds), they are of the form:
> >   PT_WRITABLE_MASK != ACC_WRITE_MASK
> >   ACC_WRITE_MASK != VMX_EPT_WRITABLE_MASK
> >   VMX_EPT_READABLE_MASK != PT_PRESENT_MASK
> >   VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK
> > 
> > i.e. they compare values from different "namespaces".
> 
> Yes, all these BUILD_BUG_ONs make sense.
> 
> > But I think BUILD_BUG_ON() is more generally for consistency checks
> > and for enforcing invariants that the code expects.  Our invariants are:
> 
> * A/D bits are enabled or disabled in pairs
> 
> > * dirty is to the left of accessed and writable
> 
> * masks should either be zero or match the corresponding shifts
> 
> The alternative to BUILD_BUG_ON would be a comment that explains the
> invariants, but there's no need to use English if C can do it better! :)
> 
OK, will add this in a separate patch.

> >>>  	pt_element_t pte;
> >>>  	pt_element_t __user *uninitialized_var(ptep_user);
> >>>  	gfn_t table_gfn;
> >>> @@ -322,7 +351,9 @@ retry_walk:
> >>>  		accessed_dirty &= pte >>
> >>>  			(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
> >>
> >> This shift is one of two things that bugged me.  I dislike including
> >> "wrong" code just because it is dead.  Perhaps you can #define the
> > Meanwhile I changed the comment above this to be:
> >            /*
> >             * On a write fault, fold the dirty bit into accessed_dirty.
> >             * For modes without A/D bits support accessed_dirty will be
> >             * always clear.
> >             */
> 
> Good.
> 
> >> shifts to 8 and 9 already now, even if the masks stay 0?
> >>
> > Currently I do not see any problem with that, but we will have to be careful
> > that the *_SHIFT values do not leak into code where they could matter.
> 
> They would leak with PT_GUEST_*_SHIFT == 0 too, I think?  (And with
> worse effects, because they would use bit 0 and/or do shifts with
> negative counts).
> 
> Perhaps PT_GUEST_*_SHIFT can instead be defined to a function like
> __using_nonexistent_pte_bit(), so that compilation fails unless all such
> references are optimized away.  See arch/x86/include/asm/cmpxchg.h for
> an example:
> 
> /*
>  * Non-existant functions to indicate usage errors at link time
>  * (or compile-time if the compiler implements __compiletime_error().
>  */
> extern void __xchg_wrong_size(void)
>         __compiletime_error("Bad argument size for xchg");
> 
Nice! But not all of them are optimized away at the right stage. The one
in the accessed_dirty calculation is reachable, but its result is thrown
away. If I define *_SHIFT to be a function, the code cannot be optimized
since the compiler thinks that a function call has side effects. Adding
the pure function attribute solves that though, so I'll go with that.
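
Something like this, modeled on the cmpxchg.h example quoted above (a
sketch: __pure is the kernel's shorthand for the GCC pure attribute, and
the function name is the one floated earlier in the thread):

extern u64 __using_nonexistent_pte_bit(void) __pure
	__compiletime_error("A/D bit shift used where A/D is unsupported");

#define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit()
#define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit()

Any reference the optimizer cannot delete then fails the build instead of
silently shifting by a bogus count.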
 
--
			Gleb.
Gleb Natapov July 30, 2013, 2:36 p.m. UTC | #17
On Tue, Jul 30, 2013 at 05:22:32PM +0300, Gleb Natapov wrote:
> > >>
> > >> BUILD_BUG_ON(!!PT_GUEST_ACCESSED_MASK != !!PT_GUEST_DIRTY_MASK);
> > >> #if PT_GUEST_ACCESSED_MASK
> > >> BUILD_BUG_ON(PT_GUEST_DIRTY_SHIFT <= PT_GUEST_ACCESSED_SHIFT);
> > >> BUILD_BUG_ON(PT_GUEST_DIRTY_SHIFT <= PT_WRITABLE_SHIFT);
> > >> BUILD_BUG_ON(PT_GUEST_DIRTY_MASK != (1 << PT_GUEST_DIRTY_SHIFT));
> > >> BUILD_BUG_ON(PT_GUEST_ACCESSED_MASK != (1 << PT_GUEST_ACCESSED_SHIFT));
> > >
> > > This will not work if I define _SHIFT to be 8/9 now.
> > 
> > They will because I put them under an appropriate "#if". :)
> > 
> True, missed that.
> 
> > OTOH if you define _SHIFT to be 8/9 you can move the #if so that it only
> > covers the last two checks.
> > 
> > > But we do not use
> > > BUILD_BUG_ON() to check values from the same define "namespace". It is
> > > implied that they are correct, and when they change the whole "namespace"
> > > stays consistent. If you look at the BUILD_BUG_ON()s that we have
> > > (and that this patch adds), they are of the form:
> > >   PT_WRITABLE_MASK != ACC_WRITE_MASK
> > >   ACC_WRITE_MASK != VMX_EPT_WRITABLE_MASK
> > >   VMX_EPT_READABLE_MASK != PT_PRESENT_MASK
> > >   VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK
> > > 
> > > i.e. they compare values from different "namespaces".
> > 
> > Yes, all these BUILD_BUG_ONs make sense.
> > 
> > But I think BUILD_BUG_ON() is more generally for consistency checks
> > and for enforcing invariants that the code expects.  Our invariants are:
> > 
> > * A/D bits are enabled or disabled in pairs
> > 
> > * dirty is to the left of accessed and writable
> > 
> > * masks should either be zero or match the corresponding shifts
> > 
> > The alternative to BUILD_BUG_ON would be a comment that explains the
> > invariants, but there's no need to use English if C can do it better! :)
> > 
> OK, will add this in a separate patch.
> 
Not really, BUILD_BUG_ON() is not meant to be used outside of a
function.
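
One workable home for them, sketched: the top of any function that
paging_tmpl.h instantiates per PTTYPE, e.g. FNAME(walk_addr_generic),
since BUILD_BUG_ON() expands to a statement that needs a function body
around it:

	BUILD_BUG_ON(!!PT_GUEST_ACCESSED_MASK != !!PT_GUEST_DIRTY_MASK);
#if PT_GUEST_ACCESSED_MASK
	BUILD_BUG_ON(PT_GUEST_DIRTY_SHIFT <= PT_GUEST_ACCESSED_SHIFT);
	BUILD_BUG_ON(PT_GUEST_DIRTY_SHIFT <= PT_WRITABLE_SHIFT);
	BUILD_BUG_ON(PT_GUEST_DIRTY_MASK != (1 << PT_GUEST_DIRTY_SHIFT));
	BUILD_BUG_ON(PT_GUEST_ACCESSED_MASK != (1 << PT_GUEST_ACCESSED_SHIFT));
#endif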

--
			Gleb.
diff mbox

Patch

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4c4274d..b5273c3 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3494,6 +3494,11 @@  static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp
 	return mmu->last_pte_bitmap & (1 << index);
 }
 
+#define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE PTTYPE_EPT
+#include "paging_tmpl.h"
+#undef PTTYPE
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 7581395..e38b3c0 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -58,6 +58,21 @@ 
 	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
 	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
 	#define CMPXCHG cmpxchg
+#elif PTTYPE == PTTYPE_EPT
+	#define pt_element_t u64
+	#define guest_walker guest_walkerEPT
+	#define FNAME(name) ept_##name
+	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+	#define PT_LEVEL_BITS PT64_LEVEL_BITS
+	#define PT_GUEST_ACCESSED_MASK 0
+	#define PT_GUEST_DIRTY_MASK 0
+	#define PT_GUEST_DIRTY_SHIFT 0
+	#define PT_GUEST_ACCESSED_SHIFT 0
+	#define CMPXCHG cmpxchg64
+	#define PT_MAX_FULL_LEVELS 4
 #else
 	#error Invalid PTTYPE value
 #endif
@@ -90,6 +105,10 @@  static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
 
 static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
 {
+#if PT_GUEST_DIRTY_MASK == 0
+	/* dirty bit is not supported, so no need to track it */
+	return;
+#else
 	unsigned mask;
 
 	BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
@@ -99,6 +118,7 @@  static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
 	mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
 		PT_WRITABLE_MASK;
 	*access &= mask;
+#endif
 }
 
 static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
@@ -111,7 +131,11 @@  static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
 
 static inline int FNAME(is_present_gpte)(unsigned long pte)
 {
+#if PTTYPE != PTTYPE_EPT
 	return is_present_gpte(pte);
+#else
+	return pte & 7;
+#endif
 }
 
 static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
@@ -147,7 +171,8 @@  static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
 	if (!FNAME(is_present_gpte)(gpte))
 		goto no_present;
 
-	if (!(gpte & PT_GUEST_ACCESSED_MASK))
+	/* if accessed bit is not supported prefetch non accessed gpte */
+	if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK))
 		goto no_present;
 
 	return false;
@@ -160,9 +185,14 @@  no_present:
 static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
 {
 	unsigned access;
-
+#if PTTYPE == PTTYPE_EPT
+	BUILD_BUG_ON(ACC_WRITE_MASK != VMX_EPT_WRITABLE_MASK);
+	access = (gpte & VMX_EPT_WRITABLE_MASK) | ACC_USER_MASK |
+		((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0);
+#else
 	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
 	access &= ~(gpte >> PT64_NX_SHIFT);
+#endif
 
 	return access;
 }
@@ -212,7 +242,6 @@  static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 				    struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 				    gva_t addr, u32 access)
 {
-	int ret;
 	pt_element_t pte;
 	pt_element_t __user *uninitialized_var(ptep_user);
 	gfn_t table_gfn;
@@ -322,7 +351,9 @@  retry_walk:
 		accessed_dirty &= pte >>
 			(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
 
-	if (unlikely(!accessed_dirty)) {
+	if (PT_GUEST_DIRTY_MASK && unlikely(!accessed_dirty)) {
+		int ret;
+
 		ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
 		if (unlikely(ret < 0))
 			goto error;
@@ -359,6 +390,7 @@  static int FNAME(walk_addr)(struct guest_walker *walker,
 					access);
 }
 
+#if PTTYPE != PTTYPE_EPT
 static int FNAME(walk_addr_nested)(struct guest_walker *walker,
 				   struct kvm_vcpu *vcpu, gva_t addr,
 				   u32 access)
@@ -366,6 +398,7 @@  static int FNAME(walk_addr_nested)(struct guest_walker *walker,
 	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
 					addr, access);
 }
+#endif
 
 static bool
 FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -793,6 +826,7 @@  static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
 	return gpa;
 }
 
+#if PTTYPE != PTTYPE_EPT
 static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
 				      u32 access,
 				      struct x86_exception *exception)
@@ -811,6 +845,7 @@  static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
 
 	return gpa;
 }
+#endif
 
 /*
  * Using the cached information from sp->gfns is safe because: