diff mbox

[3/4] kvm: x86: mmu: allow A/D bits to be disabled in an mmu

Message ID ab1a8692-acd1-77cd-1239-baee47cbc1d6@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Paolo Bonzini July 3, 2017, 9:22 a.m. UTC
On 01/07/2017 02:26, Peter Feiner wrote:
> Adds the plumbing to disable A/D bits in the MMU based on a new role
> bit, ad_disabled. When A/D is disabled, the MMU operates as though A/D
> aren't available (i.e., using access tracking faults instead).
> 
> To avoid SP -> kvm_mmu_page.role.ad_disabled lookups all over the
> place, A/D disablement is now stored in the SPTE. This state is stored
> in the SPTE by tweaking the use of SPTE_SPECIAL_MASK for access
> tracking. Rather than just setting SPTE_SPECIAL_MASK when an
> access-tracking SPTE is non-present, we now always set
> SPTE_SPECIAL_MASK for access-tracking SPTEs.

Here are the only small fixes I'd like to make (not tested yet, but I'm
trying to poke holes in my understanding of the patch):


> Signed-off-by: Peter Feiner <pfeiner@google.com>
> ---
>  arch/x86/include/asm/kvm_host.h |   3 +-
>  arch/x86/kvm/mmu.c              | 111 +++++++++++++++++++++++++++++-----------
>  arch/x86/kvm/mmutrace.h         |   6 ++-
>  3 files changed, 87 insertions(+), 33 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 695605eb1dfb..2f19c680c3fd 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -254,7 +254,8 @@ union kvm_mmu_page_role {
>  		unsigned cr0_wp:1;
>  		unsigned smep_andnot_wp:1;
>  		unsigned smap_andnot_wp:1;
> -		unsigned :8;
> +		unsigned ad_disabled:1;
> +		unsigned :7;
>  
>  		/*
>  		 * This is left at the top of the word so that
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 10b3cfc7b411..e814fb6248ac 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -187,10 +187,9 @@ static u64 __read_mostly shadow_mmio_value;
>  static u64 __read_mostly shadow_present_mask;
>  
>  /*
> - * The mask/value to distinguish a PTE that has been marked not-present for
> - * access tracking purposes.
> - * The mask would be either 0 if access tracking is disabled, or
> - * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled.
> + * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
> + * Non-present SPTEs with shadow_acc_track_value set are in place for access
> + * tracking.
>   */
>  static u64 __read_mostly shadow_acc_track_mask;
>  static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;
> @@ -216,10 +215,29 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
>  
> +static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
> +{
> +	return shadow_accessed_mask == 0 || sp->role.ad_disabled;
> +}
> +
> +static inline bool spte_ad_enabled(u64 spte)
> +{
> +	return !(spte & shadow_acc_track_value);
> +}
> +
> +static u64 spte_shadow_accessed_mask(u64 spte)
> +{
> +	return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
> +}
> +
> +static u64 spte_shadow_dirty_mask(u64 spte)
> +{
> +	return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
> +}
> +
>  static inline bool is_access_track_spte(u64 spte)
>  {
> -	/* Always false if shadow_acc_track_mask is zero.  */
> -	return (spte & shadow_acc_track_mask) == shadow_acc_track_value;
> +	return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
>  }
>  
>  /*
> @@ -329,10 +347,9 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
>  		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
>  		u64 acc_track_mask)
>  {
> -	if (acc_track_mask != 0)
> -		acc_track_mask |= SPTE_SPECIAL_MASK;
>  	BUG_ON(!dirty_mask != !accessed_mask);
>  	BUG_ON(!accessed_mask && !acc_track_mask);
> +	BUG_ON(acc_track_mask & shadow_acc_track_value);
>  
>  	shadow_user_mask = user_mask;
>  	shadow_accessed_mask = accessed_mask;
> @@ -341,7 +358,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
>  	shadow_x_mask = x_mask;
>  	shadow_present_mask = p_mask;
>  	shadow_acc_track_mask = acc_track_mask;
> -	WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0);
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
>  
> @@ -561,7 +577,7 @@ static bool spte_has_volatile_bits(u64 spte)
>  	    is_access_track_spte(spte))
>  		return true;
>  
> -	if (shadow_accessed_mask) {
> +	if (spte_ad_enabled(spte)) {
>  		if ((spte & shadow_accessed_mask) == 0 ||
>  	    	    (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
>  			return true;
> @@ -572,14 +588,17 @@ static bool spte_has_volatile_bits(u64 spte)
>  
>  static bool is_accessed_spte(u64 spte)
>  {
> -	return shadow_accessed_mask ? spte & shadow_accessed_mask
> -				    : !is_access_track_spte(spte);
> +	u64 accessed_mask = spte_shadow_accessed_mask(spte);
> +
> +	return accessed_mask ? spte & accessed_mask
> +			     : !is_access_track_spte(spte);
>  }
>  
>  static bool is_dirty_spte(u64 spte)
>  {
> -	return shadow_dirty_mask ? spte & shadow_dirty_mask
> -				 : spte & PT_WRITABLE_MASK;
> +	u64 dirty_mask = spte_shadow_dirty_mask(spte);
> +
> +	return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
>  }
>  
>  /* Rules for using mmu_spte_set:
> @@ -719,10 +738,10 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
>  
>  static u64 mark_spte_for_access_track(u64 spte)
>  {
> -	if (shadow_accessed_mask != 0)
> +	if (spte_ad_enabled(spte))
>  		return spte & ~shadow_accessed_mask;
>  
> -	if (shadow_acc_track_mask == 0 || is_access_track_spte(spte))
> +	if (is_access_track_spte(spte))
>  		return spte;
>  
>  	/*
> @@ -741,7 +760,6 @@ static u64 mark_spte_for_access_track(u64 spte)
>  	spte |= (spte & shadow_acc_track_saved_bits_mask) <<
>  		shadow_acc_track_saved_bits_shift;
>  	spte &= ~shadow_acc_track_mask;
> -	spte |= shadow_acc_track_value;
>  
>  	return spte;
>  }
> @@ -753,6 +771,7 @@ static u64 restore_acc_track_spte(u64 spte)
>  	u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
>  			 & shadow_acc_track_saved_bits_mask;
>  
> +	WARN_ON_ONCE(spte_ad_enabled(spte));
>  	WARN_ON_ONCE(!is_access_track_spte(spte));
>  
>  	new_spte &= ~shadow_acc_track_mask;
> @@ -771,7 +790,7 @@ static bool mmu_spte_age(u64 *sptep)
>  	if (!is_accessed_spte(spte))
>  		return false;
>  
> -	if (shadow_accessed_mask) {
> +	if (spte_ad_enabled(spte)) {
>  		clear_bit((ffs(shadow_accessed_mask) - 1),
>  			  (unsigned long *)sptep);
>  	} else {
> @@ -1402,6 +1421,22 @@ static bool spte_clear_dirty(u64 *sptep)
>  	return mmu_spte_update(sptep, spte);
>  }
>  
> +static bool wrprot_ad_disabled_spte(u64 *sptep)
> +{
> +	bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
> +					       (unsigned long *)sptep);
> +	if (was_writable)
> +		kvm_set_pfn_dirty(spte_to_pfn(*sptep));
> +
> +	return was_writable;
> +}
> +
> +/*
> + * Gets the GFN ready for another round of dirty logging by clearing the
> + *	- D bit on ad-enabled SPTEs, and
> + *	- W bit on ad-disabled SPTEs.
> + * Returns true iff any D or W bits were cleared.
> + */
>  static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
>  {
>  	u64 *sptep;
> @@ -1409,7 +1444,10 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
>  	bool flush = false;
>  
>  	for_each_rmap_spte(rmap_head, &iter, sptep)
> -		flush |= spte_clear_dirty(sptep);
> +		if (spte_ad_enabled(*sptep))
> +			flush |= spte_clear_dirty(sptep);
> +		else
> +			flush |= wrprot_ad_disabled_spte(sptep);
>  
>  	return flush;
>  }
> @@ -1432,7 +1470,8 @@ static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
>  	bool flush = false;
>  
>  	for_each_rmap_spte(rmap_head, &iter, sptep)
> -		flush |= spte_set_dirty(sptep);
> +		if (spte_ad_enabled(*sptep))
> +			flush |= spte_set_dirty(sptep);
>  
>  	return flush;
>  }
> @@ -1464,7 +1503,8 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
>  }
>  
>  /**
> - * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages
> + * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
> + * protect the page if the D-bit isn't supported.
>   * @kvm: kvm instance
>   * @slot: slot to clear D-bit
>   * @gfn_offset: start of the BITS_PER_LONG pages we care about
> @@ -2389,7 +2429,12 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
>  	BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
>  
>  	spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
> -	       shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
> +	       shadow_user_mask | shadow_x_mask;
> +
> +	if (sp_ad_disabled(sp))
> +		spte |= shadow_acc_track_value;
> +	else
> +		spte |= shadow_accessed_mask;
>  
>  	mmu_spte_set(sptep, spte);
>  
> @@ -2657,10 +2702,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>  {
>  	u64 spte = 0;
>  	int ret = 0;
> +	struct kvm_mmu_page *sp;
>  
>  	if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
>  		return 0;
>  
> +	sp = page_header(__pa(sptep));
> +	if (sp_ad_disabled(sp))
> +		spte |= shadow_acc_track_value;
> +
>  	/*
>  	 * For the EPT case, shadow_present_mask is 0 if hardware
>  	 * supports exec-only page table entries.  In that case,
> @@ -2669,7 +2719,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>  	 */
>  	spte |= shadow_present_mask;
>  	if (!speculative)
> -		spte |= shadow_accessed_mask;
> +		spte |= spte_shadow_accessed_mask(spte);
>  
>  	if (pte_access & ACC_EXEC_MASK)
>  		spte |= shadow_x_mask;
> @@ -2726,7 +2776,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>  
>  	if (pte_access & ACC_WRITE_MASK) {
>  		kvm_vcpu_mark_page_dirty(vcpu, gfn);
> -		spte |= shadow_dirty_mask;
> +		spte |= spte_shadow_dirty_mask(spte);
>  	}
>  
>  	if (speculative)
> @@ -2868,16 +2918,16 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
>  {
>  	struct kvm_mmu_page *sp;
>  
> +	sp = page_header(__pa(sptep));
> +
>  	/*
> -	 * Since it's no accessed bit on EPT, it's no way to
> -	 * distinguish between actually accessed translations
> -	 * and prefetched, so disable pte prefetch if EPT is
> -	 * enabled.
> +	 * Without accessed bits, there's no way to distinguish between
> +	 * actually accessed translations and prefetched, so disable pte
> +	 * prefetch if accessed bits aren't available.
>  	 */
> -	if (!shadow_accessed_mask)
> +	if (sp_ad_disabled(sp))
>  		return;
>  
> -	sp = page_header(__pa(sptep));
>  	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
>  		return;
>  
> @@ -4624,6 +4674,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
>  	mask.smep_andnot_wp = 1;
>  	mask.smap_andnot_wp = 1;
>  	mask.smm = 1;
> +	mask.ad_disabled = 1;
>  
>  	/*
>  	 * If we don't have indirect shadow pages, it means no page is
> diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
> index 5a24b846a1cb..8b97a6cba8d1 100644
> --- a/arch/x86/kvm/mmutrace.h
> +++ b/arch/x86/kvm/mmutrace.h
> @@ -30,8 +30,9 @@
>  								        \
>  	role.word = __entry->role;					\
>  									\
> -	trace_seq_printf(p, "sp gen %lx gfn %llx %u%s q%u%s %s%s"	\
> -			 " %snxe root %u %s%c",	__entry->mmu_valid_gen,	\
> +	trace_seq_printf(p, "sp gen %lx gfn %llx l%u%s q%u%s %s%s"	\
> +			 " %snxe %sad root %u %s%c",			\
> +			 __entry->mmu_valid_gen,			\
>  			 __entry->gfn, role.level,			\
>  			 role.cr4_pae ? " pae" : "",			\
>  			 role.quadrant,					\
> @@ -39,6 +40,7 @@
>  			 access_str[role.access],			\
>  			 role.invalid ? " invalid" : "",		\
>  			 role.nxe ? "" : "!",				\
> +			 role.ad_disabled ? "!" : "",			\
>  			 __entry->root_count,				\
>  			 __entry->unsync ? "unsync" : "sync", 0);	\
>  	saved_ptr;							\
>
diff mbox

Patch

diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt
index 481b6a9c25d5..f50d45b1e967 100644
--- a/Documentation/virtual/kvm/mmu.txt
+++ b/Documentation/virtual/kvm/mmu.txt
@@ -179,6 +179,10 @@  Shadow pages contain the following information:
     shadow page; it is also used to go back from a struct kvm_mmu_page
     to a memslot, through the kvm_memslots_for_spte_role macro and
     __gfn_to_memslot.
+  role.ad_disabled:
+    Is 1 if the MMU instance cannot use A/D bits.  EPT did not have A/D
+    bits before Haswell; shadow EPT page tables also cannot use A/D bits
+    if the L1 hypervisor does not enable them.
   gfn:
     Either the guest page table containing the translations shadowed by this
     page, or the base page frame for linear translations.  See role.direct.
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e814fb6248ac..9aeef6ade6b4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -217,21 +217,24 @@  void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
 
 static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
 {
-	return shadow_accessed_mask == 0 || sp->role.ad_disabled;
+	return sp->role.ad_disabled;
 }
 
 static inline bool spte_ad_enabled(u64 spte)
 {
+	MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
 	return !(spte & shadow_acc_track_value);
 }
 
-static u64 spte_shadow_accessed_mask(u64 spte)
+static inline u64 spte_shadow_accessed_mask(u64 spte)
 {
+	MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
 	return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
 }
 
-static u64 spte_shadow_dirty_mask(u64 spte)
+static inline u64 spte_shadow_dirty_mask(u64 spte)
 {
+	MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
 	return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
 }
 
@@ -4328,6 +4329,7 @@  static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 
 	context->base_role.word = 0;
 	context->base_role.smm = is_smm(vcpu);
+	context->base_role.ad_disabled = (shadow_accessed_mask == 0);
 	context->page_fault = tdp_page_fault;
 	context->sync_page = nonpaging_sync_page;
 	context->invlpg = nonpaging_invlpg;