diff mbox series

[RFC,v2,43/69] KVM: x86/mmu: Allow non-zero init value for shadow PTE

Message ID 2a12f8867229459dba2da233bf7762cb1ac2722c.1625186503.git.isaku.yamahata@intel.com (mailing list archive)
State New, archived
Headers show
Series KVM: X86: TDX support | expand

Commit Message

Isaku Yamahata July 2, 2021, 10:04 p.m. UTC
From: Sean Christopherson <sean.j.christopherson@intel.com>

TDX will run with EPT violation #VEs enabled, which means KVM needs to
set the "suppress #VE" bit in unused PTEs to avoid unintentionally
reflecting not-present EPT violations into the guest.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
 arch/x86/kvm/mmu.h      |  1 +
 arch/x86/kvm/mmu/mmu.c  | 50 +++++++++++++++++++++++++++++++++++------
 arch/x86/kvm/mmu/spte.c | 10 +++++++++
 arch/x86/kvm/mmu/spte.h |  2 ++
 4 files changed, 56 insertions(+), 7 deletions(-)

Comments

Paolo Bonzini July 6, 2021, 2:56 p.m. UTC | #1
On 03/07/21 00:04, isaku.yamahata@intel.com wrote:
> From: Sean Christopherson <sean.j.christopherson@intel.com>
> 
> TDX will run with EPT violation #VEs enabled, which means KVM needs to
> set the "suppress #VE" bit in unused PTEs to avoid unintentionally
> reflecting not-present EPT violations into the guest.
> 
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
>   arch/x86/kvm/mmu.h      |  1 +
>   arch/x86/kvm/mmu/mmu.c  | 50 +++++++++++++++++++++++++++++++++++------
>   arch/x86/kvm/mmu/spte.c | 10 +++++++++
>   arch/x86/kvm/mmu/spte.h |  2 ++
>   4 files changed, 56 insertions(+), 7 deletions(-)

Please ensure that this also works for tdp_mmu.c (if anything, consider 
supporting TDX only for TDP MMU; it's quite likely that mmu.c support 
for EPT/NPT will go away).

Paolo

> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index 69b82857acdb..6ec8d9fdff35 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -61,6 +61,7 @@ static __always_inline u64 rsvd_bits(int s, int e)
>   
>   void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
>   void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
> +void kvm_mmu_set_spte_init_value(u64 init_value);
>   
>   void
>   reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 631b92e6e9ba..1c40dfd05979 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -550,9 +550,9 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
>   	u64 old_spte = *sptep;
>   
>   	if (!spte_has_volatile_bits(old_spte))
> -		__update_clear_spte_fast(sptep, 0ull);
> +		__update_clear_spte_fast(sptep, shadow_init_value);
>   	else
> -		old_spte = __update_clear_spte_slow(sptep, 0ull);
> +		old_spte = __update_clear_spte_slow(sptep, shadow_init_value);
>   
>   	if (!is_shadow_present_pte(old_spte))
>   		return 0;
> @@ -582,7 +582,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
>    */
>   static void mmu_spte_clear_no_track(u64 *sptep)
>   {
> -	__update_clear_spte_fast(sptep, 0ull);
> +	__update_clear_spte_fast(sptep, shadow_init_value);
>   }
>   
>   static u64 mmu_spte_get_lockless(u64 *sptep)
> @@ -660,6 +660,42 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
>   	local_irq_enable();
>   }
>   
> +static inline void kvm_init_shadow_page(void *page)
> +{
> +#ifdef CONFIG_X86_64
> +	int ign;
> +
> +	asm volatile (
> +		"rep stosq\n\t"
> +		: "=c"(ign), "=D"(page)
> +		: "a"(shadow_init_value), "c"(4096/8), "D"(page)
> +		: "memory"
> +	);
> +#else
> +	BUG();
> +#endif
> +}
> +
> +static int mmu_topup_shadow_page_cache(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_mmu_memory_cache *mc = &vcpu->arch.mmu_shadow_page_cache;
> +	int start, end, i, r;
> +
> +	if (shadow_init_value)
> +		start = kvm_mmu_memory_cache_nr_free_objects(mc);
> +
> +	r = kvm_mmu_topup_memory_cache(mc, PT64_ROOT_MAX_LEVEL);
> +	if (r)
> +		return r;
> +
> +	if (shadow_init_value) {
> +		end = kvm_mmu_memory_cache_nr_free_objects(mc);
> +		for (i = start; i < end; i++)
> +			kvm_init_shadow_page(mc->objects[i]);
> +	}
> +	return 0;
> +}
> +
>   static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
>   {
>   	int r;
> @@ -669,8 +705,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
>   				       1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
>   	if (r)
>   		return r;
> -	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
> -				       PT64_ROOT_MAX_LEVEL);
> +	r = mmu_topup_shadow_page_cache(vcpu);
>   	if (r)
>   		return r;
>   	if (maybe_indirect) {
> @@ -3041,7 +3076,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>   	struct kvm_shadow_walk_iterator iterator;
>   	struct kvm_mmu_page *sp;
>   	int ret = RET_PF_INVALID;
> -	u64 spte = 0ull;
> +	u64 spte = shadow_init_value;
>   	uint retry_count = 0;
>   
>   	if (!page_fault_can_be_fast(error_code))
> @@ -5383,7 +5418,8 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
>   	vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
>   	vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
>   
> -	vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
> +	if (!shadow_init_value)
> +		vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
>   
>   	vcpu->arch.mmu = &vcpu->arch.root_mmu;
>   	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
> diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
> index 66d43cec0c31..0b931f1c2210 100644
> --- a/arch/x86/kvm/mmu/spte.c
> +++ b/arch/x86/kvm/mmu/spte.c
> @@ -34,6 +34,7 @@ u64 __read_mostly shadow_mmio_access_mask;
>   u64 __read_mostly shadow_present_mask;
>   u64 __read_mostly shadow_me_mask;
>   u64 __read_mostly shadow_acc_track_mask;
> +u64 __read_mostly shadow_init_value;
>   
>   u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
>   u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
> @@ -211,6 +212,14 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
>   	return new_spte;
>   }
>   
> +void kvm_mmu_set_spte_init_value(u64 init_value)
> +{
> +	if (WARN_ON(!IS_ENABLED(CONFIG_X86_64) && init_value))
> +		init_value = 0;
> +	shadow_init_value = init_value;
> +}
> +EXPORT_SYMBOL_GPL(kvm_mmu_set_spte_init_value);
> +
>   static u8 kvm_get_shadow_phys_bits(void)
>   {
>   	/*
> @@ -355,6 +364,7 @@ void kvm_mmu_reset_all_pte_masks(void)
>   	shadow_present_mask	= PT_PRESENT_MASK;
>   	shadow_acc_track_mask	= 0;
>   	shadow_me_mask		= sme_me_mask;
> +	shadow_init_value	= 0;
>   
>   	shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITEABLE;
>   	shadow_mmu_writable_mask  = DEFAULT_SPTE_MMU_WRITEABLE;
> diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
> index bca0ba11cccf..f88cf3db31c7 100644
> --- a/arch/x86/kvm/mmu/spte.h
> +++ b/arch/x86/kvm/mmu/spte.h
> @@ -152,6 +152,8 @@ extern u64 __read_mostly shadow_mmio_access_mask;
>   extern u64 __read_mostly shadow_present_mask;
>   extern u64 __read_mostly shadow_me_mask;
>   
> +extern u64 __read_mostly shadow_init_value;
> +
>   /*
>    * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED_MASK;
>    * shadow_acc_track_mask is the set of bits to be cleared in non-accessed
>
Isaku Yamahata July 8, 2021, 3:20 p.m. UTC | #2
On Tue, Jul 06, 2021 at 04:56:07PM +0200,
Paolo Bonzini <pbonzini@redhat.com> wrote:

> On 03/07/21 00:04, isaku.yamahata@intel.com wrote:
> > From: Sean Christopherson <sean.j.christopherson@intel.com>
> > 
> > TDX will run with EPT violation #VEs enabled, which means KVM needs to
> > set the "suppress #VE" bit in unused PTEs to avoid unintentionally
> > reflecting not-present EPT violations into the guest.
> > 
> > Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> > ---
> >   arch/x86/kvm/mmu.h      |  1 +
> >   arch/x86/kvm/mmu/mmu.c  | 50 +++++++++++++++++++++++++++++++++++------
> >   arch/x86/kvm/mmu/spte.c | 10 +++++++++
> >   arch/x86/kvm/mmu/spte.h |  2 ++
> >   4 files changed, 56 insertions(+), 7 deletions(-)
> 
> Please ensure that this also works for tdp_mmu.c (if anything, consider
> supporting TDX only for TDP MMU; it's quite likely that mmu.c support for
> EPT/NPT will go away).

It's on my TODO list. Will address it.
diff mbox series

Patch

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 69b82857acdb..6ec8d9fdff35 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -61,6 +61,7 @@  static __always_inline u64 rsvd_bits(int s, int e)
 
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
 void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
+void kvm_mmu_set_spte_init_value(u64 init_value);
 
 void
 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 631b92e6e9ba..1c40dfd05979 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -550,9 +550,9 @@  static int mmu_spte_clear_track_bits(u64 *sptep)
 	u64 old_spte = *sptep;
 
 	if (!spte_has_volatile_bits(old_spte))
-		__update_clear_spte_fast(sptep, 0ull);
+		__update_clear_spte_fast(sptep, shadow_init_value);
 	else
-		old_spte = __update_clear_spte_slow(sptep, 0ull);
+		old_spte = __update_clear_spte_slow(sptep, shadow_init_value);
 
 	if (!is_shadow_present_pte(old_spte))
 		return 0;
@@ -582,7 +582,7 @@  static int mmu_spte_clear_track_bits(u64 *sptep)
  */
 static void mmu_spte_clear_no_track(u64 *sptep)
 {
-	__update_clear_spte_fast(sptep, 0ull);
+	__update_clear_spte_fast(sptep, shadow_init_value);
 }
 
 static u64 mmu_spte_get_lockless(u64 *sptep)
@@ -660,6 +660,42 @@  static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
 	local_irq_enable();
 }
 
+static inline void kvm_init_shadow_page(void *page)
+{
+#ifdef CONFIG_X86_64
+	int ign;
+
+	asm volatile (
+		"rep stosq\n\t"
+		: "=c"(ign), "=D"(page)
+		: "a"(shadow_init_value), "c"(4096/8), "D"(page)
+		: "memory"
+	);
+#else
+	BUG();
+#endif
+}
+
+static int mmu_topup_shadow_page_cache(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu_memory_cache *mc = &vcpu->arch.mmu_shadow_page_cache;
+	int start, end, i, r;
+
+	if (shadow_init_value)
+		start = kvm_mmu_memory_cache_nr_free_objects(mc);
+
+	r = kvm_mmu_topup_memory_cache(mc, PT64_ROOT_MAX_LEVEL);
+	if (r)
+		return r;
+
+	if (shadow_init_value) {
+		end = kvm_mmu_memory_cache_nr_free_objects(mc);
+		for (i = start; i < end; i++)
+			kvm_init_shadow_page(mc->objects[i]);
+	}
+	return 0;
+}
+
 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
 {
 	int r;
@@ -669,8 +705,7 @@  static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
 				       1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
 	if (r)
 		return r;
-	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
-				       PT64_ROOT_MAX_LEVEL);
+	r = mmu_topup_shadow_page_cache(vcpu);
 	if (r)
 		return r;
 	if (maybe_indirect) {
@@ -3041,7 +3076,7 @@  static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 	struct kvm_shadow_walk_iterator iterator;
 	struct kvm_mmu_page *sp;
 	int ret = RET_PF_INVALID;
-	u64 spte = 0ull;
+	u64 spte = shadow_init_value;
 	uint retry_count = 0;
 
 	if (!page_fault_can_be_fast(error_code))
@@ -5383,7 +5418,8 @@  int kvm_mmu_create(struct kvm_vcpu *vcpu)
 	vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
 	vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
 
-	vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
+	if (!shadow_init_value)
+		vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
 
 	vcpu->arch.mmu = &vcpu->arch.root_mmu;
 	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 66d43cec0c31..0b931f1c2210 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -34,6 +34,7 @@  u64 __read_mostly shadow_mmio_access_mask;
 u64 __read_mostly shadow_present_mask;
 u64 __read_mostly shadow_me_mask;
 u64 __read_mostly shadow_acc_track_mask;
+u64 __read_mostly shadow_init_value;
 
 u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
 u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
@@ -211,6 +212,14 @@  u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
 	return new_spte;
 }
 
+void kvm_mmu_set_spte_init_value(u64 init_value)
+{
+	if (WARN_ON(!IS_ENABLED(CONFIG_X86_64) && init_value))
+		init_value = 0;
+	shadow_init_value = init_value;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_spte_init_value);
+
 static u8 kvm_get_shadow_phys_bits(void)
 {
 	/*
@@ -355,6 +364,7 @@  void kvm_mmu_reset_all_pte_masks(void)
 	shadow_present_mask	= PT_PRESENT_MASK;
 	shadow_acc_track_mask	= 0;
 	shadow_me_mask		= sme_me_mask;
+	shadow_init_value	= 0;
 
 	shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITEABLE;
 	shadow_mmu_writable_mask  = DEFAULT_SPTE_MMU_WRITEABLE;
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index bca0ba11cccf..f88cf3db31c7 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -152,6 +152,8 @@  extern u64 __read_mostly shadow_mmio_access_mask;
 extern u64 __read_mostly shadow_present_mask;
 extern u64 __read_mostly shadow_me_mask;
 
+extern u64 __read_mostly shadow_init_value;
+
 /*
  * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED_MASK;
  * shadow_acc_track_mask is the set of bits to be cleared in non-accessed