
[v2,10/10] KVM: VMX: Track PGD instead of EPTP for paravirt Hyper-V TLB flush

Message ID 20201020215613.8972-11-sean.j.christopherson@intel.com (mailing list archive)
State New, archived

Commit Message

Sean Christopherson Oct. 20, 2020, 9:56 p.m. UTC
Track the address of the top-level EPT struct, a.k.a. the PGD, instead
of the EPTP itself for Hyper-V's paravirt TLB flush.  The paravirt API
takes the PGD, not the EPTP, and in theory tracking the EPTP could lead
to false negatives, e.g. if the PGD matches but the attributes in the
EPTP do not.  In practice, such a mismatch is extremely unlikely, if not
flat out impossible, given how KVM generates the EPTP.
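
For reference, the EPTP is the page-aligned PGD with VMX configuration
packed into the low bits; below is a simplified sketch of what
construct_eptp() does (the VMX_EPTP_* defines are the real ones from
asm/vmx.h, the helper name here is illustrative):

/*
 * Simplified sketch of KVM's construct_eptp(): the EPTP is the PGD
 * (PML4/PML5 physical address) plus configuration in bits 6:0.
 */
static u64 eptp_from_pgd(u64 pgd, int level, bool ad_enabled)
{
	u64 eptp = VMX_EPTP_MT_WB;		/* bits 2:0: memory type, write-back */

	/* Bits 5:3: EPT page-walk length minus one. */
	eptp |= (level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;

	if (ad_enabled)
		eptp |= VMX_EPTP_AD_ENABLE_BIT;	/* bit 6: A/D flag tracking */

	return eptp | (pgd & PAGE_MASK);	/* bits 51:12: the PGD itself */
}

Two vCPUs can thus report different EPTPs for the same PGD only if they
disagree in bits 6:0, which is the theoretical false negative that comes
from matching on the full EPTP.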

Opportunistically rename the related fields to use the 'pgd'
nomenclature, and to prefix them with 'hv_tlb' to connect them to
Hyper-V's paravirt TLB flushing.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
---
 arch/x86/kvm/vmx/vmx.c | 53 +++++++++++++++++++-----------------------
 arch/x86/kvm/vmx/vmx.h |  6 ++---
 2 files changed, 27 insertions(+), 32 deletions(-)

Comments

Sean Christopherson Oct. 21, 2020, 5:59 p.m. UTC | #1
On Wed, Oct 21, 2020 at 04:39:28PM +0200, Vitaly Kuznetsov wrote:
> Sean Christopherson <sean.j.christopherson@intel.com> writes:
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > index e0fea09a6e42..89019e6476b3 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -478,18 +478,13 @@ static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush
> >  			range->pages);
> >  }
> >  
> > -static inline int hv_remote_flush_eptp(u64 eptp, struct kvm_tlb_range *range)
> > +static inline int hv_remote_flush_pgd(u64 pgd, struct kvm_tlb_range *range)
> >  {
> > -	/*
> > -	 * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs address
> > -	 * of the base of EPT PML4 table, strip off EPT configuration
> > -	 * information.
> > -	 */
> >  	if (range)
> > -		return hyperv_flush_guest_mapping_range(eptp & PAGE_MASK,
> > +		return hyperv_flush_guest_mapping_range(pgd,
> >  				kvm_fill_hv_flush_list_func, (void *)range);
> >  	else
> > -		return hyperv_flush_guest_mapping(eptp & PAGE_MASK);
> > +		return hyperv_flush_guest_mapping(pgd);
> >  }
> 
> (I'm probably missing something, please bear with me -- this is the last
> patch of the series after all :-) but the PGD that comes from
> kvm_mmu_load_pgd() has the PCID bits encoded, and you're dropping
> '&PAGE_MASK' here ...

...

> > @@ -564,17 +559,17 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
> >  
> >  #endif /* IS_ENABLED(CONFIG_HYPERV) */
> >  
> > -static void hv_load_mmu_eptp(struct kvm_vcpu *vcpu, u64 eptp)
> > +static void hv_load_mmu_pgd(struct kvm_vcpu *vcpu, u64 pgd)
> >  {
> >  #if IS_ENABLED(CONFIG_HYPERV)
> >  	struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
> >  
> >  	if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) {
> > -		spin_lock(&kvm_vmx->ept_pointer_lock);
> > -		to_vmx(vcpu)->ept_pointer = eptp;
> > -		if (eptp != kvm_vmx->hv_tlb_eptp)
> > -			kvm_vmx->hv_tlb_eptp = INVALID_PAGE;
> > -		spin_unlock(&kvm_vmx->ept_pointer_lock);
> > +		spin_lock(&kvm_vmx->hv_tlb_pgd_lock);
> > +		to_vmx(vcpu)->hv_tlb_pgd = pgd;
> > +		if (pgd != kvm_vmx->hv_tlb_pgd)
> > +			kvm_vmx->hv_tlb_pgd = INVALID_PAGE;
> > +		spin_unlock(&kvm_vmx->hv_tlb_pgd_lock);
> >  	}
> >  #endif
> >  }
> > @@ -3059,7 +3054,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
> >  		eptp = construct_eptp(vcpu, pgd, pgd_level);
> >  		vmcs_write64(EPT_POINTER, eptp);
> >  
> > -		hv_load_mmu_eptp(vcpu, eptp);
> > +		hv_load_mmu_pgd(vcpu, pgd);
> 
> ... and not adding it here. (construct_eptp() seems to drop PCID bits
> but add its own stuff). Is this on purpose?

No, I completely forgot KVM crams the PCID bits into pgd.  I think I'll add
a patch to rework .load_mmu_pgd() to move the PCID bits to a separate param,
and change construct_eptp() to do WARN_ON_ONCE(pgd & ~PAGE_MASK).

Actually, I think it makes more sense to have VMX and SVM grab the PCID via
kvm_get_active_pcid(vcpu) when necessary.  For EPTP, getting the PCID bits may
unnecessarily read CR3 from the VMCS.
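
For reference, trimmed sketches of the helpers in question as of this
series (from arch/x86/kvm/mmu.h and kvm_cache_regs.h; abbreviated, so
treat the details as approximate):

/* May VMREAD GUEST_CR3 if the register cache is stale. */
static inline unsigned long kvm_read_cr3(struct kvm_vcpu *vcpu)
{
	if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
		kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR3);
	return vcpu->arch.cr3;
}

/* PCID = CR3[11:0], and only when CR4.PCIDE=1. */
static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu)
{
	return kvm_get_pcid(vcpu, kvm_read_cr3(vcpu));
}

/* The sole .load_mmu_pgd() caller, which crams the PCID bits into pgd. */
static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
{
	u64 root_hpa = vcpu->arch.mmu->root_hpa;

	if (!VALID_PAGE(root_hpa))
		return;

	kvm_x86_ops.load_mmu_pgd(vcpu, root_hpa | kvm_get_active_pcid(vcpu),
				 vcpu->arch.mmu->shadow_root_level);
}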

Ugh, which brings up another issue.  I'm pretty sure the "vmcs01.GUEST_CR3 is
already up-to-date" is dead code:

		if (!enable_unrestricted_guest && !is_paging(vcpu))
			guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
		else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
			guest_cr3 = vcpu->arch.cr3;
		else /* vmcs01.GUEST_CR3 is already up-to-date. */
			update_guest_cr3 = false;
		vmx_ept_load_pdptrs(vcpu);

The sole caller of .load_mmu_pgd() always invokes kvm_get_active_pcid(), which
in turn always does kvm_read_cr3(), i.e. CR3 will always be available.

So yeah, I think moving kvm_get_active_pcid() into VMX/SVM is the right approach.
I'll rename "pgd" to "root_hpa" and "pgd_level" to "root_level" so that we
don't end up with inconsistencies, e.g. where pgd may or may not contain PCID
bits.
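
Roughly something like this on the VMX side (a hypothetical sketch of the
proposed rework, not the final patch):

static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
			     int root_level)
{
	struct kvm *kvm = vcpu->kvm;
	bool update_guest_cr3 = true;
	unsigned long guest_cr3;
	u64 eptp;

	if (enable_ept) {
		/* root_hpa is now a bare page-aligned address. */
		eptp = construct_eptp(vcpu, root_hpa, root_level);
		vmcs_write64(EPT_POINTER, eptp);

		hv_load_mmu_pgd(vcpu, root_hpa);

		if (!enable_unrestricted_guest && !is_paging(vcpu))
			guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
		else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
			guest_cr3 = vcpu->arch.cr3;
		else /* vmcs01.GUEST_CR3 is already up-to-date. */
			update_guest_cr3 = false;
		vmx_ept_load_pdptrs(vcpu);
	} else {
		/* Shadow paging: read CR3 for the PCID only when needed. */
		guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu);
	}

	if (update_guest_cr3)
		vmcs_writel(GUEST_CR3, guest_cr3);
}

Note that once the caller no longer reads CR3 unconditionally, the
"already up-to-date" branch would become reachable again, i.e. it's only
dead code in the current arrangement.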

Nice catch!

Patch

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index e0fea09a6e42..89019e6476b3 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -478,18 +478,13 @@ static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush
 			range->pages);
 }
 
-static inline int hv_remote_flush_eptp(u64 eptp, struct kvm_tlb_range *range)
+static inline int hv_remote_flush_pgd(u64 pgd, struct kvm_tlb_range *range)
 {
-	/*
-	 * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs address
-	 * of the base of EPT PML4 table, strip off EPT configuration
-	 * information.
-	 */
 	if (range)
-		return hyperv_flush_guest_mapping_range(eptp & PAGE_MASK,
+		return hyperv_flush_guest_mapping_range(pgd,
 				kvm_fill_hv_flush_list_func, (void *)range);
 	else
-		return hyperv_flush_guest_mapping(eptp & PAGE_MASK);
+		return hyperv_flush_guest_mapping(pgd);
 }
 
 static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
@@ -499,37 +494,37 @@ static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
 	struct kvm_vcpu *vcpu;
 	int ret = 0, i;
 	bool mismatch;
-	u64 tmp_eptp;
+	u64 tmp_pgd;
 
-	spin_lock(&kvm_vmx->ept_pointer_lock);
+	spin_lock(&kvm_vmx->hv_tlb_pgd_lock);
 
-	if (!VALID_PAGE(kvm_vmx->hv_tlb_eptp)) {
+	if (!VALID_PAGE(kvm_vmx->hv_tlb_pgd)) {
 		mismatch = false;
 
 		kvm_for_each_vcpu(i, vcpu, kvm) {
-			tmp_eptp = to_vmx(vcpu)->ept_pointer;
-			if (!VALID_PAGE(tmp_eptp) ||
-			    tmp_eptp == kvm_vmx->hv_tlb_eptp)
+			tmp_pgd = to_vmx(vcpu)->hv_tlb_pgd;
+			if (!VALID_PAGE(tmp_pgd) ||
+			    tmp_pgd == kvm_vmx->hv_tlb_pgd)
 				continue;
 
-			if (!VALID_PAGE(kvm_vmx->hv_tlb_eptp))
-				kvm_vmx->hv_tlb_eptp = tmp_eptp;
+			if (!VALID_PAGE(kvm_vmx->hv_tlb_pgd))
+				kvm_vmx->hv_tlb_pgd = tmp_pgd;
 			else
 				mismatch = true;
 
 			if (!ret)
-				ret = hv_remote_flush_eptp(tmp_eptp, range);
+				ret = hv_remote_flush_pgd(tmp_pgd, range);
 
 			if (ret && mismatch)
 				break;
 		}
 		if (mismatch)
-			kvm_vmx->hv_tlb_eptp = INVALID_PAGE;
+			kvm_vmx->hv_tlb_pgd = INVALID_PAGE;
 	} else {
-		ret = hv_remote_flush_eptp(kvm_vmx->hv_tlb_eptp, range);
+		ret = hv_remote_flush_pgd(kvm_vmx->hv_tlb_pgd, range);
 	}
 
-	spin_unlock(&kvm_vmx->ept_pointer_lock);
+	spin_unlock(&kvm_vmx->hv_tlb_pgd_lock);
 	return ret;
 }
 static int hv_remote_flush_tlb(struct kvm *kvm)
@@ -564,17 +559,17 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
 
 #endif /* IS_ENABLED(CONFIG_HYPERV) */
 
-static void hv_load_mmu_eptp(struct kvm_vcpu *vcpu, u64 eptp)
+static void hv_load_mmu_pgd(struct kvm_vcpu *vcpu, u64 pgd)
 {
 #if IS_ENABLED(CONFIG_HYPERV)
 	struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
 
 	if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) {
-		spin_lock(&kvm_vmx->ept_pointer_lock);
-		to_vmx(vcpu)->ept_pointer = eptp;
-		if (eptp != kvm_vmx->hv_tlb_eptp)
-			kvm_vmx->hv_tlb_eptp = INVALID_PAGE;
-		spin_unlock(&kvm_vmx->ept_pointer_lock);
+		spin_lock(&kvm_vmx->hv_tlb_pgd_lock);
+		to_vmx(vcpu)->hv_tlb_pgd = pgd;
+		if (pgd != kvm_vmx->hv_tlb_pgd)
+			kvm_vmx->hv_tlb_pgd = INVALID_PAGE;
+		spin_unlock(&kvm_vmx->hv_tlb_pgd_lock);
 	}
 #endif
 }
@@ -3059,7 +3054,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
 		eptp = construct_eptp(vcpu, pgd, pgd_level);
 		vmcs_write64(EPT_POINTER, eptp);
 
-		hv_load_mmu_eptp(vcpu, eptp);
+		hv_load_mmu_pgd(vcpu, pgd);
 
 		if (!enable_unrestricted_guest && !is_paging(vcpu))
 			guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
@@ -6938,7 +6933,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
 	vmx->pi_desc.sn = 1;
 
 #if IS_ENABLED(CONFIG_HYPERV)
-	vmx->ept_pointer = INVALID_PAGE;
+	vmx->hv_tlb_pgd = INVALID_PAGE;
 #endif
 	return 0;
 
@@ -6957,7 +6952,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
 static int vmx_vm_init(struct kvm *kvm)
 {
 #if IS_ENABLED(CONFIG_HYPERV)
-	spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
+	spin_lock_init(&to_kvm_vmx(kvm)->hv_tlb_pgd_lock);
 #endif
 
 	if (!ple_gap)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 1b8c08e483cd..4d61fb8e8d9d 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -277,7 +277,7 @@ struct vcpu_vmx {
 	u64 msr_ia32_feature_control;
 	u64 msr_ia32_feature_control_valid_bits;
 #if IS_ENABLED(CONFIG_HYPERV)
-	u64 ept_pointer;
+	u64 hv_tlb_pgd;
 #endif
 
 	struct pt_desc pt_desc;
@@ -298,8 +298,8 @@ struct kvm_vmx {
 	gpa_t ept_identity_map_addr;
 
 #if IS_ENABLED(CONFIG_HYPERV)
-	hpa_t hv_tlb_eptp;
-	spinlock_t ept_pointer_lock;
+	hpa_t hv_tlb_pgd;
+	spinlock_t hv_tlb_pgd_lock;
 #endif
 };