
[3/4] KVM: x86/mmu: Add shadow mask for effective host MTRR memtype

Message ID 20220715230016.3762909-4-seanjc@google.com (mailing list archive)
State New, archived
Series KVM: x86/mmu: Memtype related cleanups

Commit Message

Sean Christopherson July 15, 2022, 11 p.m. UTC
Add shadow_memtype_mask to capture that EPT needs a non-zero memtype mask
instead of relying on TDP being enabled, as NPT doesn't need a non-zero
mask.  This is a glorified nop as kvm_x86_ops.get_mt_mask() returns zero
for NPT anyways.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/mmu/spte.c | 21 ++++++++++++++++++---
 arch/x86/kvm/mmu/spte.h |  1 +
 2 files changed, 19 insertions(+), 3 deletions(-)

Comments

Maxim Levitsky July 18, 2022, 12:08 p.m. UTC | #1
On Fri, 2022-07-15 at 23:00 +0000, Sean Christopherson wrote:
> Add shadow_memtype_mask to capture that EPT needs a non-zero memtype mask
> instead of relying on TDP being enabled, as NPT doesn't need a non-zero
> mask.  This is a glorified nop as kvm_x86_ops.get_mt_mask() returns zero
> for NPT anyways.
> 
> No functional change intended.
> 
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  arch/x86/kvm/mmu/spte.c | 21 ++++++++++++++++++---
>  arch/x86/kvm/mmu/spte.h |  1 +
>  2 files changed, 19 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
> index fb1f17504138..7314d27d57a4 100644
> --- a/arch/x86/kvm/mmu/spte.c
> +++ b/arch/x86/kvm/mmu/spte.c
> @@ -33,6 +33,7 @@ u64 __read_mostly shadow_mmio_value;
>  u64 __read_mostly shadow_mmio_mask;
>  u64 __read_mostly shadow_mmio_access_mask;
>  u64 __read_mostly shadow_present_mask;
> +u64 __read_mostly shadow_memtype_mask;
>  u64 __read_mostly shadow_me_value;
>  u64 __read_mostly shadow_me_mask;
>  u64 __read_mostly shadow_acc_track_mask;
> @@ -161,10 +162,10 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
>  
>         if (level > PG_LEVEL_4K)
>                 spte |= PT_PAGE_SIZE_MASK;
> -       if (tdp_enabled)
> +
> +       if (shadow_memtype_mask)
>                 spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn,
> -                       kvm_is_mmio_pfn(pfn));
> -
> +                                                        kvm_is_mmio_pfn(pfn));
>         if (host_writable)
>                 spte |= shadow_host_writable_mask;
>         else
> @@ -391,6 +392,13 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
>         shadow_nx_mask          = 0ull;
>         shadow_x_mask           = VMX_EPT_EXECUTABLE_MASK;
>         shadow_present_mask     = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK;
> +       /*
> +        * EPT overrides the host MTRRs, and so KVM must program the desired
> +        * memtype directly into the SPTEs.  Note, this mask is just the mask
> +        * of all bits that factor into the memtype, the actual memtype must be
> +        * dynamically calculated, e.g. to ensure host MMIO is mapped UC.
> +        */
> +       shadow_memtype_mask     = VMX_EPT_MT_MASK | VMX_EPT_IPAT_BIT;
>         shadow_acc_track_mask   = VMX_EPT_RWX_MASK;
>         shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE;
>         shadow_mmu_writable_mask  = EPT_SPTE_MMU_WRITABLE;
> @@ -441,6 +449,13 @@ void kvm_mmu_reset_all_pte_masks(void)
>         shadow_nx_mask          = PT64_NX_MASK;
>         shadow_x_mask           = 0;
>         shadow_present_mask     = PT_PRESENT_MASK;
> +
> +       /*
> +        * For shadow paging and NPT, KVM uses PAT entry '0' to encode WB
> +        * memtype in the SPTEs, i.e. relies on host MTRRs to provide the
> +        * correct memtype (WB is the "weakest" memtype).
> +        */
> +       shadow_memtype_mask     = 0;
>         shadow_acc_track_mask   = 0;
>         shadow_me_mask          = 0;
>         shadow_me_value         = 0;
> diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
> index ba3dccb202bc..cabe3fbb4f39 100644
> --- a/arch/x86/kvm/mmu/spte.h
> +++ b/arch/x86/kvm/mmu/spte.h
> @@ -147,6 +147,7 @@ extern u64 __read_mostly shadow_mmio_value;
>  extern u64 __read_mostly shadow_mmio_mask;
>  extern u64 __read_mostly shadow_mmio_access_mask;
>  extern u64 __read_mostly shadow_present_mask;
> +extern u64 __read_mostly shadow_memtype_mask;
>  extern u64 __read_mostly shadow_me_value;
>  extern u64 __read_mostly shadow_me_mask;
>  


So if I understand correctly:


VMX:

- host MTRRs are ignored.

- all *host* MMIO ranges (which can only be VFIO's PCI BARs) are mapped UC in EPT,
 but the guest can override this to WC with its PAT.


- all regular memory is mapped WB and the guest PAT is ignored, unless there is non-coherent DMA
 (an older Intel IOMMU? I think current Intel IOMMUs are coherent?)


- In case of non-coherent DMA, guest MTRRs and PAT are respected (see the rough sketch below).



SVM:

- host MTRRs are respected, and can enforce UC on *host* MMIO areas.


- WB is always used in NPT, *always*; however, NPT doesn't have the 'IPAT'
 bit, so the guest is free to override it for its MMIO areas to any memory type it wishes,
 using its own PAT, and we do allow the guest to change IA32_PAT to any value it wishes.

 (e.g. VFIO's PCI BARs, memory which a VFIO device needs to access, etc.)

 (This reminds me that PAT is somewhat broken with regard to nesting; we ignore L2's PAT.)
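
To put the VMX side of that into code, this is roughly the decision I have
in mind -- a simplified sketch from memory, with a made-up helper name, not
the literal vmx.c code (it also skips the CR0.CD quirk handling):

	/*
	 * Hypothetical helper, only to illustrate how the bits covered by
	 * shadow_memtype_mask end up being chosen on VMX.
	 */
	static u64 vmx_effective_memtype(struct kvm_vcpu *vcpu, gfn_t gfn,
					 bool is_mmio)
	{
		/*
		 * Host MMIO is mapped UC, but IPAT is left clear so the
		 * guest's PAT can still override it, e.g. to WC.
		 */
		if (is_mmio)
			return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;

		/* No non-coherent DMA: force WB and ignore guest PAT via IPAT. */
		if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
			return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) |
			       VMX_EPT_IPAT_BIT;

		/*
		 * Non-coherent DMA: honor the guest's MTRRs (and its PAT,
		 * since IPAT stays clear).
		 */
		return kvm_mtrr_get_guest_memory_type(vcpu, gfn) <<
		       VMX_EPT_MT_EPTE_SHIFT;
	}

On the NPT side there is nothing equivalent to sketch: as the commit message
says, the callback just returns zero there, so the SPTEs carry no memtype
bits at all and the host MTRRs plus guest PAT do the work.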


With all this said, it makes sense.


Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>

Best regards,
	Maxim Levitsky
Sean Christopherson July 18, 2022, 4:07 p.m. UTC | #2
On Mon, Jul 18, 2022, Maxim Levitsky wrote:
> On Fri, 2022-07-15 at 23:00 +0000, Sean Christopherson wrote:
> > diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
> > index ba3dccb202bc..cabe3fbb4f39 100644
> > --- a/arch/x86/kvm/mmu/spte.h
> > +++ b/arch/x86/kvm/mmu/spte.h
> > @@ -147,6 +147,7 @@ extern u64 __read_mostly shadow_mmio_value;
> >  extern u64 __read_mostly shadow_mmio_mask;
> >  extern u64 __read_mostly shadow_mmio_access_mask;
> >  extern u64 __read_mostly shadow_present_mask;
> > +extern u64 __read_mostly shadow_memtype_mask;
> >  extern u64 __read_mostly shadow_me_value;
> >  extern u64 __read_mostly shadow_me_mask;
> >  
> 
> 
> So if I understand correctly:
> 
> 
> VMX:
> 
> - host MTRRs are ignored.
> 
> - all *host* MMIO ranges (which can only be VFIO's PCI BARs) are mapped UC in EPT,
>  but the guest can override this to WC with its PAT.
> 
> 
> - all regular memory is mapped WB and the guest PAT is ignored, unless there is non-coherent DMA
>  (an older Intel IOMMU? I think current Intel IOMMUs are coherent?)

Effectively, yes.

My understanding is that on x86, everything is cache-coherent by default, but devices
can set a no-snoop flag, which breaks cache coherency.  But then the IOMMU, except for
old Intel IOMMUs, can block such packets, and VFIO forces the block setting in the IOMMU
when it's supported by hardware.

Note, at first glance, commit e8ae0e140c05 ("vfio: Require that devices support DMA
cache coherence") makes it seem like exposing non-coherent DMA to KVM is impossible,
but IIUC that's just enforcing that the _default_ device behavior provides coherency.
I.e. VFIO will still allow an old Intel IOMMU plus a device that sets no-snoop.
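
For reference, the plumbing KVM uses to track that case is just a counter
that the kvm-vfio device bumps when a non-coherent IOMMU domain is attached;
roughly (paraphrasing from memory, not a verbatim copy of arch/x86/kvm/x86.c):

	void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
	{
		atomic_inc(&kvm->arch.noncoherent_dma_count);
	}

	void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
	{
		atomic_dec(&kvm->arch.noncoherent_dma_count);
	}

	/* Checked on the VMX side when computing the EPT memtype. */
	bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
	{
		return atomic_read(&kvm->arch.noncoherent_dma_count);
	}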

Patch

diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index fb1f17504138..7314d27d57a4 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -33,6 +33,7 @@  u64 __read_mostly shadow_mmio_value;
 u64 __read_mostly shadow_mmio_mask;
 u64 __read_mostly shadow_mmio_access_mask;
 u64 __read_mostly shadow_present_mask;
+u64 __read_mostly shadow_memtype_mask;
 u64 __read_mostly shadow_me_value;
 u64 __read_mostly shadow_me_mask;
 u64 __read_mostly shadow_acc_track_mask;
@@ -161,10 +162,10 @@  bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 
 	if (level > PG_LEVEL_4K)
 		spte |= PT_PAGE_SIZE_MASK;
-	if (tdp_enabled)
+
+	if (shadow_memtype_mask)
 		spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn,
-			kvm_is_mmio_pfn(pfn));
-
+							 kvm_is_mmio_pfn(pfn));
 	if (host_writable)
 		spte |= shadow_host_writable_mask;
 	else
@@ -391,6 +392,13 @@  void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
 	shadow_nx_mask		= 0ull;
 	shadow_x_mask		= VMX_EPT_EXECUTABLE_MASK;
 	shadow_present_mask	= has_exec_only ? 0ull : VMX_EPT_READABLE_MASK;
+	/*
+	 * EPT overrides the host MTRRs, and so KVM must program the desired
+	 * memtype directly into the SPTEs.  Note, this mask is just the mask
+	 * of all bits that factor into the memtype, the actual memtype must be
+	 * dynamically calculated, e.g. to ensure host MMIO is mapped UC.
+	 */
+	shadow_memtype_mask	= VMX_EPT_MT_MASK | VMX_EPT_IPAT_BIT;
 	shadow_acc_track_mask	= VMX_EPT_RWX_MASK;
 	shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE;
 	shadow_mmu_writable_mask  = EPT_SPTE_MMU_WRITABLE;
@@ -441,6 +449,13 @@  void kvm_mmu_reset_all_pte_masks(void)
 	shadow_nx_mask		= PT64_NX_MASK;
 	shadow_x_mask		= 0;
 	shadow_present_mask	= PT_PRESENT_MASK;
+
+	/*
+	 * For shadow paging and NPT, KVM uses PAT entry '0' to encode WB
+	 * memtype in the SPTEs, i.e. relies on host MTRRs to provide the
+	 * correct memtype (WB is the "weakest" memtype).
+	 */
+	shadow_memtype_mask	= 0;
 	shadow_acc_track_mask	= 0;
 	shadow_me_mask		= 0;
 	shadow_me_value		= 0;
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index ba3dccb202bc..cabe3fbb4f39 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -147,6 +147,7 @@  extern u64 __read_mostly shadow_mmio_value;
 extern u64 __read_mostly shadow_mmio_mask;
 extern u64 __read_mostly shadow_mmio_access_mask;
 extern u64 __read_mostly shadow_present_mask;
+extern u64 __read_mostly shadow_memtype_mask;
 extern u64 __read_mostly shadow_me_value;
 extern u64 __read_mostly shadow_me_mask;