diff mbox

KVM: Enable snooping control for supported hardware

Message ID 1240476182-26254-1-git-send-email-sheng@linux.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Sheng Yang April 23, 2009, 8:43 a.m. UTC
Memory aliases with different memory types are a problem for the guest. For a guest
without an assigned device, the memory type of guest memory is always the
same as the host's (WB); but with an assigned device, some memory may be used
for DMA and then set to an uncacheable memory type (UC/WC), which conflicts with the
host memory type and is therefore a potential issue.

Snooping control can guarantee the cache correctness of accessing the DMA engine
of VT-d.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
 arch/x86/include/asm/kvm_host.h |    2 ++
 arch/x86/kvm/mmu.c              |   23 ++++++++++++++++++++---
 virt/kvm/iommu.c                |   27 ++++++++++++++++++++++++---
 3 files changed, 46 insertions(+), 6 deletions(-)

Comments

Marcelo Tosatti April 24, 2009, 10:37 p.m. UTC | #1
Sheng,

On Thu, Apr 23, 2009 at 04:43:02PM +0800, Sheng Yang wrote:
> Memory aliases with different memory type is a problem for guest. For the guest
> without assigned device, the memory type of guest memory would always been the
> same as host(WB); but for the assigned device, some part of memory may be used
> as DMA and then set to uncacheable memory type(UC/WC), which would be a conflict of
> host memory type then be a potential issue.

So the issue arises when the host attempts to access the DMA memory
(which is typed as uncached by the guest) with its WB typing (MTRR).
That would mean that host accesses of such memory are potentially stale?

Because "24.3.2 Creating and Using Cached Translation Information"
mentions that with EPT enabled the guest will only use translations that
are EPTP-tagged, therefore with the correct (UC) typing, which you set
in the EPT pagetable?

More comments below.

> Snooping control can guarantee the cache correctness of accessing the DMA engine
> of VT-d.
> 
> Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> ---
>  arch/x86/include/asm/kvm_host.h |    2 ++
>  arch/x86/kvm/mmu.c              |   23 ++++++++++++++++++++---
>  virt/kvm/iommu.c                |   27 ++++++++++++++++++++++++---
>  3 files changed, 46 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3fc4623..d2da40f 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -397,6 +397,8 @@ struct kvm_arch{
>  	struct list_head assigned_dev_head;
>  	struct list_head oos_global_pages;
>  	struct iommu_domain *iommu_domain;
> +#define KVM_IOMMU_CACHE_COHERENCY	0x1
> +	int iommu_flags;
>  	struct kvm_pic *vpic;
>  	struct kvm_ioapic *vioapic;
>  	struct kvm_pit *vpit;
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 409d08e..1e63a87 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1713,10 +1713,27 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
>  	if (largepage)
>  		spte |= PT_PAGE_SIZE_MASK;
>  	if (mt_mask) {
> +		/* For VT-d and EPT combination
> +		 * 1. MMIO: always map as UC
> +		 * 2. EPT without VT-d: always map as WB and set IGMT=1 to
> +		 *    keep consistent with host MTRR
> +		 * 3. EPT with VT-d:
> +		 *   a. VT-d with snooping control feature: keep consistent
> +		 *	with host MTRR can guarantee the correctness
> +		 *   b. VT-d without snooping control feature: can't
> +		 *      guarantee the result, try to trust guest.
> +		 */
>  		if (!kvm_is_mmio_pfn(pfn)) {
> -			mt_mask = get_memory_type(vcpu, gfn) <<
> -				kvm_x86_ops->get_mt_mask_shift();
> -			mt_mask |= VMX_EPT_IGMT_BIT;
> +			if (vcpu->kvm->arch.iommu_domain &&
> +			    !(vcpu->kvm->arch.iommu_flags &
> +				    KVM_IOMMU_CACHE_COHERENCY)) {
> +				mt_mask = get_memory_type(vcpu, gfn) <<
> +					  kvm_x86_ops->get_mt_mask_shift();
> +			} else {
> +				mt_mask = MTRR_TYPE_WRBACK <<
> +					kvm_x86_ops->get_mt_mask_shift();
> +				mt_mask |= VMX_EPT_IGMT_BIT;
> +			}
>  		} else
>  			mt_mask = MTRR_TYPE_UNCACHABLE <<
>  				kvm_x86_ops->get_mt_mask_shift();

Can you move this to subarch code? 

Perhaps replace get_mt_mask_shift with 

    u64 (*get_spte_mt_mask)(bool is_iommu_page);

And then just do

    if (mt_mask)
        spte |= kvm_x86_ops->get_spte_mt_mask();


> diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
> index 4c40375..1514758 100644
> --- a/virt/kvm/iommu.c
> +++ b/virt/kvm/iommu.c
> @@ -39,11 +39,16 @@ int kvm_iommu_map_pages(struct kvm *kvm,
>  	pfn_t pfn;
>  	int i, r = 0;
>  	struct iommu_domain *domain = kvm->arch.iommu_domain;
> +	int flags;
>  
>  	/* check if iommu exists and in use */
>  	if (!domain)
>  		return 0;
>  
> +	flags = IOMMU_READ | IOMMU_WRITE;
> +	if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)
> +		flags |= IOMMU_CACHE;
> +
>  	for (i = 0; i < npages; i++) {
>  		/* check if already mapped */
>  		if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn)))
> @@ -53,8 +58,7 @@ int kvm_iommu_map_pages(struct kvm *kvm,
>  		r = iommu_map_range(domain,
>  				    gfn_to_gpa(gfn),
>  				    pfn_to_hpa(pfn),
> -				    PAGE_SIZE,
> -				    IOMMU_READ | IOMMU_WRITE);
> +				    PAGE_SIZE, flags);
>  		if (r) {
>  			printk(KERN_ERR "kvm_iommu_map_address:"
>  			       "iommu failed to map pfn=%lx\n", pfn);
> @@ -88,7 +92,7 @@ int kvm_assign_device(struct kvm *kvm,
>  {
>  	struct pci_dev *pdev = NULL;
>  	struct iommu_domain *domain = kvm->arch.iommu_domain;
> -	int r;
> +	int r, last_flags;
>  
>  	/* check if iommu exists and in use */
>  	if (!domain)
> @@ -107,12 +111,29 @@ int kvm_assign_device(struct kvm *kvm,
>  		return r;
>  	}
>  
> +	last_flags = kvm->arch.iommu_flags;
> +	if (iommu_domain_has_cap(kvm->arch.iommu_domain,
> +				 IOMMU_CAP_CACHE_COHERENCY))
> +		kvm->arch.iommu_flags |= KVM_IOMMU_CACHE_COHERENCY;
> +
> +	/* Check if need to update IOMMU page table for guest memory */
> +	if ((last_flags ^ kvm->arch.iommu_flags) ==
> +			KVM_IOMMU_CACHE_COHERENCY) {
> +		kvm_iommu_unmap_memslots(kvm);
> +		r = kvm_iommu_map_memslots(kvm);
> +		if (r)
> +			goto out_unmap;
> +	}

You really need to check for a change? How can the IOMMU cache coherency
capability change while a guest is operational?

> +
>  	printk(KERN_DEBUG "assign device: host bdf = %x:%x:%x\n",
>  		assigned_dev->host_busnr,
>  		PCI_SLOT(assigned_dev->host_devfn),
>  		PCI_FUNC(assigned_dev->host_devfn));
>  
>  	return 0;
> +out_unmap:
> +	kvm_iommu_unmap_memslots(kvm);
> +	return r;
>  }
>  
>  int kvm_deassign_device(struct kvm *kvm,
> -- 
> 1.5.4.5
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sheng Yang April 26, 2009, 11:12 a.m. UTC | #2
On Saturday 25 April 2009 06:37:17 Marcelo Tosatti wrote:
 > Sheng,
 >
 > On Thu, Apr 23, 2009 at 04:43:02PM +0800, Sheng Yang wrote:
 > > Memory aliases with different memory type is a problem for guest. For the
 > > guest without assigned device, the memory type of guest memory would
 > > always been the same as host(WB); but for the assigned device, some part
 > > of memory may be used as DMA and then set to uncacheable memory
 > > type(UC/WC), which would be a conflict of host memory type then be a
 > > potential issue.
 >
 > So the issue arises when the host attempts to access the DMA memory
 > (which is typed as uncached by the guest) with its WB typing (MTRR).
 > That would mean that host accesses of such memory are potentially stale?
 
(sorry, resend, HTML is enabled by mistake.)

Yes. In fact it's usually usable, though we still have to deal with some 
exceptions.
 
> Because "24.3.2 Creating and Using Cached Translation Information"
 > mentions that with EPT enabled the guest will only use translations that
 > are EPTP-tagged, therefore with the correct (UC) typing, which you set
 > in the EPT pagetable?
 

With snooping control, we set it to WB, the same as the host, because the IOMMU
engine deals with the cache issue when the memory is used as a DMA address. So direct
access to the memory also won't cause trouble.
 

The guest can set any memory as UC or WC, but we think the only purpose for this
is DMA/MMIO. MMIO addresses can be identified, and we ignore the setting for
DMA (that means WB, as explained above), since it is handled by the IOMMU
engine's snooping control feature.
 

> More comments below.
 >
 > > Snooping control can guarantee the cache correctness of accessing the DMA
 > > engine of VT-d.
 > >
 > > Signed-off-by: Sheng Yang <sheng@linux.intel.com>
 > > ---
 > > arch/x86/include/asm/kvm_host.h | 2 ++
 > > arch/x86/kvm/mmu.c | 23 ++++++++++++++++++++---
 > > virt/kvm/iommu.c | 27 ++++++++++++++++++++++++---
 > > 3 files changed, 46 insertions(+), 6 deletions(-)
 > >
 > > diff --git a/arch/x86/include/asm/kvm_host.h
 > > b/arch/x86/include/asm/kvm_host.h index 3fc4623..d2da40f 100644
 > > --- a/arch/x86/include/asm/kvm_host.h
 > > +++ b/arch/x86/include/asm/kvm_host.h
 > > @@ -397,6 +397,8 @@ struct kvm_arch{
 > > struct list_head assigned_dev_head;
 > > struct list_head oos_global_pages;
 > > struct iommu_domain *iommu_domain;
 > > +#define KVM_IOMMU_CACHE_COHERENCY 0x1
 > > + int iommu_flags;
 > > struct kvm_pic *vpic;
 > > struct kvm_ioapic *vioapic;
 > > struct kvm_pit *vpit;
 > > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
 > > index 409d08e..1e63a87 100644
 > > --- a/arch/x86/kvm/mmu.c
 > > +++ b/arch/x86/kvm/mmu.c
 > > @@ -1713,10 +1713,27 @@ static int set_spte(struct kvm_vcpu *vcpu, u64
 > > *shadow_pte, if (largepage)
 > > spte |= PT_PAGE_SIZE_MASK;
 > > if (mt_mask) {
 > > + /* For VT-d and EPT combination
 > > + * 1. MMIO: always map as UC
 > > + * 2. EPT without VT-d: always map as WB and set IGMT=1 to
 > > + * keep consistent with host MTRR
 > > + * 3. EPT with VT-d:
 > > + * a. VT-d with snooping control feature: keep consistent
 > > + * with host MTRR can guarantee the correctness
 > > + * b. VT-d without snooping control feature: can't
 > > + * guarantee the result, try to trust guest.
 > > + */
 > > if (!kvm_is_mmio_pfn(pfn)) {
 > > - mt_mask = get_memory_type(vcpu, gfn) <<
 > > - kvm_x86_ops->get_mt_mask_shift();
 > > - mt_mask |= VMX_EPT_IGMT_BIT;
 > > + if (vcpu->kvm->arch.iommu_domain &&
 > > + !(vcpu->kvm->arch.iommu_flags &
 > > + KVM_IOMMU_CACHE_COHERENCY)) {
 > > + mt_mask = get_memory_type(vcpu, gfn) <<
 > > + kvm_x86_ops->get_mt_mask_shift();
 > > + } else {
 > > + mt_mask = MTRR_TYPE_WRBACK <<
 > > + kvm_x86_ops->get_mt_mask_shift();
 > > + mt_mask |= VMX_EPT_IGMT_BIT;
 > > + }
 > > } else
 > > mt_mask = MTRR_TYPE_UNCACHABLE <<
 > > kvm_x86_ops->get_mt_mask_shift();
 >
 > Can you move this to subarch code?
 >
 > Perhaps replace get_mt_mask_shift with
 >
 > u64 (*get_spte_mt_mask)(bool is_iommu_page);
 >
 > And then just do
 >
 > if (mt_mask)
 > spte |= kvm_x86_ops->get_spte_mt_mask();
 

Yeah. And sorry, I now remembered that you mentioned it before...
 >
 > > diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
 > > index 4c40375..1514758 100644
 > > --- a/virt/kvm/iommu.c
 > > +++ b/virt/kvm/iommu.c
 > > @@ -39,11 +39,16 @@ int kvm_iommu_map_pages(struct kvm *kvm,
 > > pfn_t pfn;
 > > int i, r = 0;
 > > struct iommu_domain *domain = kvm->arch.iommu_domain;
 > > + int flags;
 > >
 > > /* check if iommu exists and in use */
 > > if (!domain)
 > > return 0;
 > >
 > > + flags = IOMMU_READ | IOMMU_WRITE;
 > > + if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)
 > > + flags |= IOMMU_CACHE;
 > > +
 > > for (i = 0; i < npages; i++) {
 > > /* check if already mapped */
 > > if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn)))
 > > @@ -53,8 +58,7 @@ int kvm_iommu_map_pages(struct kvm *kvm,
 > > r = iommu_map_range(domain,
 > > gfn_to_gpa(gfn),
 > > pfn_to_hpa(pfn),
 > > - PAGE_SIZE,
 > > - IOMMU_READ | IOMMU_WRITE);
 > > + PAGE_SIZE, flags);
 > > if (r) {
 > > printk(KERN_ERR "kvm_iommu_map_address:"
 > > "iommu failed to map pfn=%lx\n", pfn);
 > > @@ -88,7 +92,7 @@ int kvm_assign_device(struct kvm *kvm,
 > > {
 > > struct pci_dev *pdev = NULL;
 > > struct iommu_domain *domain = kvm->arch.iommu_domain;
 > > - int r;
 > > + int r, last_flags;
 > >
 > > /* check if iommu exists and in use */
 > > if (!domain)
 > > @@ -107,12 +111,29 @@ int kvm_assign_device(struct kvm *kvm,
 > > return r;
 > > }
 > >
 > > + last_flags = kvm->arch.iommu_flags;
 > > + if (iommu_domain_has_cap(kvm->arch.iommu_domain,
 > > + IOMMU_CAP_CACHE_COHERENCY))
 > > + kvm->arch.iommu_flags |= KVM_IOMMU_CACHE_COHERENCY;
 > > +
 > > + /* Check if need to update IOMMU page table for guest memory */
 > > + if ((last_flags ^ kvm->arch.iommu_flags) ==
 > > + KVM_IOMMU_CACHE_COHERENCY) {
 > > + kvm_iommu_unmap_memslots(kvm);
 > > + r = kvm_iommu_map_memslots(kvm);
 > > + if (r)
 > > + goto out_unmap;
 > > + }
 >
 > You really need to check for a change? How can the IOMMU cache coherency
 > capability change while a guest is operational?
 

This point is for device assignment (which usually happens during the initialization
stage, but may also be hot-plug, which can happen at any time). Before assignment, we
don't know which devices are for KVM, and therefore don't know which IOMMU engine
they're hooked to, so we also don't know if the engine supports snooping control...
diff mbox

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3fc4623..d2da40f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -397,6 +397,8 @@  struct kvm_arch{
 	struct list_head assigned_dev_head;
 	struct list_head oos_global_pages;
 	struct iommu_domain *iommu_domain;
+#define KVM_IOMMU_CACHE_COHERENCY	0x1
+	int iommu_flags;
 	struct kvm_pic *vpic;
 	struct kvm_ioapic *vioapic;
 	struct kvm_pit *vpit;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 409d08e..1e63a87 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1713,10 +1713,27 @@  static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 	if (largepage)
 		spte |= PT_PAGE_SIZE_MASK;
 	if (mt_mask) {
+		/* For VT-d and EPT combination
+		 * 1. MMIO: always map as UC
+		 * 2. EPT without VT-d: always map as WB and set IGMT=1 to
+		 *    keep consistent with host MTRR
+		 * 3. EPT with VT-d:
+		 *   a. VT-d with snooping control feature: keep consistent
+		 *	with host MTRR can guarantee the correctness
+		 *   b. VT-d without snooping control feature: can't
+		 *      guarantee the result, try to trust guest.
+		 */
 		if (!kvm_is_mmio_pfn(pfn)) {
-			mt_mask = get_memory_type(vcpu, gfn) <<
-				kvm_x86_ops->get_mt_mask_shift();
-			mt_mask |= VMX_EPT_IGMT_BIT;
+			if (vcpu->kvm->arch.iommu_domain &&
+			    !(vcpu->kvm->arch.iommu_flags &
+				    KVM_IOMMU_CACHE_COHERENCY)) {
+				mt_mask = get_memory_type(vcpu, gfn) <<
+					  kvm_x86_ops->get_mt_mask_shift();
+			} else {
+				mt_mask = MTRR_TYPE_WRBACK <<
+					kvm_x86_ops->get_mt_mask_shift();
+				mt_mask |= VMX_EPT_IGMT_BIT;
+			}
 		} else
 			mt_mask = MTRR_TYPE_UNCACHABLE <<
 				kvm_x86_ops->get_mt_mask_shift();
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 4c40375..1514758 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -39,11 +39,16 @@  int kvm_iommu_map_pages(struct kvm *kvm,
 	pfn_t pfn;
 	int i, r = 0;
 	struct iommu_domain *domain = kvm->arch.iommu_domain;
+	int flags;
 
 	/* check if iommu exists and in use */
 	if (!domain)
 		return 0;
 
+	flags = IOMMU_READ | IOMMU_WRITE;
+	if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)
+		flags |= IOMMU_CACHE;
+
 	for (i = 0; i < npages; i++) {
 		/* check if already mapped */
 		if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn)))
@@ -53,8 +58,7 @@  int kvm_iommu_map_pages(struct kvm *kvm,
 		r = iommu_map_range(domain,
 				    gfn_to_gpa(gfn),
 				    pfn_to_hpa(pfn),
-				    PAGE_SIZE,
-				    IOMMU_READ | IOMMU_WRITE);
+				    PAGE_SIZE, flags);
 		if (r) {
 			printk(KERN_ERR "kvm_iommu_map_address:"
 			       "iommu failed to map pfn=%lx\n", pfn);
@@ -88,7 +92,7 @@  int kvm_assign_device(struct kvm *kvm,
 {
 	struct pci_dev *pdev = NULL;
 	struct iommu_domain *domain = kvm->arch.iommu_domain;
-	int r;
+	int r, last_flags;
 
 	/* check if iommu exists and in use */
 	if (!domain)
@@ -107,12 +111,29 @@  int kvm_assign_device(struct kvm *kvm,
 		return r;
 	}
 
+	last_flags = kvm->arch.iommu_flags;
+	if (iommu_domain_has_cap(kvm->arch.iommu_domain,
+				 IOMMU_CAP_CACHE_COHERENCY))
+		kvm->arch.iommu_flags |= KVM_IOMMU_CACHE_COHERENCY;
+
+	/* Check if need to update IOMMU page table for guest memory */
+	if ((last_flags ^ kvm->arch.iommu_flags) ==
+			KVM_IOMMU_CACHE_COHERENCY) {
+		kvm_iommu_unmap_memslots(kvm);
+		r = kvm_iommu_map_memslots(kvm);
+		if (r)
+			goto out_unmap;
+	}
+
 	printk(KERN_DEBUG "assign device: host bdf = %x:%x:%x\n",
 		assigned_dev->host_busnr,
 		PCI_SLOT(assigned_dev->host_devfn),
 		PCI_FUNC(assigned_dev->host_devfn));
 
 	return 0;
+out_unmap:
+	kvm_iommu_unmap_memslots(kvm);
+	return r;
 }
 
 int kvm_deassign_device(struct kvm *kvm,