diff mbox

[v13,3/3] IOMMU: fix vt-d Device-TLB flush timeout issue

Message ID 1467179974-57317-4-git-send-email-quan.xu@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Quan Xu June 29, 2016, 5:59 a.m. UTC
From: Quan Xu <quan.xu@intel.com>

If Device-TLB flush timed out, we hide the target ATS device
immediately. By hiding the device, we make sure it can't be
assigned to any domain any longer (see device_assigned).

Signed-off-by: Quan Xu <quan.xu@intel.com>

CC: Jan Beulich <jbeulich@suse.com>
CC: Kevin Tian <kevin.tian@intel.com>
CC: Feng Wu <feng.wu@intel.com>

---
v13:
   1. drop domain crash logic, which is added to the vendor
      independent layer in patch #2.
   2. rename dev_invalidate_iotlb_timeout() to iommu_dev_iotlb_flush_timeout()
      and move it to the vendor independent layer.
---
 xen/drivers/passthrough/iommu.c       | 21 +++++++++++++
 xen/drivers/passthrough/pci.c         |  6 ++--
 xen/drivers/passthrough/vtd/extern.h  |  5 ++--
 xen/drivers/passthrough/vtd/qinval.c  | 56 +++++++++++++++++++++++++++--------
 xen/drivers/passthrough/vtd/x86/ats.c | 11 ++-----
 xen/include/xen/iommu.h               |  3 ++
 xen/include/xen/pci.h                 |  1 +
 7 files changed, 76 insertions(+), 27 deletions(-)

Comments

Tian, Kevin July 4, 2016, 6:16 a.m. UTC | #1
> From: Xu, Quan
> Sent: Wednesday, June 29, 2016 2:00 PM
> 
> From: Quan Xu <quan.xu@intel.com>
> 
> If Device-TLB flush timed out, we hide the target ATS device
> immediately. By hiding the device, we make sure it can't be
> assigned to any domain any longer (see device_assigned).
> 
> Signed-off-by: Quan Xu <quan.xu@intel.com>
> 
> CC: Jan Beulich <jbeulich@suse.com>
> CC: Kevin Tian <kevin.tian@intel.com>
> CC: Feng Wu <feng.wu@intel.com>
> 
> ---
> v13:
>    1. drop domain crash logic, which is added to the vendor
>       independent layer in patch #2.
>    2. rename dev_invalidate_iotlb_timeout() to iommu_dev_iotlb_flush_timeout()
>       and move it to the vendor independent layer.
> ---
>  xen/drivers/passthrough/iommu.c       | 21 +++++++++++++
>  xen/drivers/passthrough/pci.c         |  6 ++--
>  xen/drivers/passthrough/vtd/extern.h  |  5 ++--
>  xen/drivers/passthrough/vtd/qinval.c  | 56
> +++++++++++++++++++++++++++--------
>  xen/drivers/passthrough/vtd/x86/ats.c | 11 ++-----
>  xen/include/xen/iommu.h               |  3 ++
>  xen/include/xen/pci.h                 |  1 +
>  7 files changed, 76 insertions(+), 27 deletions(-)
> 
> diff --git a/xen/drivers/passthrough/iommu.c b/xen/drivers/passthrough/iommu.c
> index d793f5d..5db8ae6 100644
> --- a/xen/drivers/passthrough/iommu.c
> +++ b/xen/drivers/passthrough/iommu.c
> @@ -361,6 +361,27 @@ int iommu_iotlb_flush_all(struct domain *d)
>      return rc;
>  }
> 
> +void iommu_dev_iotlb_flush_timeout(struct domain *d,
> +                                   struct pci_dev *pdev)
> +{
> +    pcidevs_lock();
> +
> +    ASSERT(pdev->domain);
> +    if ( d != pdev->domain )
> +        return;

return w/o releasing the lock!

And is the above scenario actually possible (a flush timeout being caught
when the device no longer belongs to the previous domain)? If not, it would
be better to move the condition into the ASSERT.

> +
> +    list_del(&pdev->domain_list);
> +    pdev->domain = NULL;
> +    pci_hide_existing_device(pdev);
> +    if ( !d->is_shutting_down && printk_ratelimit() )
> +        printk(XENLOG_ERR
> +               "dom%d: ATS device %04x:%02x:%02x.%u flush failed\n",
> +               d->domain_id, pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
> +               PCI_FUNC(pdev->devfn));
> +
> +    pcidevs_unlock();

Please move the above warning out of the lock.

Thanks
Kevin
Quan Xu July 4, 2016, 6:45 a.m. UTC | #2
On July 04, 2016 2:16 PM, Tian, Kevin <kevin.tian@intel.com> wrote:
> > From: Xu, Quan
> > Sent: Wednesday, June 29, 2016 2:00 PM
> >
> > From: Quan Xu <quan.xu@intel.com>
> >
> > If Device-TLB flush timed out, we hide the target ATS device
> > immediately. By hiding the device, we make sure it can't be assigned
> > to any domain any longer (see device_assigned).
> >
> > Signed-off-by: Quan Xu <quan.xu@intel.com>
> >
> > CC: Jan Beulich <jbeulich@suse.com>
> > CC: Kevin Tian <kevin.tian@intel.com>
> > CC: Feng Wu <feng.wu@intel.com>
> >
> > ---
> > v13:
> >    1. drop domain crash logic, which is added to the vendor
> >       independent layer in patch #2.
> >    2. rename dev_invalidate_iotlb_timeout() to
> iommu_dev_iotlb_flush_timeout()
> >       and move it to the vendor independent layer.
> > ---
> >  xen/drivers/passthrough/iommu.c       | 21 +++++++++++++
> >  xen/drivers/passthrough/pci.c         |  6 ++--
> >  xen/drivers/passthrough/vtd/extern.h  |  5 ++--
> > xen/drivers/passthrough/vtd/qinval.c  | 56
> > +++++++++++++++++++++++++++--------
> >  xen/drivers/passthrough/vtd/x86/ats.c | 11 ++-----
> >  xen/include/xen/iommu.h               |  3 ++
> >  xen/include/xen/pci.h                 |  1 +
> >  7 files changed, 76 insertions(+), 27 deletions(-)
> >
> > diff --git a/xen/drivers/passthrough/iommu.c
> > b/xen/drivers/passthrough/iommu.c index d793f5d..5db8ae6 100644
> > --- a/xen/drivers/passthrough/iommu.c
> > +++ b/xen/drivers/passthrough/iommu.c
> > @@ -361,6 +361,27 @@ int iommu_iotlb_flush_all(struct domain *d)
> >      return rc;
> >  }
> >
> > +void iommu_dev_iotlb_flush_timeout(struct domain *d,
> > +                                   struct pci_dev *pdev) {
> > +    pcidevs_lock();
> > +
> > +    ASSERT(pdev->domain);
> > +    if ( d != pdev->domain )
> > +        return;
> 
> return w/o releasing the lock!
> 
Yes, I really do need to release the lock before returning.

> and is above scenario actually possible (a flush timeout is captured when the
> device doesn't belong to previous domain)? If not, better to move the
> condition into ASSERT.

IMO, this is possible.
  -- Not all call trees of the device IOTLB flush run under pcidevs_lock (e.g. ...--iommu_iotlb_flush()--xenmem_add_to_physmap()...).
  -- In extreme cases, the domain may have been freed, or the device may have been detached or even attached to another domain.
That's also why a domain pointer is introduced here.

> 
> > +
> > +    list_del(&pdev->domain_list);
> > +    pdev->domain = NULL;
> > +    pci_hide_existing_device(pdev);
> > +    if ( !d->is_shutting_down && printk_ratelimit() )
> > +        printk(XENLOG_ERR
> > +               "dom%d: ATS device %04x:%02x:%02x.%u flush failed\n",
> > +               d->domain_id, pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
> > +               PCI_FUNC(pdev->devfn));
> > +
> > +    pcidevs_unlock();
> 
> please move above warning out of the lock.
> 

I think I'd better leave it as is,

as I use 'pdev' to print the information, similar to pci_release_devices().
If I used the seg, bus and devfn variables directly instead of 'pdev', I would agree to move it out of the lock, similar to:

iommu_do_pci_domctl()
{
     case XEN_DOMCTL_assign_device... 
}

... correct me if I am wrong.

Quan
diff mbox

Patch

diff --git a/xen/drivers/passthrough/iommu.c b/xen/drivers/passthrough/iommu.c
index d793f5d..5db8ae6 100644
--- a/xen/drivers/passthrough/iommu.c
+++ b/xen/drivers/passthrough/iommu.c
@@ -361,6 +361,27 @@  int iommu_iotlb_flush_all(struct domain *d)
     return rc;
 }
 
+void iommu_dev_iotlb_flush_timeout(struct domain *d,
+                                   struct pci_dev *pdev)
+{
+    pcidevs_lock();
+
+    ASSERT(pdev->domain);
+    if ( d != pdev->domain )
+        return;
+
+    list_del(&pdev->domain_list);
+    pdev->domain = NULL;
+    pci_hide_existing_device(pdev);
+    if ( !d->is_shutting_down && printk_ratelimit() )
+        printk(XENLOG_ERR
+               "dom%d: ATS device %04x:%02x:%02x.%u flush failed\n",
+               d->domain_id, pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+               PCI_FUNC(pdev->devfn));
+
+    pcidevs_unlock();
+}
+
 int __init iommu_setup(void)
 {
     int rc = -ENODEV;
diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index bb5f344..58bfb79 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -419,7 +419,7 @@  static void free_pdev(struct pci_seg *pseg, struct pci_dev *pdev)
     xfree(pdev);
 }
 
-static void _pci_hide_device(struct pci_dev *pdev)
+void pci_hide_existing_device(struct pci_dev *pdev)
 {
     if ( pdev->domain )
         return;
@@ -436,7 +436,7 @@  int __init pci_hide_device(int bus, int devfn)
     pdev = alloc_pdev(get_pseg(0), bus, devfn);
     if ( pdev )
     {
-        _pci_hide_device(pdev);
+        pci_hide_existing_device(pdev);
         rc = 0;
     }
     pcidevs_unlock();
@@ -466,7 +466,7 @@  int __init pci_ro_device(int seg, int bus, int devfn)
     }
 
     __set_bit(PCI_BDF2(bus, devfn), pseg->ro_map);
-    _pci_hide_device(pdev);
+    pci_hide_existing_device(pdev);
 
     return 0;
 }
diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h
index 45357f2..efaff28 100644
--- a/xen/drivers/passthrough/vtd/extern.h
+++ b/xen/drivers/passthrough/vtd/extern.h
@@ -25,6 +25,7 @@ 
 
 #define VTDPREFIX "[VT-D]"
 
+struct pci_ats_dev;
 extern bool_t rwbf_quirk;
 
 void print_iommu_regs(struct acpi_drhd_unit *drhd);
@@ -60,8 +61,8 @@  int dev_invalidate_iotlb(struct iommu *iommu, u16 did,
                          u64 addr, unsigned int size_order, u64 type);
 
 int __must_check qinval_device_iotlb_sync(struct iommu *iommu,
-                                          u32 max_invs_pend,
-                                          u16 sid, u16 size, u64 addr);
+                                          struct pci_ats_dev *ats_dev,
+                                          u16 did, u16 size, u64 addr);
 
 unsigned int get_cache_line_size(void);
 void cacheline_flush(char *);
diff --git a/xen/drivers/passthrough/vtd/qinval.c b/xen/drivers/passthrough/vtd/qinval.c
index 4492b29..7a5c433 100644
--- a/xen/drivers/passthrough/vtd/qinval.c
+++ b/xen/drivers/passthrough/vtd/qinval.c
@@ -27,11 +27,11 @@ 
 #include "dmar.h"
 #include "vtd.h"
 #include "extern.h"
+#include "../ats.h"
 
 #define VTD_QI_TIMEOUT	1
 
-static int __must_check invalidate_sync(struct iommu *iommu,
-                                        bool_t flush_dev_iotlb);
+static int __must_check invalidate_sync(struct iommu *iommu);
 
 static void print_qi_regs(struct iommu *iommu)
 {
@@ -103,7 +103,7 @@  static int __must_check queue_invalidate_context_sync(struct iommu *iommu,
 
     unmap_vtd_domain_page(qinval_entries);
 
-    return invalidate_sync(iommu, 0);
+    return invalidate_sync(iommu);
 }
 
 static int __must_check queue_invalidate_iotlb_sync(struct iommu *iommu,
@@ -140,7 +140,7 @@  static int __must_check queue_invalidate_iotlb_sync(struct iommu *iommu,
     qinval_update_qtail(iommu, index);
     spin_unlock_irqrestore(&iommu->register_lock, flags);
 
-    return invalidate_sync(iommu, 0);
+    return invalidate_sync(iommu);
 }
 
 static int __must_check queue_invalidate_wait(struct iommu *iommu,
@@ -199,25 +199,55 @@  static int __must_check queue_invalidate_wait(struct iommu *iommu,
     return -EOPNOTSUPP;
 }
 
-static int __must_check invalidate_sync(struct iommu *iommu,
-                                        bool_t flush_dev_iotlb)
+static int __must_check invalidate_sync(struct iommu *iommu)
 {
     struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu);
 
     ASSERT(qi_ctrl->qinval_maddr);
 
-    return queue_invalidate_wait(iommu, 0, 1, 1, flush_dev_iotlb);
+    return queue_invalidate_wait(iommu, 0, 1, 1, 0);
+}
+
+static int __must_check dev_invalidate_sync(struct iommu *iommu,
+                                            struct pci_dev *pdev, u16 did)
+{
+    struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu);
+    int rc;
+
+    ASSERT(qi_ctrl->qinval_maddr);
+    rc = queue_invalidate_wait(iommu, 0, 1, 1, 1);
+    if ( rc == -ETIMEDOUT )
+    {
+        struct domain *d = NULL;
+
+        if ( test_bit(did, iommu->domid_bitmap) )
+            d = rcu_lock_domain_by_id(iommu->domid_map[did]);
+
+        /*
+         * In case the domain has been freed or the IOMMU domid bitmap is
+         * not valid, the device no longer belongs to this domain.
+         */
+        if ( d == NULL )
+            return rc;
+
+        iommu_dev_iotlb_flush_timeout(d, pdev);
+        rcu_unlock_domain(d);
+    }
+
+    return rc;
 }
 
 int qinval_device_iotlb_sync(struct iommu *iommu,
-                             u32 max_invs_pend,
-                             u16 sid, u16 size, u64 addr)
+                             struct pci_ats_dev *ats_dev,
+                             u16 did, u16 size, u64 addr)
 {
     unsigned long flags;
     unsigned int index;
     u64 entry_base;
     struct qinval_entry *qinval_entry, *qinval_entries;
+    struct pci_dev *pdev = ats_dev->pdev;
 
+    ASSERT(pdev);
     spin_lock_irqsave(&iommu->register_lock, flags);
     index = qinval_next_index(iommu);
     entry_base = iommu_qi_ctrl(iommu)->qinval_maddr +
@@ -227,9 +257,9 @@  int qinval_device_iotlb_sync(struct iommu *iommu,
 
     qinval_entry->q.dev_iotlb_inv_dsc.lo.type = TYPE_INVAL_DEVICE_IOTLB;
     qinval_entry->q.dev_iotlb_inv_dsc.lo.res_1 = 0;
-    qinval_entry->q.dev_iotlb_inv_dsc.lo.max_invs_pend = max_invs_pend;
+    qinval_entry->q.dev_iotlb_inv_dsc.lo.max_invs_pend = ats_dev->ats_queue_depth;
     qinval_entry->q.dev_iotlb_inv_dsc.lo.res_2 = 0;
-    qinval_entry->q.dev_iotlb_inv_dsc.lo.sid = sid;
+    qinval_entry->q.dev_iotlb_inv_dsc.lo.sid = PCI_BDF2(pdev->bus, pdev->devfn);
     qinval_entry->q.dev_iotlb_inv_dsc.lo.res_3 = 0;
 
     qinval_entry->q.dev_iotlb_inv_dsc.hi.size = size;
@@ -240,7 +270,7 @@  int qinval_device_iotlb_sync(struct iommu *iommu,
     qinval_update_qtail(iommu, index);
     spin_unlock_irqrestore(&iommu->register_lock, flags);
 
-    return invalidate_sync(iommu, 1);
+    return dev_invalidate_sync(iommu, pdev, did);
 }
 
 static int __must_check queue_invalidate_iec_sync(struct iommu *iommu,
@@ -271,7 +301,7 @@  static int __must_check queue_invalidate_iec_sync(struct iommu *iommu,
     qinval_update_qtail(iommu, index);
     spin_unlock_irqrestore(&iommu->register_lock, flags);
 
-    ret = invalidate_sync(iommu, 0);
+    ret = invalidate_sync(iommu);
 
     /*
      * reading vt-d architecture register will ensure
diff --git a/xen/drivers/passthrough/vtd/x86/ats.c b/xen/drivers/passthrough/vtd/x86/ats.c
index 11fe9bb..23e46b1 100644
--- a/xen/drivers/passthrough/vtd/x86/ats.c
+++ b/xen/drivers/passthrough/vtd/x86/ats.c
@@ -118,14 +118,9 @@  int dev_invalidate_iotlb(struct iommu *iommu, u16 did,
 
     list_for_each_entry( ats_dev, &ats_devices, list )
     {
-        struct pci_dev *pdev = ats_dev->pdev;
-        u16 sid;
         bool_t sbit;
         int rc = 0;
 
-        ASSERT(pdev);
-        sid = PCI_BDF2(pdev->bus, pdev->devfn);
-
         /* Only invalidate devices that belong to this IOMMU */
         if ( ats_dev->iommu != iommu )
             continue;
@@ -140,8 +135,7 @@  int dev_invalidate_iotlb(struct iommu *iommu, u16 did,
             /* invalidate all translations: sbit=1,bit_63=0,bit[62:12]=1 */
             sbit = 1;
             addr = (~0UL << PAGE_SHIFT_4K) & 0x7FFFFFFFFFFFFFFF;
-            rc = qinval_device_iotlb_sync(iommu, ats_dev->ats_queue_depth,
-                                          sid, sbit, addr);
+            rc = qinval_device_iotlb_sync(iommu, ats_dev, did, sbit, addr);
             break;
         case DMA_TLB_PSI_FLUSH:
             if ( !device_in_domain(iommu, ats_dev, did) )
@@ -160,8 +154,7 @@  int dev_invalidate_iotlb(struct iommu *iommu, u16 did,
                 addr |= (((u64)1 << (size_order - 1)) - 1) << PAGE_SHIFT_4K;
             }
 
-            rc = qinval_device_iotlb_sync(iommu, ats_dev->ats_queue_depth,
-                                          sid, sbit, addr);
+            rc = qinval_device_iotlb_sync(iommu, ats_dev, did, sbit, addr);
             break;
         default:
             dprintk(XENLOG_WARNING VTDPREFIX, "invalid vt-d flush type\n");
diff --git a/xen/include/xen/iommu.h b/xen/include/xen/iommu.h
index a759f2b..e0f7d52 100644
--- a/xen/include/xen/iommu.h
+++ b/xen/include/xen/iommu.h
@@ -207,6 +207,9 @@  int __must_check iommu_iotlb_flush(struct domain *d, unsigned long gfn,
                                    unsigned int page_count);
 int __must_check iommu_iotlb_flush_all(struct domain *d);
 
+void iommu_dev_iotlb_flush_timeout(struct domain *d,
+                                   struct pci_dev *pdev);
+
 /*
  * The purpose of the iommu_dont_flush_iotlb optional cpu flag is to
  * avoid unecessary iotlb_flush in the low level IOMMU code.
diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index 6ed29dd..e4940cd 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -118,6 +118,7 @@  int pci_add_device(u16 seg, u8 bus, u8 devfn,
 int pci_remove_device(u16 seg, u8 bus, u8 devfn);
 int pci_ro_device(int seg, int bus, int devfn);
 int pci_hide_device(int bus, int devfn);
+void pci_hide_existing_device(struct pci_dev *pdev);
 struct pci_dev *pci_get_pdev(int seg, int bus, int devfn);
 struct pci_dev *pci_get_real_pdev(int seg, int bus, int devfn);
 struct pci_dev *pci_get_pdev_by_domain(