@@ -237,7 +237,10 @@ struct arch_msix {
int table_refcnt[MAX_MSIX_TABLE_PAGES];
int table_idx[MAX_MSIX_TABLE_PAGES];
spinlock_t table_lock;
+ spinlock_t control_lock; /* serializes read/modify/write of the MSI-X control register */
bool host_maskall, guest_maskall;
+ uint16_t host_enable; /* count of CPUs needing MSI-X temporarily enabled (see msix_update_lock()) */
+ bool guest_enable; /* last ENABLE value observed from the guest (tracked in the config write intercept) */
domid_t warned;
};
@@ -147,6 +147,57 @@ static bool msix_memory_decoded(const struct pci_dev *dev, unsigned int pos)
return memory_decoded(dev);
}
+
+/*
+ * Ensure MSI-X interrupts are masked during setup. Some devices require
+ * MSI-X to be enabled before we can touch the MSI-X registers. We need
+ * to mask all the vectors to prevent interrupts coming in before they're
+ * fully set up.
+ */
+static uint16_t msix_update_lock(struct pci_dev *dev, unsigned int pos)
+{
+ uint16_t control, new_control;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->msix->control_lock, flags);
+
+ dev->msix->host_enable++; /* counter, not a flag: several CPUs may hold this concurrently */
+
+ control = pci_conf_read16(dev->sbdf, msix_control_reg(pos));
+ if ( !(control & PCI_MSIX_FLAGS_ENABLE) )
+ {
+ new_control = control | PCI_MSIX_FLAGS_ENABLE | PCI_MSIX_FLAGS_MASKALL;
+ pci_conf_write16(dev->sbdf, msix_control_reg(pos), new_control);
+ }
+ else
+ dev->msix->guest_enable = true; /* NOTE(review): ENABLE may already be set by another CPU's host_enable rather than by the guest -- confirm guest_enable cannot be latched spuriously here */
+
+ spin_unlock_irqrestore(&dev->msix->control_lock, flags);
+
+ return control; /* pre-lock value; caller must pass it back to msix_update_unlock() */
+}
+
+static void msix_update_unlock(struct pci_dev *dev, unsigned int pos, uint16_t control)
+{
+ uint16_t new_control;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->msix->control_lock, flags);
+
+ dev->msix->host_enable--;
+
+ new_control = control & ~(PCI_MSIX_FLAGS_ENABLE | PCI_MSIX_FLAGS_MASKALL);
+
+ if ( dev->msix->host_enable || dev->msix->guest_enable )
+ new_control |= PCI_MSIX_FLAGS_ENABLE;
+ if ( dev->msix->host_maskall || dev->msix->guest_maskall || dev->msix->host_enable )
+ new_control |= PCI_MSIX_FLAGS_MASKALL;
+ /* Always write new_control: msix_update_lock() may have left ENABLE|MASKALL set in hardware, so comparing against the pre-lock value could wrongly skip the restore. */
+ pci_conf_write16(dev->sbdf, msix_control_reg(pos), new_control);
+
+ spin_unlock_irqrestore(&dev->msix->control_lock, flags);
+}
+
/*
* MSI message composition
*/
@@ -288,7 +339,7 @@ static void msi_set_enable(struct pci_dev *dev, int enable)
__msi_set_enable(seg, bus, slot, func, pos, enable);
}
-static void msix_set_enable(struct pci_dev *dev, int enable)
+static void msix_force_disable(struct pci_dev *dev)
{
int pos;
u16 control, seg = dev->seg;
@@ -299,11 +350,16 @@ static void msix_set_enable(struct pci_dev *dev, int enable)
pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
if ( pos )
{
+ spin_lock_irq(&dev->msix->control_lock);
+
+ dev->msix->host_enable = 0; /* host_enable is a uint16_t counter -- reset with 0, not false */
+ dev->msix->guest_enable = false;
+
control = pci_conf_read16(dev->sbdf, msix_control_reg(pos));
control &= ~PCI_MSIX_FLAGS_ENABLE;
- if ( enable )
- control |= PCI_MSIX_FLAGS_ENABLE;
pci_conf_write16(dev->sbdf, msix_control_reg(pos), control);
+
+ spin_unlock_irq(&dev->msix->control_lock);
}
}
@@ -318,9 +374,10 @@ static void msi_set_mask_bit(struct irq_desc *desc, bool host, bool guest)
{
struct msi_desc *entry = desc->msi_desc;
struct pci_dev *pdev;
- u16 seg, control;
+ u16 seg;
u8 bus, slot, func;
- bool flag = host || guest, maskall;
+ bool flag = host || guest;
+ uint16_t control;
ASSERT(spin_is_locked(&desc->lock));
BUG_ON(!entry || !entry->dev);
@@ -343,30 +400,18 @@ static void msi_set_mask_bit(struct irq_desc *desc, bool host, bool guest)
}
break;
case PCI_CAP_ID_MSIX:
- maskall = pdev->msix->host_maskall;
- control = pci_conf_read16(pdev->sbdf,
- msix_control_reg(entry->msi_attrib.pos));
- if ( unlikely(!(control & PCI_MSIX_FLAGS_ENABLE)) )
- {
- pdev->msix->host_maskall = 1;
- pci_conf_write16(pdev->sbdf,
- msix_control_reg(entry->msi_attrib.pos),
- control | (PCI_MSIX_FLAGS_ENABLE |
- PCI_MSIX_FLAGS_MASKALL));
- }
+ control = msix_update_lock(pdev, entry->msi_attrib.pos); /* masks + enables MSI-X so the table write below reaches the device */
+
if ( likely(memory_decoded(pdev)) )
{
writel(flag, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
}
- else if ( flag && !(control & PCI_MSIX_FLAGS_MASKALL) )
+ else if ( !pdev->msix->host_maskall && !pdev->msix->guest_maskall ) /* NOTE(review): the old code also required 'flag'; this now fires on unmask too -- confirm intended */
{
domid_t domid = pdev->domain->domain_id;
- maskall = true;
+ pdev->msix->host_maskall = true; /* NOTE(review): written under desc->lock but not control_lock, while msix_update_unlock() reads it under control_lock -- confirm */
if ( pdev->msix->warned != domid )
{
pdev->msix->warned = domid;
@@ -375,11 +420,8 @@ static void msi_set_mask_bit(struct irq_desc *desc, bool host, bool guest)
desc->irq, domid, &pdev->sbdf);
}
}
- pdev->msix->host_maskall = maskall;
- if ( maskall || pdev->msix->guest_maskall )
- control |= PCI_MSIX_FLAGS_MASKALL;
- pci_conf_write16(pdev->sbdf,
- msix_control_reg(entry->msi_attrib.pos), control);
+
+ msix_update_unlock(pdev, entry->msi_attrib.pos, control);
break;
}
entry->msi_attrib.host_masked = host;
@@ -494,26 +536,19 @@ static struct msi_desc *alloc_msi_entry(unsigned int nr)
int setup_msi_irq(struct irq_desc *desc, struct msi_desc *msidesc)
{
- const struct pci_dev *pdev = msidesc->dev;
- unsigned int cpos = msix_control_reg(msidesc->msi_attrib.pos);
- u16 control = ~0;
+ struct pci_dev *pdev = msidesc->dev;
+ uint16_t control = 0;
int rc;
if ( msidesc->msi_attrib.type == PCI_CAP_ID_MSIX )
- {
- control = pci_conf_read16(pdev->sbdf, cpos);
- if ( !(control & PCI_MSIX_FLAGS_ENABLE) )
- pci_conf_write16(pdev->sbdf, cpos,
- control | (PCI_MSIX_FLAGS_ENABLE |
- PCI_MSIX_FLAGS_MASKALL));
- }
+ control = msix_update_lock(pdev, msidesc->msi_attrib.pos);
rc = __setup_msi_irq(desc, msidesc,
msi_maskable_irq(msidesc) ? &pci_msi_maskable
: &pci_msi_nonmaskable);
- if ( !(control & PCI_MSIX_FLAGS_ENABLE) )
- pci_conf_write16(pdev->sbdf, cpos, control);
+ if ( msidesc->msi_attrib.type == PCI_CAP_ID_MSIX ) /* a control value of 0 is legal, so gate on type, not value, or host_enable leaks */
+ msix_update_unlock(pdev, msidesc->msi_attrib.pos, control);
return rc;
}
@@ -754,14 +789,14 @@ static int msix_capability_init(struct pci_dev *dev,
{
struct arch_msix *msix = dev->msix;
struct msi_desc *entry = NULL;
- u16 control;
u64 table_paddr;
u32 table_offset;
u16 seg = dev->seg;
u8 bus = dev->bus;
u8 slot = PCI_SLOT(dev->devfn);
u8 func = PCI_FUNC(dev->devfn);
- bool maskall = msix->host_maskall;
+ uint16_t control;
+ int ret = 0;
unsigned int pos = pci_find_cap_offset(seg, bus, slot, func,
PCI_CAP_ID_MSIX);
@@ -770,37 +805,22 @@ static int msix_capability_init(struct pci_dev *dev,
ASSERT(pcidevs_locked());
- control = pci_conf_read16(dev->sbdf, msix_control_reg(pos));
- /*
- * Ensure MSI-X interrupts are masked during setup. Some devices require
- * MSI-X to be enabled before we can touch the MSI-X registers. We need
- * to mask all the vectors to prevent interrupts coming in before they're
- * fully set up.
- */
- msix->host_maskall = 1;
- pci_conf_write16(dev->sbdf, msix_control_reg(pos),
- control | (PCI_MSIX_FLAGS_ENABLE |
- PCI_MSIX_FLAGS_MASKALL));
-
- if ( unlikely(!memory_decoded(dev)) )
- {
- pci_conf_write16(dev->sbdf, msix_control_reg(pos),
- control & ~PCI_MSIX_FLAGS_ENABLE);
- return -ENXIO;
- }
-
if ( desc )
{
entry = alloc_msi_entry(1);
if ( !entry )
- {
- pci_conf_write16(dev->sbdf, msix_control_reg(pos),
- control & ~PCI_MSIX_FLAGS_ENABLE);
return -ENOMEM;
- }
ASSERT(msi);
}
+ control = msix_update_lock(dev, pos);
+
+ if ( unlikely(!memory_decoded(dev)) )
+ {
+ ret = -ENXIO;
+ goto out;
+ }
+
/* Locate MSI-X table region */
table_offset = pci_conf_read32(dev->sbdf, msix_table_offset_reg(pos));
if ( !msix->used_entries &&
@@ -834,10 +854,8 @@ static int msix_capability_init(struct pci_dev *dev,
{
if ( !msi || !msi->table_base )
{
- pci_conf_write16(dev->sbdf, msix_control_reg(pos),
- control & ~PCI_MSIX_FLAGS_ENABLE);
- xfree(entry);
- return -ENXIO;
+ ret = -ENXIO;
+ goto out;
}
table_paddr = msi->table_base;
}
@@ -863,9 +881,8 @@ static int msix_capability_init(struct pci_dev *dev,
}
else if ( !msix->table.first )
{
- pci_conf_write16(dev->sbdf, msix_control_reg(pos), control);
- xfree(entry);
- return -ENODATA;
+ ret = -ENODATA;
+ goto out;
}
else
table_paddr = (msix->table.first << PAGE_SHIFT) +
@@ -880,9 +897,8 @@ static int msix_capability_init(struct pci_dev *dev,
if ( idx < 0 )
{
- pci_conf_write16(dev->sbdf, msix_control_reg(pos), control);
- xfree(entry);
- return idx;
+ ret = idx;
+ goto out;
}
base = fix_to_virt(idx) + (entry_paddr & (PAGE_SIZE - 1));
@@ -906,12 +922,6 @@ static int msix_capability_init(struct pci_dev *dev,
if ( !msix->used_entries )
{
- maskall = false;
- if ( !msix->guest_maskall )
- control &= ~PCI_MSIX_FLAGS_MASKALL;
- else
- control |= PCI_MSIX_FLAGS_MASKALL;
-
if ( rangeset_add_range(mmio_ro_ranges, msix->table.first,
msix->table.last) )
WARN();
@@ -940,23 +950,13 @@ static int msix_capability_init(struct pci_dev *dev,
WARN_ON(msix->table.first != (table_paddr >> PAGE_SHIFT));
++msix->used_entries;
- /* Restore MSI-X enabled bits */
- if ( !hardware_domain )
- {
- /*
- * ..., except for internal requests (before Dom0 starts), in which
- * case we rather need to behave "normally", i.e. not follow the split
- * brain model where Dom0 actually enables MSI (and disables INTx).
- */
- pci_intx(dev, false);
- control |= PCI_MSIX_FLAGS_ENABLE;
- control &= ~PCI_MSIX_FLAGS_MASKALL;
- maskall = 0;
- }
- msix->host_maskall = maskall;
- pci_conf_write16(dev->sbdf, msix_control_reg(pos), control);
+ out:
+ if ( ret < 0 )
+ xfree(entry);
- return 0;
+ msix_update_unlock(dev, pos, control); /* NOTE(review): the removed !hardware_domain path also called pci_intx(dev, false); that side effect is now dropped -- confirm intended */
+
+ return ret;
}
/**
@@ -1180,7 +1180,7 @@ void pci_cleanup_msi(struct pci_dev *pdev)
{
/* Disable MSI and/or MSI-X */
msi_set_enable(pdev, 0);
- msix_set_enable(pdev, 0);
+ msix_force_disable(pdev); /* also clears the host/guest enable tracking state */
msi_free_irqs(pdev);
}
@@ -1229,11 +1229,20 @@ int pci_msi_conf_write_intercept(struct pci_dev *pdev, unsigned int reg,
if ( reg != msix_control_reg(pos) || size != 2 )
return -EACCES;
+ spin_lock_irq(&pdev->msix->control_lock); /* NOTE(review): spin_lock_irq assumes interrupts are enabled on entry -- confirm for all callers */
+
+ pdev->msix->guest_enable = !!(*data & PCI_MSIX_FLAGS_ENABLE);
+ if ( pdev->msix->host_enable )
+ *data |= PCI_MSIX_FLAGS_ENABLE; /* NOTE(review): unlike msix_update_unlock(), host_enable does not force MASKALL here -- confirm consistency */
pdev->msix->guest_maskall = !!(*data & PCI_MSIX_FLAGS_MASKALL);
if ( pdev->msix->host_maskall )
*data |= PCI_MSIX_FLAGS_MASKALL;
- return 1;
+ pci_conf_write16(pdev->sbdf, reg, *data);
+
+ spin_unlock_irq(&pdev->msix->control_lock);
+
+ return -EPERM; /* Already done the write. */
}
}
@@ -1324,15 +1333,12 @@ int pci_restore_msi_state(struct pci_dev *pdev)
}
else if ( !type && entry->msi_attrib.type == PCI_CAP_ID_MSIX )
{
- control = pci_conf_read16(pdev->sbdf, msix_control_reg(pos));
- pci_conf_write16(pdev->sbdf, msix_control_reg(pos),
- control | (PCI_MSIX_FLAGS_ENABLE |
- PCI_MSIX_FLAGS_MASKALL));
+ control = msix_update_lock(pdev, pos);
+
if ( unlikely(!memory_decoded(pdev)) )
{
+ msix_update_unlock(pdev, pos, control); /* drop the temporary enable before bailing out */
spin_unlock_irqrestore(&desc->lock, flags);
- pci_conf_write16(pdev->sbdf, msix_control_reg(pos),
- control & ~PCI_MSIX_FLAGS_ENABLE);
return -ENXIO;
}
}
@@ -1372,8 +1378,7 @@ int pci_restore_msi_state(struct pci_dev *pdev)
}
if ( type == PCI_CAP_ID_MSIX )
- pci_conf_write16(pdev->sbdf, msix_control_reg(pos),
- control | PCI_MSIX_FLAGS_ENABLE);
+ msix_update_unlock(pdev, pos, control);
return 0;
}
@@ -44,6 +44,7 @@ int pdev_msi_init(struct pci_dev *pdev)
return -ENOMEM;
spin_lock_init(&msix->table_lock);
+ spin_lock_init(&msix->control_lock); /* guards MSI-X control register RMW and the enable/maskall tracking */
ctrl = pci_conf_read16(pdev->sbdf, msix_control_reg(pos));
msix->nr_entries = msix_table_size(ctrl);
Concurrent accesses to the MSI-X control register are not serialized with a suitable lock. For example, in msix_capability_init() accesses use the pcidevs_lock() but some calls to msi_set_mask_bit() use the interrupt descriptor lock. This can lead to MSI-X being incorrectly disabled and subsequent failures due to msix_memory_decoded() calls that check for MSI-X being enabled. This was seen with some non-compliant hardware that gated MSI-X messages on the per-vector mask bit only (i.e., the MSI-X Enable bit and Function Mask bits in the MSI-X Control register were ignored). An interrupt (with a pending move) for vector 0 would occur while vector 1 was being initialized in msix_capability_init(). Updates to the Control register would race and the vector 1 initialization would intermittently fail with -ENXIO. Typically a race between initializing a vector and another vector being moved doesn't occur because: 1. Racy Control accesses only occur when MSI-X is (guest) disabled. 2. Hardware should only raise interrupts when MSI-X is enabled and unmasked. 3. Xen always sets Function Mask when temporarily enabling MSI-X. But there may be other race conditions depending on hardware and guest driver behaviour (e.g., disabling MSI-X when an IRQ has a pending move on another PCPU). Fix this by: 1. Tracking the host and guest enable state in a similar way to the host and guest maskall state. Note that since multiple CPUs can be updating different vectors concurrently, a counter is needed for the host enable state. 2. Adding a new lock to serialize the Control read/modify/write sequence. 3. Wrapping the above in two helper functions (msix_update_lock() and msix_update_unlock()), which bracket any MSI-X register updates that require MSI-X to be (temporarily) enabled.
Signed-off-by: David Vrabel <dvrabel@amazon.co.uk> SIM: https://t.corp.amazon.com/P63914633 CR: https://code.amazon.com/reviews/CR-79020945 --- xen/arch/x86/include/asm/msi.h | 3 + xen/arch/x86/msi.c | 215 +++++++++++++++++---------------- xen/drivers/passthrough/msi.c | 1 + 3 files changed, 114 insertions(+), 105 deletions(-)