
[v4,9/9] vpci/msix: add MSI-X handlers

Message ID 20170630150117.88489-10-roger.pau@citrix.com (mailing list archive)
State New, archived

Commit Message

Roger Pau Monné June 30, 2017, 3:01 p.m. UTC
Add handlers for accesses to the MSI-X message control field in the
PCI configuration space, and traps for accesses to the memory region
that contains the MSI-X table and PBA. These traps detect attempts from
the guest to configure MSI-X interrupts and properly set them up.

Note that accesses to the Table Offset, Table BIR, PBA Offset and PBA
BIR are not trapped by Xen at the moment.

Finally, turn the panic in the Dom0 PVH builder into a warning.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
---
Changes since v3:
 - Propagate changes from previous versions: remove xen_ prefix, use
   the new fields in vpci_val and remove the return value from
   handlers.
 - Remove the usage of GENMASK.
 - Move the arch-specific parts of the dump routine to the
   x86/hvm/vmsi.c dump handler.
 - Chain the MSI-X dump handler to the 'M' debug key.
 - Fix the header BAR mappings so that the MSI-X regions inside of
   BARs are unmapped from the domain p2m in order for the handlers to
   work properly.
 - Unconditionally trap and forward accesses to the PBA MSI-X area.
 - Simplify the conditionals in vpci_msix_control_write.
 - Fix vpci_msix_accept to use a bool type.
 - Allow all supported accesses as described in the spec to the MSI-X
   table.
 - Truncate the returned address when the access is a 32b read.
 - Always return X86EMUL_OKAY from the handlers, returning ~0 in the
   read case if the access is not supported, or ignoring writes.
 - Do not check that max_entries is != 0 in the init handler.
 - Use trylock in the dump handler.

Changes since v2:
 - Split out arch-specific code.

This patch has been tested with devices using both a single MSI-X
entry and multiple ones.
---
 xen/arch/x86/hvm/dom0_build.c    |   2 +-
 xen/arch/x86/hvm/hvm.c           |   1 +
 xen/arch/x86/hvm/vmsi.c          | 128 +++++++++-
 xen/arch/x86/msi.c               |   1 +
 xen/drivers/vpci/Makefile        |   2 +-
 xen/drivers/vpci/header.c        |  85 ++++++-
 xen/drivers/vpci/msix.c          | 503 +++++++++++++++++++++++++++++++++++++++
 xen/include/asm-x86/hvm/domain.h |   3 +
 xen/include/asm-x86/hvm/io.h     |  18 ++
 xen/include/xen/vpci.h           |  39 +++
 10 files changed, 774 insertions(+), 8 deletions(-)
 create mode 100644 xen/drivers/vpci/msix.c

Comments

Jan Beulich Aug. 2, 2017, 3:07 p.m. UTC | #1
>>> Roger Pau Monne <roger.pau@citrix.com> 06/30/17 5:01 PM >>>
>Note that accesses to the Table Offset, Table BIR, PBA Offset and PBA
>BIR are not trapped by Xen at the moment.

They're mandated r/o by the spec anyway.

>@@ -113,6 +148,35 @@ static int vpci_modify_bar(struct domain *d, const struct vpci_bar *bar,
>if ( IS_ERR(mem) )
>return -PTR_ERR(mem);
 >
>+    /*
>+     * Make sure the MSI-X regions of the BAR are not mapped into the domain
>+     * p2m, or else the MSI-X handlers are useless. Only do this when mapping,
>+     * since that's when the memory decoding on the device is enabled.
>+     */
>+    for ( i = 0; map && i < ARRAY_SIZE(bar->msix); i++ )
>+    {
>+        struct vpci_msix_mem *msix = bar->msix[i];
>+
>+        if ( !msix || msix->addr == INVALID_PADDR )
>+            continue;
>+
>+        rc = vpci_unmap_msix(d, msix);

Why do you need this, instead of being able to simply rely on the rangeset
based (un)mapping?

>@@ -405,7 +475,20 @@ static int vpci_init_bars(struct pci_dev *pdev)
>continue;
>}
 >
>-        bars[i].addr = (cmd & PCI_COMMAND_MEMORY) ? addr : INVALID_PADDR;
>+        if ( cmd & PCI_COMMAND_MEMORY )
>+        {
>+            unsigned int j;
>+
>+            bars[i].addr = addr;
>+
>+            for ( j = 0; j < ARRAY_SIZE(bars[i].msix); j++ )
>+                if ( bars[i].msix[j] )
>+                    bars[i].msix[j]->addr = bars[i].addr +
>+                                            bars[i].msix[j]->offset;
>+        }
>+        else
>+            bars[i].addr = INVALID_PADDR;

As (I think) mentioned elsewhere already, this init-time special case looks
dangerous (and unnecessary) to me (or else I'd expect you to also zap
the field when the memory decode bit is being cleared).

>--- /dev/null
>+++ b/xen/drivers/vpci/msix.c
>@@ -0,0 +1,503 @@
>+/*
>+ * Handlers for accesses to the MSI-X capability structure and the memory
>+ * region.
>+ *
>+ * Copyright (C) 2017 Citrix Systems R&D
>+ *
>+ * This program is free software; you can redistribute it and/or
>+ * modify it under the terms and conditions of the GNU General Public
>+ * License, version 2, as published by the Free Software Foundation.
>+ *
>+ * This program is distributed in the hope that it will be useful,
>+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>+ * General Public License for more details.
>+ *
>+ * You should have received a copy of the GNU General Public
>+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
>+ */
>+
>+#include <xen/sched.h>
>+#include <xen/vpci.h>
>+#include <asm/msi.h>
>+#include <xen/p2m-common.h>
>+#include <xen/keyhandler.h>
>+
>+#define MSIX_SIZE(num) (offsetof(struct vpci_msix, entries[num]))

The outermost parens are pointless here.

>+static void vpci_msix_control_write(struct pci_dev *pdev, unsigned int reg,
>+                                    union vpci_val val, void *data)
>+{
>+    uint8_t seg = pdev->seg, bus = pdev->bus;
>+    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
>+    struct vpci_msix *msix = data;
>+    paddr_t table_base = pdev->vpci->header.bars[msix->table.bir].addr;
>+    bool new_masked, new_enabled;
>+    unsigned int i;
>+    int rc;
>+
>+    new_masked = val.u16 & PCI_MSIX_FLAGS_MASKALL;
>+    new_enabled = val.u16 & PCI_MSIX_FLAGS_ENABLE;
>+
>+    if ( !msix->enabled && new_enabled )
>+    {
>+        /* MSI-X enabled. */

Insert "being"?

>+        for ( i = 0; i < msix->max_entries; i++ )
>+        {
>+            if ( msix->entries[i].masked )
>+                continue;

Why is the mask bit relevant here, but not the mask-all one?

>+            rc = vpci_msix_arch_enable(&msix->entries[i].arch, pdev,
>+                                       msix->entries[i].addr,
>+                                       msix->entries[i].data,
>+                                       msix->entries[i].nr, table_base);
>+            if ( rc )
>+            {
>+                gdprintk(XENLOG_ERR,
>+                         "%04x:%02x:%02x.%u: unable to update entry %u: %d\n",
>+                         seg, bus, slot, func, i, rc);
>+                return;
>+            }
>+
>+            vpci_msix_arch_mask(&msix->entries[i].arch, pdev, false);

Same question here.

>+        }
>+    }
>+    else if ( msix->enabled && !new_enabled )
>+    {
>+        /* MSI-X disabled. */
>+        for ( i = 0; i < msix->max_entries; i++ )
>+        {
>+            rc = vpci_msix_arch_disable(&msix->entries[i].arch, pdev);
>+            if ( rc )
>+            {
>+                gdprintk(XENLOG_ERR,
>+                         "%04x:%02x:%02x.%u: unable to disable entry %u: %d\n",
>+                         seg, bus, slot, func, i, rc);
>+                return;
>+            }
>+        }
>+    }
>+
>+    if ( (new_enabled != msix->enabled || new_masked != msix->masked) &&
>+         pci_msi_conf_write_intercept(pdev, reg, 2, &val.u32) >= 0 )
>+        pci_conf_write16(seg, bus, slot, func, reg, val.u32);

DYM val.u16 here?

>+static struct vpci_msix *vpci_msix_find(struct domain *d, unsigned long addr)
>+{
>+    struct vpci_msix *msix;
>+
>+    ASSERT(vpci_locked(d));
>+    list_for_each_entry ( msix,  &d->arch.hvm_domain.msix_tables, next )
>+    {
>+        uint8_t seg = msix->pdev->seg, bus = msix->pdev->bus;
>+        uint8_t slot = PCI_SLOT(msix->pdev->devfn);
>+        uint8_t func = PCI_FUNC(msix->pdev->devfn);
>+        uint16_t cmd = pci_conf_read16(seg, bus, slot, func, PCI_COMMAND);

Perhaps better to keep a cached copy of the command register value?

>+static int vpci_msix_read(struct vcpu *v, unsigned long addr,
>+                          unsigned int len, unsigned long *data)
>+{
>+    struct domain *d = v->domain;
>+    struct vpci_msix *msix;
>+    const struct vpci_msix_entry *entry;
>+    unsigned int offset;
>+
>+    vpci_lock(d);
>+
>+    msix = vpci_msix_find(d, addr);
>+    if ( !msix )
>+    {
>+        vpci_unlock(d);
>+        *data = ~0ul;
>+        return X86EMUL_OKAY;
>+    }
>+
>+    if ( vpci_msix_access_check(msix->pdev, addr, len) )
>+    {
>+        vpci_unlock(d);
>+        *data = ~0ul;
>+        return X86EMUL_OKAY;
>+    }
>+
>+    if ( MSIX_ADDR_IN_RANGE(addr, &msix->pba) )
>+    {
>+        /* Access to PBA. */
>+        switch ( len )
>+        {
>+        case 4:
>+            *data = readl(addr);
>+            break;
>+        case 8:
>+            *data = readq(addr);
>+            break;
>+        default:
>+            ASSERT_UNREACHABLE();

data = ~0ul;

>+static int vpci_msix_write(struct vcpu *v, unsigned long addr,
>+                                 unsigned int len, unsigned long data)
>+{
>+    struct domain *d = v->domain;
>+    struct vpci_msix *msix;
>+    struct vpci_msix_entry *entry;
>+    unsigned int offset;
>+
>+    vpci_lock(d);
>+    msix = vpci_msix_find(d, addr);
>+    if ( !msix )
>+    {
>+        vpci_unlock(d);
>+        return X86EMUL_OKAY;
>+    }
>+
>+    if ( MSIX_ADDR_IN_RANGE(addr, &msix->pba) )
>+    {
>+        /* Ignore writes to the PBA; its behavior is undefined. */
>+        vpci_unlock(d);
>+        return X86EMUL_OKAY;
>+    }
>+
>+    if ( vpci_msix_access_check(msix->pdev, addr, len) )
>+    {
>+        vpci_unlock(d);
>+        return X86EMUL_OKAY;
>+    }
>+
>+    /* Get the table entry and offset. */
>+    entry = vpci_msix_get_entry(msix, addr);
>+    offset = addr & (PCI_MSIX_ENTRY_SIZE - 1);
>+
>+    switch ( offset )
>+    {
>+    case PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET:
>+        if ( len == 8 )
>+        {
>+            entry->addr = data;
>+            break;
>+        }
>+        entry->addr &= ~0xffffffff;

With this, why not ...

>+        entry->addr |= data;
>+        break;
>+    case PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET:
>+        entry->addr &= ~((uint64_t)0xffffffff << 32);

... simply 0xffffffff here?

>+        entry->addr |= data << 32;

Iirc we've already talked about this being undefined for 32-bit arches (e.g.
ARM32), and the resulting need to make "data" uint64_t.

>+        break;
>+    case PCI_MSIX_ENTRY_DATA_OFFSET:
>+        /*
>+         * 8 byte writes to the msg data and vector control fields are
>+         * only allowed if the entry is masked.
>+         */
>+        if ( len == 8 && !entry->masked && !msix->masked && msix->enabled )
>+        {
>+            vpci_unlock(d);
>+            return X86EMUL_OKAY;
>+        }

I don't think this is correct - iirc such writes simply don't take effect immediately
(but I then seem to recall this to apply to the address field and 32-bit writes to
the data field as well). They'd instead take effect the next time the entry is being
unmasked (or some such). A while ago I did fix the qemu code to behave in this
way.

>+        entry->data = data;
>+
>+        if ( len == 4 )
>+            break;
>+
>+        data >>= 32;
>+        /* fallthrough */
>+    case PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET:
>+    {
>+        bool new_masked = data & PCI_MSIX_VECTOR_BITMASK;
>+        struct pci_dev *pdev = msix->pdev;
>+        paddr_t table_base = pdev->vpci->header.bars[msix->table.bir].addr;
>+        int rc;
>+
>+        if ( !msix->enabled )
>+        {
>+            entry->masked = new_masked;
>+            break;
>+        }
>+
>+        if ( new_masked != entry->masked && !new_masked )

if ( !new_masked && entry->masked )

(or the other way around)

>+static int vpci_init_msix(struct pci_dev *pdev)
>+{
>+    struct domain *d = pdev->domain;
>+    uint8_t seg = pdev->seg, bus = pdev->bus;
>+    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
>+    struct vpci_msix *msix;
>+    unsigned int msix_offset, i, max_entries;
>+    struct vpci_bar *table_bar, *pba_bar;
>+    uint16_t control;
>+    int rc;
>+
>+    msix_offset = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
>+    if ( !msix_offset )
>+        return 0;
>+
>+    control = pci_conf_read16(seg, bus, slot, func,
>+                              msix_control_reg(msix_offset));
>+
>+    /* Get the maximum number of vectors the device supports. */
>+    max_entries = msix_table_size(control);
>+
>+    msix = xzalloc_bytes(MSIX_SIZE(max_entries));
>+    if ( !msix )
>+        return -ENOMEM;
>+
>+    msix->max_entries = max_entries;
>+    msix->pdev = pdev;
>+
>+    /* Find the MSI-X table address. */
>+    msix->table.offset = pci_conf_read32(seg, bus, slot, func,
>+                                         msix_table_offset_reg(msix_offset));
>+    msix->table.bir = msix->table.offset & PCI_MSIX_BIRMASK;
>+    msix->table.offset &= ~PCI_MSIX_BIRMASK;
>+    msix->table.size = msix->max_entries * PCI_MSIX_ENTRY_SIZE;
>+    msix->table.addr = INVALID_PADDR;
>+
>+    /* Find the MSI-X pba address. */
>+    msix->pba.offset = pci_conf_read32(seg, bus, slot, func,
>+                                       msix_pba_offset_reg(msix_offset));
>+    msix->pba.bir = msix->pba.offset & PCI_MSIX_BIRMASK;
>+    msix->pba.offset &= ~PCI_MSIX_BIRMASK;
>+    msix->pba.size = DIV_ROUND_UP(msix->max_entries, 8);

I think you want to round up to at least the next 32-bit boundary; the
spec talking about bits 63..00 even suggests a 64-bit boundary. The
table addresses being required to be qword aligned also supports this.

>+void vpci_dump_msix(void)
>+{
>+    struct domain *d;
>+    struct pci_dev *pdev;

const for all pointers in dump handlers, as far as possible.

>+    for_each_domain ( d )
>+    {
>+        if ( !has_vpci(d) )
>+            continue;
>+
>+        printk("vPCI MSI-X information for guest %u\n", d->domain_id);

Wouldn't it be better (more useful) to dump the MSI and MSI-X data for a
domain next to each other?

Apart from the comments here, the ones given for the MSI patch apply
respectively.

Jan
Roger Pau Monné Aug. 10, 2017, 5:04 p.m. UTC | #2
On Wed, Aug 02, 2017 at 09:07:54AM -0600, Jan Beulich wrote:
> >>> Roger Pau Monne <roger.pau@citrix.com> 06/30/17 5:01 PM >>>
> >Note that accesses to the Table Offset, Table BIR, PBA Offset and PBA
> >BIR are not trapped by Xen at the moment.
> 
> They're mandated r/o by the spec anyway.

> 
> >@@ -113,6 +148,35 @@ static int vpci_modify_bar(struct domain *d, const struct vpci_bar *bar,
> >if ( IS_ERR(mem) )
> >return -PTR_ERR(mem);
>  >
> >+    /*
> >+     * Make sure the MSI-X regions of the BAR are not mapped into the domain
> >+     * p2m, or else the MSI-X handlers are useless. Only do this when mapping,
> >+     * since that's when the memory decoding on the device is enabled.
> >+     */
> >+    for ( i = 0; map && i < ARRAY_SIZE(bar->msix); i++ )
> >+    {
> >+        struct vpci_msix_mem *msix = bar->msix[i];
> >+
> >+        if ( !msix || msix->addr == INVALID_PADDR )
> >+            continue;
> >+
> >+        rc = vpci_unmap_msix(d, msix);
> 
> Why do you need this, instead of being able to simply rely on the rangeset
> based (un)mapping?

This is because the series I've sent, called "x86/pvh: implement
iommu_inclusive_mapping for PVH Dom0", will map the MSI-X memory areas
into the guest, and thus we need to make sure they are not mapped
here for the emulation path to work.

https://lists.xenproject.org/archives/html/xen-devel/2017-04/msg02849.html

> >@@ -405,7 +475,20 @@ static int vpci_init_bars(struct pci_dev *pdev)
> >continue;
> >}
>  >
> >-        bars[i].addr = (cmd & PCI_COMMAND_MEMORY) ? addr : INVALID_PADDR;
> >+        if ( cmd & PCI_COMMAND_MEMORY )
> >+        {
> >+            unsigned int j;
> >+
> >+            bars[i].addr = addr;
> >+
> >+            for ( j = 0; j < ARRAY_SIZE(bars[i].msix); j++ )
> >+                if ( bars[i].msix[j] )
> >+                    bars[i].msix[j]->addr = bars[i].addr +
> >+                                            bars[i].msix[j]->offset;
> >+        }
> >+        else
> >+            bars[i].addr = INVALID_PADDR;
> 
> As (I think) mentioned elsewhere already, this init-time special case looks
> dangerous (and unnecessary) to me (or else I'd expect you to also zap
> the field when the memory decode bit is being cleared).

OK, so I'm simply going to set this to addr + offset, regardless of
whether the BAR has memory decoding enabled or not. If the BAR is not
yet positioned Dom0 will have to position it anyway before enabling
memory decoding.

> >+        for ( i = 0; i < msix->max_entries; i++ )
> >+        {
> >+            if ( msix->entries[i].masked )
> >+                continue;
> 
> Why is the mask bit relevant here, but not the mask-all one?

Not taking mask-all into account here is wrong, since a mask-all
transition from 1 to 0 should force a recalculation of all the entries'
address and data fields. I will fix this in the next version.
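Roughly, that would mean handling the mask-all 1 -> 0 transition in
vpci_msix_control_write along these lines (an illustrative sketch only,
reusing this patch's helpers; error reporting and the existing
enable/disable paths are omitted):

    /* Function mask cleared while MSI-X stays enabled: refresh entries. */
    if ( msix->enabled && new_enabled && msix->masked && !new_masked )
    {
        for ( i = 0; i < msix->max_entries; i++ )
        {
            if ( msix->entries[i].masked )
                continue;

            rc = vpci_msix_arch_enable(&msix->entries[i].arch, pdev,
                                       msix->entries[i].addr,
                                       msix->entries[i].data,
                                       msix->entries[i].nr, table_base);
            if ( rc )
                return;

            vpci_msix_arch_mask(&msix->entries[i].arch, pdev, false);
        }
    }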

> >+            rc = vpci_msix_arch_enable(&msix->entries[i].arch, pdev,
> >+                                       msix->entries[i].addr,
> >+                                       msix->entries[i].data,
> >+                                       msix->entries[i].nr, table_base);
> >+            if ( rc )
> >+            {
> >+                gdprintk(XENLOG_ERR,
> >+                         "%04x:%02x:%02x.%u: unable to update entry %u: %d\n",
> >+                         seg, bus, slot, func, i, rc);
> >+                return;
> >+            }
> >+
> >+            vpci_msix_arch_mask(&msix->entries[i].arch, pdev, false);
> 
> Same question here.

This is needed because after a vpci_msix_arch_enable the pirq is still
masked, and hence needs to be unmasked to match the guest's view.

> >+        }
> >+    }
> >+    else if ( msix->enabled && !new_enabled )
> >+    {
> >+        /* MSI-X disabled. */
> >+        for ( i = 0; i < msix->max_entries; i++ )
> >+        {
> >+            rc = vpci_msix_arch_disable(&msix->entries[i].arch, pdev);
> >+            if ( rc )
> >+            {
> >+                gdprintk(XENLOG_ERR,
> >+                         "%04x:%02x:%02x.%u: unable to disable entry %u: %d\n",
> >+                         seg, bus, slot, func, i, rc);
> >+                return;
> >+            }
> >+        }
> >+    }
> >+
> >+    if ( (new_enabled != msix->enabled || new_masked != msix->masked) &&
> >+         pci_msi_conf_write_intercept(pdev, reg, 2, &val.u32) >= 0 )
> >+        pci_conf_write16(seg, bus, slot, func, reg, val.u32);
> 
> DYM val.u16 here?

Now this is simply val, since the union has been removed.

> >+static struct vpci_msix *vpci_msix_find(struct domain *d, unsigned long addr)
> >+{
> >+    struct vpci_msix *msix;
> >+
> >+    ASSERT(vpci_locked(d));
> >+    list_for_each_entry ( msix,  &d->arch.hvm_domain.msix_tables, next )
> >+    {
> >+        uint8_t seg = msix->pdev->seg, bus = msix->pdev->bus;
> >+        uint8_t slot = PCI_SLOT(msix->pdev->devfn);
> >+        uint8_t func = PCI_FUNC(msix->pdev->devfn);
> >+        uint16_t cmd = pci_conf_read16(seg, bus, slot, func, PCI_COMMAND);
> 
> Perhaps better to keep a cached copy of the command register value?

I'm now using the enabled field of the vpci_bar struct instead of
checking the command register.
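In vpci_msix_find that would look roughly like this (a sketch only,
assuming the new bool 'enabled' field in struct vpci_bar mentioned
above):

    const struct vpci_bar *bars = msix->pdev->vpci->header.bars;

    if ( (bars[msix->table.bir].enabled &&
          MSIX_ADDR_IN_RANGE(addr, &msix->table)) ||
         (bars[msix->pba.bir].enabled &&
          MSIX_ADDR_IN_RANGE(addr, &msix->pba)) )
        return msix;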

> >+        break;
> >+    case PCI_MSIX_ENTRY_DATA_OFFSET:
> >+        /*
> >+         * 8 byte writes to the msg data and vector control fields are
> >+         * only allowed if the entry is masked.
> >+         */
> >+        if ( len == 8 && !entry->masked && !msix->masked && msix->enabled )
> >+        {
> >+            vpci_unlock(d);
> >+            return X86EMUL_OKAY;
> >+        }
> 
> I don't think this is correct - iirc such writes simply don't take effect immediately
> (but I then seem to recall this to apply to the address field and 32-bit writes to
> the data field as well). They'd instead take effect the next time the entry is being
> unmasked (or some such). A while ago I did fix the qemu code to behave in this
> way.

There's an Implementation Note called "Special Considerations for QWORD
Accesses" in the MSI-X section of the PCI 3.0 spec that states:

If a given entry is currently masked (via its Mask bit or the Function
Mask bit), software is permitted to fill in the Message Data and
Vector Control fields with a single QWORD write, taking advantage of
the fact the Message Data field is guaranteed to become visible to
hardware no later than the Vector Control field.

So I think the above chunk is correct. The specification also states
that:

Software must not modify the Address or Data fields of an entry while
it is unmasked. Refer to Section 6.8.3.5 for details.

AFAICT this is not enforced by QEMU, and you can write to the
address/data fields while the message is not masked. The update will
only take effect once the message is masked and unmasked.

> >+static int vpci_init_msix(struct pci_dev *pdev)
> >+{
> >+    struct domain *d = pdev->domain;
> >+    uint8_t seg = pdev->seg, bus = pdev->bus;
> >+    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
> >+    struct vpci_msix *msix;
> >+    unsigned int msix_offset, i, max_entries;
> >+    struct vpci_bar *table_bar, *pba_bar;
> >+    uint16_t control;
> >+    int rc;
> >+
> >+    msix_offset = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
> >+    if ( !msix_offset )
> >+        return 0;
> >+
> >+    control = pci_conf_read16(seg, bus, slot, func,
> >+                              msix_control_reg(msix_offset));
> >+
> >+    /* Get the maximum number of vectors the device supports. */
> >+    max_entries = msix_table_size(control);
> >+
> >+    msix = xzalloc_bytes(MSIX_SIZE(max_entries));
> >+    if ( !msix )
> >+        return -ENOMEM;
> >+
> >+    msix->max_entries = max_entries;
> >+    msix->pdev = pdev;
> >+
> >+    /* Find the MSI-X table address. */
> >+    msix->table.offset = pci_conf_read32(seg, bus, slot, func,
> >+                                         msix_table_offset_reg(msix_offset));
> >+    msix->table.bir = msix->table.offset & PCI_MSIX_BIRMASK;
> >+    msix->table.offset &= ~PCI_MSIX_BIRMASK;
> >+    msix->table.size = msix->max_entries * PCI_MSIX_ENTRY_SIZE;
> >+    msix->table.addr = INVALID_PADDR;
> >+
> >+    /* Find the MSI-X pba address. */
> >+    msix->pba.offset = pci_conf_read32(seg, bus, slot, func,
> >+                                       msix_pba_offset_reg(msix_offset));
> >+    msix->pba.bir = msix->pba.offset & PCI_MSIX_BIRMASK;
> >+    msix->pba.offset &= ~PCI_MSIX_BIRMASK;
> >+    msix->pba.size = DIV_ROUND_UP(msix->max_entries, 8);
> 
> I think you want to round up to at least the next 32-bit boundary; the
> spec talking about bits 63..00 even suggests a 64-bit boundary. The
> table addresses being required to be qword aligned also supports this.

The spec mentions that the last QWORD of the PBA doesn't need to be
fully populated, so yes, I assume this needs to be rounded up to a
64-bit boundary.
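For example (just a sketch, assuming Xen's usual ROUNDUP(x, a)
round-up-to-a-multiple macro):

    msix->pba.size = ROUNDUP(DIV_ROUND_UP(msix->max_entries, 8), 8);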

> >+void vpci_dump_msix(void)
> >+{
> >+    struct domain *d;
> >+    struct pci_dev *pdev;
> 
> const for all pointers in dump handlers, as far as possible.
> 
> >+    for_each_domain ( d )
> >+    {
> >+        if ( !has_vpci(d) )
> >+            continue;
> >+
> >+        printk("vPCI MSI-X information for guest %u\n", d->domain_id);
> 
> Wouldn't it be better (more useful) to dump the MSI and MSI-X data for a
> domain next to each other?

Possibly yes, and printing the MSI and MSI-X data of each device
together would be even better IMHO.

> Apart from the comments here, the ones given for the MSI patch apply
> respectively.

I've added the MSI-X dumping to vpci_dump_msi instead.

Roger.
Jan Beulich Aug. 11, 2017, 10:01 a.m. UTC | #3
>>> On 10.08.17 at 19:04, <roger.pau@citrix.com> wrote:
> On Wed, Aug 02, 2017 at 09:07:54AM -0600, Jan Beulich wrote:
>> >>> Roger Pau Monne <roger.pau@citrix.com> 06/30/17 5:01 PM >>>
>> >@@ -113,6 +148,35 @@ static int vpci_modify_bar(struct domain *d, const struct vpci_bar *bar,
>> >if ( IS_ERR(mem) )
>> >return -PTR_ERR(mem);
>>  >
>> >+    /*
>> >+     * Make sure the MSI-X regions of the BAR are not mapped into the domain
>> >+     * p2m, or else the MSI-X handlers are useless. Only do this when mapping,
>> >+     * since that's when the memory decoding on the device is enabled.
>> >+     */
>> >+    for ( i = 0; map && i < ARRAY_SIZE(bar->msix); i++ )
>> >+    {
>> >+        struct vpci_msix_mem *msix = bar->msix[i];
>> >+
>> >+        if ( !msix || msix->addr == INVALID_PADDR )
>> >+            continue;
>> >+
>> >+        rc = vpci_unmap_msix(d, msix);
>> 
>> Why do you need this, instead of being able to simply rely on the rangeset
>> based (un)mapping?
> 
> This is because the series that I've sent called: "x86/pvh: implement
> iommu_inclusive_mapping for PVH Dom0" will map the MSI-X memory areas
> into the guest, and thus we need to make sure they are not mapped
> here for the emulation path to work.
> 
> https://lists.xenproject.org/archives/html/xen-devel/2017-04/msg02849.html 

Oh, okay. The patch description doesn't mention any such
dependency though.

>> >+        break;
>> >+    case PCI_MSIX_ENTRY_DATA_OFFSET:
>> >+        /*
>> >+         * 8 byte writes to the msg data and vector control fields are
>> >+         * only allowed if the entry is masked.
>> >+         */
>> >+        if ( len == 8 && !entry->masked && !msix->masked && msix->enabled )
>> >+        {
>> >+            vpci_unlock(d);
>> >+            return X86EMUL_OKAY;
>> >+        }
>> 
>> I don't think this is correct - iirc such writes simply don't take effect immediately
>> (but I then seem to recall this to apply to the address field and 32-bit writes to
>> the data field as well). They'd instead take effect the next time the entry is being
>> unmasked (or some such). A while ago I did fix the qemu code to behave in this
>> way.
> 
> There's an Implementation Note called "Special Considerations for QWORD
> Accesses" in the MSI-X section of the PCI 3.0 spec that states:
> 
> If a given entry is currently masked (via its Mask bit or the Function
> Mask bit), software is permitted to fill in the Message Data and
> Vector Control fields with a single QWORD write, taking advantage of
> the fact the Message Data field is guaranteed to become visible to
> hardware no later than the Vector Control field.
> 
> So I think the above chunk is correct. The specification also states
> that:
> 
> Software must not modify the Address or Data fields of an entry while
> it is unmasked. Refer to Section 6.8.3.5 for details.
> 
> AFAICT this is not enforced by QEMU, and you can write to the
> address/data fields while the message is not masked. The update will
> only take effect once the message is masked and unmasked.

The spec also says:

"For MSI-X, a function is permitted to cache Address and Data values
 from unmasked MSIX Table entries. However, anytime software
 unmasks a currently masked MSI-X Table entry either by clearing its
 Mask bit or by clearing the Function Mask bit, the function must
 update any Address or Data values that it cached from that entry. If
 software changes the Address or Data value of an entry while the
 entry is unmasked, the result is undefined."

>> >+    for_each_domain ( d )
>> >+    {
>> >+        if ( !has_vpci(d) )
>> >+            continue;
>> >+
>> >+        printk("vPCI MSI-X information for guest %u\n", d->domain_id);
>> 
>> Wouldn't it be better (more useful) to dump the MSI and MSI-X data for a
>> domain next to each other?
> 
> Possibly yes, and printing the MSI and MSI-X data of each device
> together would be even better IMHO.

Not sure about that last aspect - devices aren't permitted to enable
both at the same time, so only one of them can really be relevant.

Jan
Roger Pau Monné Aug. 11, 2017, 10:11 a.m. UTC | #4
On Fri, Aug 11, 2017 at 04:01:05AM -0600, Jan Beulich wrote:
> >>> On 10.08.17 at 19:04, <roger.pau@citrix.com> wrote:
> > On Wed, Aug 02, 2017 at 09:07:54AM -0600, Jan Beulich wrote:
> >> >>> Roger Pau Monne <roger.pau@citrix.com> 06/30/17 5:01 PM >>>
> >> >@@ -113,6 +148,35 @@ static int vpci_modify_bar(struct domain *d, const struct vpci_bar *bar,
> >> >if ( IS_ERR(mem) )
> >> >return -PTR_ERR(mem);
> >>  >
> >> >+    /*
> >> >+     * Make sure the MSI-X regions of the BAR are not mapped into the domain
> >> >+     * p2m, or else the MSI-X handlers are useless. Only do this when mapping,
> >> >+     * since that's when the memory decoding on the device is enabled.
> >> >+     */
> >> >+    for ( i = 0; map && i < ARRAY_SIZE(bar->msix); i++ )
> >> >+    {
> >> >+        struct vpci_msix_mem *msix = bar->msix[i];
> >> >+
> >> >+        if ( !msix || msix->addr == INVALID_PADDR )
> >> >+            continue;
> >> >+
> >> >+        rc = vpci_unmap_msix(d, msix);
> >> 
> >> Why do you need this, instead of being able to simply rely on the rangeset
> >> based (un)mapping?
> > 
> > This is because the series that I've sent called: "x86/pvh: implement
> > iommu_inclusive_mapping for PVH Dom0" will map the MSI-X memory areas
> > into the guest, and thus we need to make sure they are not mapped
> > here for the emulation path to work.
> > 
> > https://lists.xenproject.org/archives/html/xen-devel/2017-04/msg02849.html 
> 
> Oh, okay. The patch description doesn't mention any such
> dependency though.

Will make that clearer in the next version; in fact I'm going to send
this series rebased on top of the iommu_inclusive_mapping one. AFAICT
that one is closer to being committed, and in any case changing the
order is trivial, as there are no conflicts.

> >> >+        break;
> >> >+    case PCI_MSIX_ENTRY_DATA_OFFSET:
> >> >+        /*
> >> >+         * 8 byte writes to the msg data and vector control fields are
> >> >+         * only allowed if the entry is masked.
> >> >+         */
> >> >+        if ( len == 8 && !entry->masked && !msix->masked && msix->enabled )
> >> >+        {
> >> >+            vpci_unlock(d);
> >> >+            return X86EMUL_OKAY;
> >> >+        }
> >> 
> >> I don't think this is correct - iirc such writes simply don't take effect immediately
> >> (but I then seem to recall this to apply to the address field and 32-bit writes to
> >> the data field as well). They'd instead take effect the next time the entry is being
> >> unmasked (or some such). A while ago I did fix the qemu code to behave in this
> >> way.
> > 
> > There's an Implementation Note called "Special Considerations for QWORD
> > Accesses" in the MSI-X section of the PCI 3.0 spec that states:
> > 
> > If a given entry is currently masked (via its Mask bit or the Function
> > Mask bit), software is permitted to fill in the Message Data and
> > Vector Control fields with a single QWORD write, taking advantage of
> > the fact the Message Data field is guaranteed to become visible to
> > hardware no later than the Vector Control field.
> > 
> > So I think the above chunk is correct. The specification also states
> > that:
> > 
> > Software must not modify the Address or Data fields of an entry while
> > it is unmasked. Refer to Section 6.8.3.5 for details.
> > 
> > AFAICT this is not enforced by QEMU, and you can write to the
> > address/data fields while the message is not masked. The update will
> > only take effect once the message is masked and unmasked.
> 
> The spec also says:
> 
> "For MSI-X, a function is permitted to cache Address and Data values
>  from unmasked MSIX Table entries. However, anytime software
>  unmasks a currently masked MSI-X Table entry either by clearing its
>  Mask bit or by clearing the Function Mask bit, the function must
>  update any Address or Data values that it cached from that entry. If
>  software changes the Address or Data value of an entry while the
>  entry is unmasked, the result is undefined."

I'm not sure what to do if the guest writes to an entry while it is
unmasked. On the one hand the spec says that software must not modify
it, but OTOH it says the result is undefined when doing so.

Would you be fine with ignoring writes to the address and data fields
if the entry is unmasked?

> >> >+    for_each_domain ( d )
> >> >+    {
> >> >+        if ( !has_vpci(d) )
> >> >+            continue;
> >> >+
> >> >+        printk("vPCI MSI-X information for guest %u\n", d->domain_id);
> >> 
> >> Wouldn't it be better (more useful) to dump the MSI and MSI-X data for a
> >> domain next to each other?
> > 
> > Possibly yes, and printing the MSI and MSI-X data of each device
> > together would be even better IMHO.
> 
> Not sure about that last aspect - devices aren't permitted to enable
> both at the same time, so only one of them can really be relevant.

I think (for debugging purposes) it's useful to print both together
in order to spot if the guest is doing something wrong.

Roger.
Jan Beulich Aug. 11, 2017, 10:20 a.m. UTC | #5
>>> On 11.08.17 at 12:11, <roger.pau@citrix.com> wrote:
> On Fri, Aug 11, 2017 at 04:01:05AM -0600, Jan Beulich wrote:
>> >>> On 10.08.17 at 19:04, <roger.pau@citrix.com> wrote:
>> > On Wed, Aug 02, 2017 at 09:07:54AM -0600, Jan Beulich wrote:
>> >> >>> Roger Pau Monne <roger.pau@citrix.com> 06/30/17 5:01 PM >>>
>> >> >+    case PCI_MSIX_ENTRY_DATA_OFFSET:
>> >> >+        /*
>> >> >+         * 8 byte writes to the msg data and vector control fields are
>> >> >+         * only allowed if the entry is masked.
>> >> >+         */
>> >> >+        if ( len == 8 && !entry->masked && !msix->masked && msix->enabled )
>> >> >+        {
>> >> >+            vpci_unlock(d);
>> >> >+            return X86EMUL_OKAY;
>> >> >+        }
>> >> 
>> >> I don't think this is correct - iirc such writes simply don't take effect immediately
>> >> (but I then seem to recall this to apply to the address field and 32-bit writes to
>> >> the data field as well). They'd instead take effect the next time the entry is being
>> >> unmasked (or some such). A while ago I did fix the qemu code to behave in this
>> >> way.
>> > 
>> > There's an Implementation Note called "Special Considerations for QWORD
>> > Accesses" in the MSI-X section of the PCI 3.0 spec that states:
>> > 
>> > If a given entry is currently masked (via its Mask bit or the Function
>> > Mask bit), software is permitted to fill in the Message Data and
>> > Vector Control fields with a single QWORD write, taking advantage of
>> > the fact the Message Data field is guaranteed to become visible to
>> > hardware no later than the Vector Control field.
>> > 
>> > So I think the above chunk is correct. The specification also states
>> > that:
>> > 
>> > Software must not modify the Address or Data fields of an entry while
>> > it is unmasked. Refer to Section 6.8.3.5 for details.
>> > 
>> > AFAICT this is not enforced by QEMU, and you can write to the
>> > address/data fields while the message is not masked. The update will
>> > only take effect once the message is masked and unmasked.
>> 
>> The spec also says:
>> 
>> "For MSI-X, a function is permitted to cache Address and Data values
>>  from unmasked MSIX Table entries. However, anytime software
>>  unmasks a currently masked MSI-X Table entry either by clearing its
>>  Mask bit or by clearing the Function Mask bit, the function must
>>  update any Address or Data values that it cached from that entry. If
>>  software changes the Address or Data value of an entry while the
>>  entry is unmasked, the result is undefined."
> 
> I'm not sure what to do if the guest writes to an entry while it is
> unmasked. On the one hand the spec says that software must not modify
> it, but OTOH it says the result is undefined when doing so.
> 
> Would you be fine with ignoring writes to the address and data fields
> if the entry is unmasked?

No, not really. I've intentionally pointed you to the qemu code,
as there I've implemented the caching behavior described by the
quote above. I'd expect vPCI to behave similarly.
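In terms of this patch's structures that roughly means (a sketch only,
not the actual qemu logic; new_addr, new_data, old_masked and
new_masked stand for whatever the write emulation has just decoded):
cache the guest-written address/data unconditionally, and only
propagate the cached values when the entry goes from masked to
unmasked:

    /* Write path: just latch the guest values. */
    entry->addr = new_addr;
    entry->data = new_data;

    /* Unmask path (entry mask or mask-all): push the cached values out. */
    if ( old_masked && !new_masked && msix->enabled )
    {
        rc = vpci_msix_arch_enable(&entry->arch, pdev, entry->addr,
                                   entry->data, entry->nr, table_base);
        if ( !rc )
            vpci_msix_arch_mask(&entry->arch, pdev, false);
    }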

>> >> >+    for_each_domain ( d )
>> >> >+    {
>> >> >+        if ( !has_vpci(d) )
>> >> >+            continue;
>> >> >+
>> >> >+        printk("vPCI MSI-X information for guest %u\n", d->domain_id);
>> >> 
>> >> Wouldn't it be better (more useful) to dump the MSI and MSI-X data for a
>> >> domain next to each other?
>> > 
>> > Possibly yes, and printing the MSI and MSI-X data of each device
>> > together would be even better IMHO.
>> 
>> Not sure about that last aspect - devices aren't permitted to enable
>> both at the same time, so only one of them can really be relevant.
> 
> I think (for debugging purposes) it's useful to print both together
> in order to spot if the guest is doing something wrong.

For Dom0 maybe. For DomU we'd have to refuse guest attempts
to do anything possibly resulting in undefined behavior.

Jan

Patch

diff --git a/xen/arch/x86/hvm/dom0_build.c b/xen/arch/x86/hvm/dom0_build.c
index 6b9f76ec36..c060eb85eb 100644
--- a/xen/arch/x86/hvm/dom0_build.c
+++ b/xen/arch/x86/hvm/dom0_build.c
@@ -1091,7 +1091,7 @@  int __init dom0_construct_pvh(struct domain *d, const module_t *image,
         return rc;
     }
 
-    panic("Building a PVHv2 Dom0 is not yet supported.");
+    printk("WARNING: PVH is an experimental mode with limited functionality\n");
     return 0;
 }
 
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index f45e2bd23d..9277e84150 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -585,6 +585,7 @@  int hvm_domain_initialise(struct domain *d, unsigned long domcr_flags,
     INIT_LIST_HEAD(&d->arch.hvm_domain.write_map.list);
     INIT_LIST_HEAD(&d->arch.hvm_domain.g2m_ioport_list);
     INIT_LIST_HEAD(&d->arch.hvm_domain.mmcfg_regions);
+    INIT_LIST_HEAD(&d->arch.hvm_domain.msix_tables);
 
     rc = create_perdomain_mapping(d, PERDOMAIN_VIRT_START, 0, NULL, NULL);
     if ( rc )
diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c
index 5732c70b5c..f1c72f23d9 100644
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -643,17 +643,15 @@  static unsigned int msi_flags(uint16_t data, uint64_t addr)
            (trig_mode << GFLAGS_SHIFT_TRG_MODE);
 }
 
-void vpci_msi_arch_mask(struct vpci_arch_msi *arch, struct pci_dev *pdev,
-                        unsigned int entry, bool mask)
+static void vpci_mask_pirq(struct domain *d, int pirq, bool mask)
 {
-    struct domain *d = pdev->domain;
     const struct pirq *pinfo;
     struct irq_desc *desc;
     unsigned long flags;
     int irq;
 
-    ASSERT(arch->pirq >= 0);
-    pinfo = pirq_info(d, arch->pirq + entry);
+    ASSERT(pirq >= 0);
+    pinfo = pirq_info(d, pirq);
     ASSERT(pinfo);
 
     irq = pinfo->arch.irq;
@@ -667,6 +665,12 @@  void vpci_msi_arch_mask(struct vpci_arch_msi *arch, struct pci_dev *pdev,
     spin_unlock_irqrestore(&desc->lock, flags);
 }
 
+void vpci_msi_arch_mask(struct vpci_arch_msi *arch, struct pci_dev *pdev,
+                        unsigned int entry, bool mask)
+{
+    vpci_mask_pirq(pdev->domain, arch->pirq + entry, mask);
+}
+
 int vpci_msi_arch_enable(struct vpci_arch_msi *arch, struct pci_dev *pdev,
                          uint64_t address, uint32_t data, unsigned int vectors)
 {
@@ -771,3 +775,117 @@  void vpci_msi_arch_print(struct vpci_arch_msi *arch, uint16_t data,
            MASK_EXTR(addr, MSI_ADDR_DEST_ID_MASK),
            arch->pirq);
 }
+
+void vpci_msix_arch_mask(struct vpci_arch_msix_entry *arch,
+                         struct pci_dev *pdev, bool mask)
+{
+    vpci_mask_pirq(pdev->domain, arch->pirq, mask);
+}
+
+int vpci_msix_arch_enable(struct vpci_arch_msix_entry *arch,
+                          struct pci_dev *pdev, uint64_t address,
+                          uint32_t data, unsigned int entry_nr,
+                          paddr_t table_base)
+{
+    struct domain *d = pdev->domain;
+    xen_domctl_bind_pt_irq_t bind = {
+        .irq_type = PT_IRQ_TYPE_MSI,
+        .u.msi.gvec = msi_vector(data),
+        .u.msi.gflags = msi_flags(data, address),
+    };
+    int rc;
+
+    if ( arch->pirq == -1 )
+    {
+        struct msi_info msi_info = {
+            .seg = pdev->seg,
+            .bus = pdev->bus,
+            .devfn = pdev->devfn,
+            .table_base = table_base,
+            .entry_nr = entry_nr,
+        };
+
+        /* Map PIRQ. */
+        rc = allocate_and_map_msi_pirq(d, -1, &arch->pirq,
+                                       MAP_PIRQ_TYPE_MSI, &msi_info);
+        if ( rc )
+        {
+            dprintk(XENLOG_ERR,
+                    "%04x:%02x:%02x.%u: unable to map MSI-X PIRQ entry %u: %d\n",
+                    pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                    PCI_FUNC(pdev->devfn), entry_nr, rc);
+            return rc;
+        }
+    }
+
+    bind.machine_irq = arch->pirq;
+    pcidevs_lock();
+    rc = pt_irq_create_bind(d, &bind);
+    if ( rc )
+    {
+        dprintk(XENLOG_ERR,
+                "%04x:%02x:%02x.%u: unable to create MSI-X bind %u: %d\n",
+                pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                PCI_FUNC(pdev->devfn), entry_nr, rc);
+        spin_lock(&d->event_lock);
+        unmap_domain_pirq(d, arch->pirq);
+        spin_unlock(&d->event_lock);
+        pcidevs_unlock();
+        arch->pirq = -1;
+        return rc;
+    }
+    pcidevs_unlock();
+
+    return 0;
+}
+
+int vpci_msix_arch_disable(struct vpci_arch_msix_entry *arch,
+                           struct pci_dev *pdev)
+{
+    struct domain *d = pdev->domain;
+    xen_domctl_bind_pt_irq_t bind = {
+        .irq_type = PT_IRQ_TYPE_MSI,
+        .machine_irq = arch->pirq,
+    };
+    int rc;
+
+    if ( arch->pirq == -1 )
+        return 0;
+
+    pcidevs_lock();
+    rc = pt_irq_destroy_bind(d, &bind);
+    if ( rc )
+    {
+        pcidevs_unlock();
+        return rc;
+    }
+
+    spin_lock(&d->event_lock);
+    unmap_domain_pirq(d, arch->pirq);
+    spin_unlock(&d->event_lock);
+    pcidevs_unlock();
+
+    arch->pirq = -1;
+
+    return 0;
+}
+
+int vpci_msix_arch_init(struct vpci_arch_msix_entry *arch)
+{
+    arch->pirq = -1;
+    return 0;
+}
+
+void vpci_msix_arch_print(struct vpci_arch_msix_entry *entry, uint32_t data,
+                          uint64_t addr, bool masked, unsigned int pos)
+{
+    printk("%4u vec=%#02x%7s%6s%3sassert%5s%7s dest_id=%lu mask=%u pirq: %d\n",
+           pos, MASK_EXTR(data, MSI_DATA_VECTOR_MASK),
+           data & MSI_DATA_DELIVERY_LOWPRI ? "lowest" : "fixed",
+           data & MSI_DATA_TRIGGER_LEVEL ? "level" : "edge",
+           data & MSI_DATA_LEVEL_ASSERT ? "" : "de",
+           addr & MSI_ADDR_DESTMODE_LOGIC ? "log" : "phys",
+           addr & MSI_ADDR_REDIRECTION_LOWPRI ? "lowest" : "cpu",
+           MASK_EXTR(addr, MSI_ADDR_DEST_ID_MASK),
+           masked, entry->pirq);
+}
diff --git a/xen/arch/x86/msi.c b/xen/arch/x86/msi.c
index 573378d6c3..ad5c27df18 100644
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -1539,6 +1539,7 @@  static void dump_msi(unsigned char key)
     }
 
     vpci_dump_msi();
+    vpci_dump_msix();
 }
 
 static int __init msi_setup_keyhandler(void)
diff --git a/xen/drivers/vpci/Makefile b/xen/drivers/vpci/Makefile
index 62cec9e82b..55d1bdfda0 100644
--- a/xen/drivers/vpci/Makefile
+++ b/xen/drivers/vpci/Makefile
@@ -1 +1 @@ 
-obj-y += vpci.o header.o msi.o
+obj-y += vpci.o header.o msi.o msix.o
diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index ae5719ab1a..07056350d6 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -20,6 +20,7 @@ 
 #include <xen/sched.h>
 #include <xen/vpci.h>
 #include <xen/p2m-common.h>
+#include <asm/p2m.h>
 
 #define MAPPABLE_BAR(x)                                                 \
     (((x)->type == VPCI_BAR_MEM32 || (x)->type == VPCI_BAR_MEM64_LO ||  \
@@ -100,11 +101,45 @@  static int vpci_map_range(unsigned long s, unsigned long e, void *data)
     return modify_mmio(map->d, _gfn(s), _mfn(s), e - s + 1, map->map);
 }
 
+static int vpci_unmap_msix(struct domain *d, struct vpci_msix_mem *msix)
+{
+    unsigned long gfn;
+
+    for ( gfn = PFN_DOWN(msix->addr); gfn <= PFN_UP(msix->addr + msix->size);
+          gfn++ )
+    {
+        p2m_type_t t;
+        mfn_t mfn = get_gfn(d, gfn, &t);
+        int rc;
+
+        if ( mfn_eq(mfn, INVALID_MFN) )
+        {
+            /* Nothing to do, this is already a hole. */
+            put_gfn(d, gfn);
+            continue;
+        }
+
+        if ( !p2m_is_mmio(t) )
+        {
+            put_gfn(d, gfn);
+            return -EINVAL;
+        }
+
+        rc = modify_mmio(d, _gfn(gfn), mfn, 1, false);
+        put_gfn(d, gfn);
+        if ( rc )
+            return rc;
+    }
+
+    return 0;
+}
+
 static int vpci_modify_bar(struct domain *d, const struct vpci_bar *bar,
                            const bool map)
 {
     struct rangeset *mem;
     struct map_data data = { .d = d, .map = map };
+    unsigned int i;
     int rc;
 
     ASSERT(MAPPABLE_BAR(bar));
@@ -113,6 +148,35 @@  static int vpci_modify_bar(struct domain *d, const struct vpci_bar *bar,
     if ( IS_ERR(mem) )
         return -PTR_ERR(mem);
 
+    /*
+     * Make sure the MSI-X regions of the BAR are not mapped into the domain
+     * p2m, or else the MSI-X handlers are useless. Only do this when mapping,
+     * since that's when the memory decoding on the device is enabled.
+     */
+    for ( i = 0; map && i < ARRAY_SIZE(bar->msix); i++ )
+    {
+        struct vpci_msix_mem *msix = bar->msix[i];
+
+        if ( !msix || msix->addr == INVALID_PADDR )
+            continue;
+
+        rc = vpci_unmap_msix(d, msix);
+        if ( rc )
+        {
+            rangeset_destroy(mem);
+            return rc;
+        }
+
+        rc = rangeset_remove_range(mem, PFN_DOWN(msix->addr),
+                                   PFN_DOWN(msix->addr + msix->size));
+        if ( rc )
+        {
+            rangeset_destroy(mem);
+            return rc;
+        }
+
+    }
+
     rc = rangeset_report_ranges(mem, 0, ~0ul, vpci_map_range, &data);
     rangeset_destroy(mem);
     if ( rc )
@@ -221,6 +285,7 @@  static void vpci_bar_write(struct pci_dev *pdev, unsigned int reg,
     uint8_t seg = pdev->seg, bus = pdev->bus;
     uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
     uint32_t wdata = val.u32, size_mask;
+    unsigned int i;
     bool hi = false;
 
     switch ( bar->type )
@@ -269,6 +334,11 @@  static void vpci_bar_write(struct pci_dev *pdev, unsigned int reg,
     bar->addr &= ~((uint64_t)0xffffffff << (hi ? 32 : 0));
     bar->addr |= (uint64_t)wdata << (hi ? 32 : 0);
 
+    /* Update any MSI-X areas contained in this BAR. */
+    for ( i = 0; i < ARRAY_SIZE(bar->msix); i++ )
+        if ( bar->msix[i] )
+            bar->msix[i]->addr = bar->addr + bar->msix[i]->offset;
+
     /* Make sure Xen writes back the same value for the BAR RO bits. */
     if ( !hi )
         wdata |= pci_conf_read32(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
@@ -405,7 +475,20 @@  static int vpci_init_bars(struct pci_dev *pdev)
             continue;
         }
 
-        bars[i].addr = (cmd & PCI_COMMAND_MEMORY) ? addr : INVALID_PADDR;
+        if ( cmd & PCI_COMMAND_MEMORY )
+        {
+            unsigned int j;
+
+            bars[i].addr = addr;
+
+            for ( j = 0; j < ARRAY_SIZE(bars[i].msix); j++ )
+                if ( bars[i].msix[j] )
+                    bars[i].msix[j]->addr = bars[i].addr +
+                                            bars[i].msix[j]->offset;
+        }
+        else
+            bars[i].addr = INVALID_PADDR;
+
         bars[i].size = size;
         bars[i].prefetchable = val & PCI_BASE_ADDRESS_MEM_PREFETCH;
 
diff --git a/xen/drivers/vpci/msix.c b/xen/drivers/vpci/msix.c
new file mode 100644
index 0000000000..dc0e7070e4
--- /dev/null
+++ b/xen/drivers/vpci/msix.c
@@ -0,0 +1,503 @@ 
+/*
+ * Handlers for accesses to the MSI-X capability structure and the memory
+ * region.
+ *
+ * Copyright (C) 2017 Citrix Systems R&D
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms and conditions of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <xen/sched.h>
+#include <xen/vpci.h>
+#include <asm/msi.h>
+#include <xen/p2m-common.h>
+#include <xen/keyhandler.h>
+
+#define MSIX_SIZE(num) (offsetof(struct vpci_msix, entries[num]))
+#define MSIX_ADDR_IN_RANGE(a, table)                                    \
+    ((table)->addr != INVALID_PADDR && (a) >= (table)->addr &&          \
+     (a) < (table)->addr + (table)->size)
+
+static void vpci_msix_control_read(struct pci_dev *pdev, unsigned int reg,
+                                   union vpci_val *val, void *data)
+{
+    const struct vpci_msix *msix = data;
+
+    val->u16 = (msix->max_entries - 1) & PCI_MSIX_FLAGS_QSIZE;
+    val->u16 |= msix->enabled ? PCI_MSIX_FLAGS_ENABLE : 0;
+    val->u16 |= msix->masked ? PCI_MSIX_FLAGS_MASKALL : 0;
+}
+
+static void vpci_msix_control_write(struct pci_dev *pdev, unsigned int reg,
+                                    union vpci_val val, void *data)
+{
+    uint8_t seg = pdev->seg, bus = pdev->bus;
+    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+    struct vpci_msix *msix = data;
+    paddr_t table_base = pdev->vpci->header.bars[msix->table.bir].addr;
+    bool new_masked, new_enabled;
+    unsigned int i;
+    int rc;
+
+    new_masked = val.u16 & PCI_MSIX_FLAGS_MASKALL;
+    new_enabled = val.u16 & PCI_MSIX_FLAGS_ENABLE;
+
+    if ( !msix->enabled && new_enabled )
+    {
+        /* MSI-X enabled. */
+        for ( i = 0; i < msix->max_entries; i++ )
+        {
+            if ( msix->entries[i].masked )
+                continue;
+
+            rc = vpci_msix_arch_enable(&msix->entries[i].arch, pdev,
+                                       msix->entries[i].addr,
+                                       msix->entries[i].data,
+                                       msix->entries[i].nr, table_base);
+            if ( rc )
+            {
+                gdprintk(XENLOG_ERR,
+                         "%04x:%02x:%02x.%u: unable to update entry %u: %d\n",
+                         seg, bus, slot, func, i, rc);
+                return;
+            }
+
+            vpci_msix_arch_mask(&msix->entries[i].arch, pdev, false);
+        }
+    }
+    else if ( msix->enabled && !new_enabled )
+    {
+        /* MSI-X disabled. */
+        for ( i = 0; i < msix->max_entries; i++ )
+        {
+            rc = vpci_msix_arch_disable(&msix->entries[i].arch, pdev);
+            if ( rc )
+            {
+                gdprintk(XENLOG_ERR,
+                         "%04x:%02x:%02x.%u: unable to disable entry %u: %d\n",
+                         seg, bus, slot, func, i, rc);
+                return;
+            }
+        }
+    }
+
+    if ( (new_enabled != msix->enabled || new_masked != msix->masked) &&
+         pci_msi_conf_write_intercept(pdev, reg, 2, &val.u32) >= 0 )
+        pci_conf_write16(seg, bus, slot, func, reg, val.u32);
+
+    msix->masked = new_masked;
+    msix->enabled = new_enabled;
+}
+
+static struct vpci_msix *vpci_msix_find(struct domain *d, unsigned long addr)
+{
+    struct vpci_msix *msix;
+
+    ASSERT(vpci_locked(d));
+    list_for_each_entry ( msix,  &d->arch.hvm_domain.msix_tables, next )
+    {
+        uint8_t seg = msix->pdev->seg, bus = msix->pdev->bus;
+        uint8_t slot = PCI_SLOT(msix->pdev->devfn);
+        uint8_t func = PCI_FUNC(msix->pdev->devfn);
+        uint16_t cmd = pci_conf_read16(seg, bus, slot, func, PCI_COMMAND);
+
+        if ( (cmd & PCI_COMMAND_MEMORY) &&
+             (MSIX_ADDR_IN_RANGE(addr, &msix->table) ||
+             MSIX_ADDR_IN_RANGE(addr, &msix->pba)) )
+            return msix;
+    }
+
+    return NULL;
+}
+
+static int vpci_msix_accept(struct vcpu *v, unsigned long addr)
+{
+    bool found;
+
+    vpci_lock(v->domain);
+    found = vpci_msix_find(v->domain, addr);
+    vpci_unlock(v->domain);
+
+    return found;
+}
+
+static int vpci_msix_access_check(struct pci_dev *pdev, unsigned long addr,
+                                  unsigned int len)
+{
+    uint8_t seg = pdev->seg, bus = pdev->bus;
+    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+
+    /* Only allow 32/64b accesses. */
+    if ( len != 4 && len != 8 )
+    {
+        gdprintk(XENLOG_ERR,
+                 "%04x:%02x:%02x.%u: invalid MSI-X table access size: %u\n",
+                 seg, bus, slot, func, len);
+        return -EINVAL;
+    }
+
+    /* Only allow aligned accesses. */
+    if ( (addr & (len - 1)) != 0 )
+    {
+        gdprintk(XENLOG_ERR,
+                 "%04x:%02x:%02x.%u: MSI-X only allows aligned accesses\n",
+                 seg, bus, slot, func);
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+static struct vpci_msix_entry *vpci_msix_get_entry(struct vpci_msix *msix,
+                                                   unsigned long addr)
+{
+    return &msix->entries[(addr - msix->table.addr) / PCI_MSIX_ENTRY_SIZE];
+}
+
+static int vpci_msix_read(struct vcpu *v, unsigned long addr,
+                          unsigned int len, unsigned long *data)
+{
+    struct domain *d = v->domain;
+    struct vpci_msix *msix;
+    const struct vpci_msix_entry *entry;
+    unsigned int offset;
+
+    vpci_lock(d);
+
+    msix = vpci_msix_find(d, addr);
+    if ( !msix )
+    {
+        vpci_unlock(d);
+        *data = ~0ul;
+        return X86EMUL_OKAY;
+    }
+
+    if ( vpci_msix_access_check(msix->pdev, addr, len) )
+    {
+        vpci_unlock(d);
+        *data = ~0ul;
+        return X86EMUL_OKAY;
+    }
+
+    if ( MSIX_ADDR_IN_RANGE(addr, &msix->pba) )
+    {
+        /* Access to PBA. */
+        switch ( len )
+        {
+        case 4:
+            *data = readl(addr);
+            break;
+        case 8:
+            *data = readq(addr);
+            break;
+        default:
+            ASSERT_UNREACHABLE();
+            break;
+        }
+
+        vpci_unlock(d);
+        return X86EMUL_OKAY;
+    }
+
+    /* Get the table entry and offset. */
+    entry = vpci_msix_get_entry(msix, addr);
+    offset = addr & (PCI_MSIX_ENTRY_SIZE - 1);
+
+    switch ( offset )
+    {
+    case PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET:
+        /*
+         * NB: do explicit truncation to the size of the access. This shouldn't
+         * be required here, since the caller of the handler should already
+         * take the appropriate measures to truncate the value before returning
+         * to the guest, but better be safe than sorry.
+         */
+        *data = len == 8 ? entry->addr : (uint32_t)entry->addr;
+        break;
+    case PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET:
+        *data = entry->addr >> 32;
+        break;
+    case PCI_MSIX_ENTRY_DATA_OFFSET:
+        *data = entry->data;
+        if ( len == 8 )
+            *data |=
+                (uint64_t)(entry->masked ? PCI_MSIX_VECTOR_BITMASK : 0) << 32;
+        break;
+    case PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET:
+        *data = entry->masked ? PCI_MSIX_VECTOR_BITMASK : 0;
+        break;
+    default:
+        BUG();
+    }
+    vpci_unlock(d);
+
+    return X86EMUL_OKAY;
+}
+
+static int vpci_msix_write(struct vcpu *v, unsigned long addr,
+                                 unsigned int len, unsigned long data)
+{
+    struct domain *d = v->domain;
+    struct vpci_msix *msix;
+    struct vpci_msix_entry *entry;
+    unsigned int offset;
+
+    vpci_lock(d);
+    msix = vpci_msix_find(d, addr);
+    if ( !msix )
+    {
+        vpci_unlock(d);
+        return X86EMUL_OKAY;
+    }
+
+    if ( MSIX_ADDR_IN_RANGE(addr, &msix->pba) )
+    {
+        /* Ignore writes to the PBA; its behavior is undefined. */
+        vpci_unlock(d);
+        return X86EMUL_OKAY;
+    }
+
+    if ( vpci_msix_access_check(msix->pdev, addr, len) )
+    {
+        vpci_unlock(d);
+        return X86EMUL_OKAY;
+    }
+
+    /* Get the table entry and offset. */
+    entry = vpci_msix_get_entry(msix, addr);
+    offset = addr & (PCI_MSIX_ENTRY_SIZE - 1);
+
+    switch ( offset )
+    {
+    case PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET:
+        if ( len == 8 )
+        {
+            entry->addr = data;
+            break;
+        }
+        entry->addr &= ~0xffffffffull;
+        entry->addr |= data;
+        break;
+    case PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET:
+        entry->addr &= ~((uint64_t)0xffffffff << 32);
+        entry->addr |= data << 32;
+        break;
+    case PCI_MSIX_ENTRY_DATA_OFFSET:
+        /*
+         * 8-byte writes spanning the message data and vector control fields
+         * are only allowed if the entry is masked (or MSI-X as a whole is
+         * masked or disabled).
+         */
+        if ( len == 8 && !entry->masked && !msix->masked && msix->enabled )
+        {
+            vpci_unlock(d);
+            return X86EMUL_OKAY;
+        }
+
+        entry->data = data;
+
+        if ( len == 4 )
+            break;
+
+        data >>= 32;
+        /* fallthrough */
+    case PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET:
+    {
+        bool new_masked = data & PCI_MSIX_VECTOR_BITMASK;
+        struct pci_dev *pdev = msix->pdev;
+        paddr_t table_base = pdev->vpci->header.bars[msix->table.bir].addr;
+        int rc;
+
+        if ( !msix->enabled )
+        {
+            entry->masked = new_masked;
+            break;
+        }
+
+        if ( new_masked != entry->masked && !new_masked )
+        {
+            /* Unmasking an entry, update it. */
+            rc = vpci_msix_arch_enable(&entry->arch, pdev, entry->addr,
+                                       entry->data, entry->nr, table_base);
+            if ( rc )
+            {
+                vpci_unlock(d);
+                gdprintk(XENLOG_ERR,
+                         "%04x:%02x:%02x.%u: unable to update entry %u: %d\n",
+                         pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                         PCI_FUNC(pdev->devfn), entry->nr, rc);
+                return X86EMUL_OKAY;
+            }
+        }
+
+        vpci_msix_arch_mask(&entry->arch, pdev, new_masked);
+        entry->masked = new_masked;
+
+        break;
+    }
+    default:
+        BUG();
+    }
+    vpci_unlock(d);
+
+    return X86EMUL_OKAY;
+}
+
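To illustrate the fallthrough in the write handler above: an 8-byte write at the message data offset carries the vector control word in its upper half, which is why the handler shifts 'data' right by 32 before falling through. A standalone sketch of that split, with a made-up value:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t qword = 0x0000000100004021ULL;  /* made-up 8-byte guest write */

        uint32_t msg_data    = (uint32_t)qword;  /* low half -> message data    */
        uint32_t vector_ctrl = qword >> 32;      /* high half -> vector control */

        printf("data %#x, ctrl %#x (mask bit %u)\n",
               msg_data, vector_ctrl, vector_ctrl & 1);
        return 0;
    }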
+static const struct hvm_mmio_ops vpci_msix_table_ops = {
+    .check = vpci_msix_accept,
+    .read = vpci_msix_read,
+    .write = vpci_msix_write,
+};
+
+static int vpci_init_msix(struct pci_dev *pdev)
+{
+    struct domain *d = pdev->domain;
+    uint8_t seg = pdev->seg, bus = pdev->bus;
+    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+    struct vpci_msix *msix;
+    unsigned int msix_offset, i, max_entries;
+    struct vpci_bar *table_bar, *pba_bar;
+    uint16_t control;
+    int rc;
+
+    msix_offset = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
+    if ( !msix_offset )
+        return 0;
+
+    control = pci_conf_read16(seg, bus, slot, func,
+                              msix_control_reg(msix_offset));
+
+    /* Get the maximum number of vectors the device supports. */
+    max_entries = msix_table_size(control);
+
+    msix = xzalloc_bytes(MSIX_SIZE(max_entries));
+    if ( !msix )
+        return -ENOMEM;
+
+    msix->max_entries = max_entries;
+    msix->pdev = pdev;
+
+    /* Find the MSI-X table address. */
+    msix->table.offset = pci_conf_read32(seg, bus, slot, func,
+                                         msix_table_offset_reg(msix_offset));
+    msix->table.bir = msix->table.offset & PCI_MSIX_BIRMASK;
+    msix->table.offset &= ~PCI_MSIX_BIRMASK;
+    msix->table.size = msix->max_entries * PCI_MSIX_ENTRY_SIZE;
+    msix->table.addr = INVALID_PADDR;
+
+    /* Find the MSI-X PBA address. */
+    msix->pba.offset = pci_conf_read32(seg, bus, slot, func,
+                                       msix_pba_offset_reg(msix_offset));
+    msix->pba.bir = msix->pba.offset & PCI_MSIX_BIRMASK;
+    msix->pba.offset &= ~PCI_MSIX_BIRMASK;
+    msix->pba.size = DIV_ROUND_UP(msix->max_entries, 8);
+    msix->pba.addr = INVALID_PADDR;
+
+    for ( i = 0; i < msix->max_entries; i++ )
+    {
+        msix->entries[i].masked = true;
+        msix->entries[i].nr = i;
+        vpci_msix_arch_init(&msix->entries[i].arch);
+    }
+
+    if ( list_empty(&d->arch.hvm_domain.msix_tables) )
+        register_mmio_handler(d, &vpci_msix_table_ops);
+
+    list_add(&msix->next, &d->arch.hvm_domain.msix_tables);
+
+    rc = vpci_add_register(pdev, vpci_msix_control_read,
+                           vpci_msix_control_write,
+                           msix_control_reg(msix_offset), 2, msix);
+    if ( rc )
+    {
+        dprintk(XENLOG_ERR,
+                "%04x:%02x:%02x.%u: failed to add handler for MSI-X control: %d\n",
+                seg, bus, slot, func, rc);
+        goto error;
+    }
+
+    table_bar = &pdev->vpci->header.bars[msix->table.bir];
+    pba_bar = &pdev->vpci->header.bars[msix->pba.bir];
+
+    /*
+     * The header handlers take care of leaving a hole for the MSI-X related
+     * areas, which is why MSI-X needs to be initialized before the header.
+     */
+    table_bar->msix[VPCI_BAR_MSIX_TABLE] = &msix->table;
+    pba_bar->msix[VPCI_BAR_MSIX_PBA] = &msix->pba;
+    pdev->vpci->msix = msix;
+
+    return 0;
+
+ error:
+    ASSERT(rc);
+    xfree(msix);
+    return rc;
+}
+
+REGISTER_VPCI_INIT(vpci_init_msix, VPCI_PRIORITY_HIGH);
+
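As a worked example of the sizing done in vpci_init_msix(): a device advertising 128 vectors needs a 128 * 16 = 2048 byte table and a 16 byte PBA. A standalone sketch of the arithmetic (the entry count is hypothetical; ENTRY_SIZE and DIV_ROUND_UP are redefined locally for the example):

    #include <stdio.h>

    #define ENTRY_SIZE 16  /* bytes per MSI-X table entry (PCI spec) */
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
        unsigned int max_entries = 128;  /* hypothetical msix_table_size() */

        unsigned int table_bytes = max_entries * ENTRY_SIZE;      /* 2048 */
        unsigned int pba_bytes   = DIV_ROUND_UP(max_entries, 8);  /*   16 */

        printf("table %u bytes, PBA %u bytes\n", table_bytes, pba_bytes);
        return 0;
    }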
+void vpci_dump_msix(void)
+{
+    struct domain *d;
+    struct pci_dev *pdev;
+
+    for_each_domain ( d )
+    {
+        if ( !has_vpci(d) )
+            continue;
+
+        printk("vPCI MSI-X information for guest %u\n", d->domain_id);
+
+        if ( !vpci_trylock(d) )
+        {
+            printk("Unable to get vPCI lock, skipping\n");
+            continue;
+        }
+
+        list_for_each_entry ( pdev, &d->arch.pdev_list, domain_list )
+        {
+            uint8_t seg = pdev->seg, bus = pdev->bus;
+            uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+            struct vpci_msix *msix = pdev->vpci->msix;
+            unsigned int i;
+
+            if ( !msix )
+                continue;
+
+            printk("Device %04x:%02x:%02x.%u\n", seg, bus, slot, func);
+
+            printk("Max entries: %u maskall: %u enabled: %u\n",
+                   msix->max_entries, msix->masked, msix->enabled);
+
+            printk("Guest table entries:\n");
+            for ( i = 0; i < msix->max_entries; i++ )
+                vpci_msix_arch_print(&msix->entries[i].arch,
+                                     msix->entries[i].data,
+                                     msix->entries[i].addr,
+                                     msix->entries[i].masked, i);
+        }
+        vpci_unlock(d);
+    }
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
diff --git a/xen/include/asm-x86/hvm/domain.h b/xen/include/asm-x86/hvm/domain.h
index 7028f93861..980d718327 100644
--- a/xen/include/asm-x86/hvm/domain.h
+++ b/xen/include/asm-x86/hvm/domain.h
@@ -190,6 +190,9 @@  struct hvm_domain {
     /* List of ECAM (MMCFG) regions trapped by Xen. */
     struct list_head mmcfg_regions;
 
+    /* List of MSI-X tables. */
+    struct list_head msix_tables;
+
     /* List of permanently write-mapped pages. */
     struct {
         spinlock_t lock;
diff --git a/xen/include/asm-x86/hvm/io.h b/xen/include/asm-x86/hvm/io.h
index 55ed094734..739eefe541 100644
--- a/xen/include/asm-x86/hvm/io.h
+++ b/xen/include/asm-x86/hvm/io.h
@@ -144,6 +144,24 @@  int vpci_msi_arch_init(struct vpci_arch_msi *arch);
 void vpci_msi_arch_print(struct vpci_arch_msi *arch, uint16_t data,
                          uint64_t addr);
 
+/* Arch-specific MSI-X entry data for vPCI. */
+struct vpci_arch_msix_entry {
+    int pirq;
+};
+
+/* Arch-specific vPCI MSI-X helpers. */
+void vpci_msix_arch_mask(struct vpci_arch_msix_entry *arch,
+                         struct pci_dev *pdev, bool mask);
+int vpci_msix_arch_enable(struct vpci_arch_msix_entry *arch,
+                          struct pci_dev *pdev, uint64_t address,
+                          uint32_t data, unsigned int entry_nr,
+                          paddr_t table_base);
+int vpci_msix_arch_disable(struct vpci_arch_msix_entry *arch,
+                           struct pci_dev *pdev);
+int vpci_msix_arch_init(struct vpci_arch_msix_entry *arch);
+void vpci_msix_arch_print(struct vpci_arch_msix_entry *entry, uint32_t data,
+                          uint64_t addr, bool masked, unsigned int pos);
+
 enum stdvga_cache_state {
     STDVGA_CACHE_UNINITIALIZED,
     STDVGA_CACHE_ENABLED,
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index ca693f3667..5bc6380531 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -83,6 +83,10 @@  struct vpci {
             } type;
             paddr_t addr;
             uint64_t size;
+#define VPCI_BAR_MSIX_TABLE     0
+#define VPCI_BAR_MSIX_PBA       1
+#define VPCI_BAR_MSIX_NUM       2
+            struct vpci_msix_mem *msix[VPCI_BAR_MSIX_NUM];
             bool prefetchable;
             bool sizing;
             bool enabled;
@@ -112,10 +116,45 @@  struct vpci {
         /* Arch-specific data. */
         struct vpci_arch_msi arch;
     } *msi;
+
+    /* MSI-X data. */
+    struct vpci_msix {
+        struct pci_dev *pdev;
+        /* Maximum number of vectors supported by the device. */
+        unsigned int max_entries;
+        /* MSI-X enabled? */
+        bool enabled;
+        /* Masked? */
+        bool masked;
+        /* List link. */
+        struct list_head next;
+        /* Table information. */
+        struct vpci_msix_mem {
+            /* MSI-X table offset. */
+            unsigned int offset;
+            /* MSI-X table BIR. */
+            unsigned int bir;
+            /* Table addr. */
+            paddr_t addr;
+            /* Table size. */
+            unsigned int size;
+        } table;
+        /* PBA. */
+        struct vpci_msix_mem pba;
+        /* Entries. */
+        struct vpci_msix_entry {
+            unsigned int nr;
+            uint64_t addr;
+            uint32_t data;
+            bool masked;
+            struct vpci_arch_msix_entry arch;
+        } entries[];
+    } *msix;
 #endif
 };
 
 void vpci_dump_msi(void);
+void vpci_dump_msix(void);
 
 #endif
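Note the flexible 'entries[]' array closing struct vpci_msix above: the allocation done with xzalloc_bytes(MSIX_SIZE(max_entries)) has to cover the header plus that array. MSIX_SIZE() itself is not part of this hunk; a plausible definition is sketched below with cut-down stand-in types (an assumption about the helper, not necessarily the exact macro in msix.c):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Cut-down stand-ins for the vpci.h types, for illustration only. */
    struct arch_entry { int pirq; };
    struct msix_demo {
        unsigned int max_entries;
        struct {
            unsigned int nr;
            uint64_t addr;
            uint32_t data;
            int masked;
            struct arch_entry arch;
        } entries[];
    };

    /* Assumed sizing helper behind xzalloc_bytes(MSIX_SIZE(max_entries)). */
    #define MSIX_SIZE(n) offsetof(struct msix_demo, entries[n])

    int main(void)
    {
        printf("%zu bytes for 8 entries\n", MSIX_SIZE(8));
        return 0;
    }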